""" ATOM 1.0 feed generation for StarPunk This module provides ATOM 1.0 feed generation from published notes using Python's standard library xml.etree.ElementTree for proper XML handling. Functions: generate_atom: Generate ATOM 1.0 XML feed from notes generate_atom_streaming: Memory-efficient streaming ATOM generation Standards: - ATOM 1.0 (RFC 4287) specification compliant - RFC 3339 date format - Proper XML namespacing - Escaped HTML and text content """ # Standard library imports from datetime import datetime, timezone from typing import Optional import time import xml.etree.ElementTree as ET # Local imports from starpunk.models import Note from starpunk.monitoring.business import track_feed_generated # ATOM namespace ATOM_NS = "http://www.w3.org/2005/Atom" def generate_atom( site_url: str, site_name: str, site_description: str, notes: list[Note], limit: int = 50, ) -> str: """ Generate ATOM 1.0 XML feed from published notes Creates a standards-compliant ATOM 1.0 feed with proper metadata and entry elements. Uses ElementTree for safe XML generation. NOTE: For memory-efficient streaming, use generate_atom_streaming() instead. This function is kept for caching use cases. Args: site_url: Base URL of the site (e.g., 'https://example.com') site_name: Site title for feed site_description: Site description for feed (subtitle) notes: List of Note objects to include (should be published only) limit: Maximum number of entries to include (default: 50) Returns: ATOM 1.0 XML string (UTF-8 encoded) Raises: ValueError: If site_url or site_name is empty Examples: >>> notes = list_notes(published_only=True, limit=50) >>> feed_xml = generate_atom( ... site_url='https://example.com', ... site_name='My Blog', ... site_description='My personal notes', ... notes=notes ... ) >>> print(feed_xml[:38]) """ # Join streaming output for non-streaming version return ''.join(generate_atom_streaming( site_url=site_url, site_name=site_name, site_description=site_description, notes=notes, limit=limit )) def generate_atom_streaming( site_url: str, site_name: str, site_description: str, notes: list[Note], limit: int = 50, ): """ Generate ATOM 1.0 XML feed from published notes using streaming Memory-efficient generator that yields XML chunks instead of building the entire feed in memory. Recommended for large feeds (100+ entries). Args: site_url: Base URL of the site (e.g., 'https://example.com') site_name: Site title for feed site_description: Site description for feed notes: List of Note objects to include (should be published only) limit: Maximum number of entries to include (default: 50) Yields: XML chunks as strings (UTF-8) Raises: ValueError: If site_url or site_name is empty Examples: >>> from flask import Response >>> notes = list_notes(published_only=True, limit=100) >>> generator = generate_atom_streaming( ... site_url='https://example.com', ... site_name='My Blog', ... site_description='My personal notes', ... notes=notes ... ) >>> return Response(generator, mimetype='application/atom+xml') """ # Validate required parameters if not site_url or not site_url.strip(): raise ValueError("site_url is required and cannot be empty") if not site_name or not site_name.strip(): raise ValueError("site_name is required and cannot be empty") # Remove trailing slash from site_url for consistency site_url = site_url.rstrip("/") # Track feed generation timing start_time = time.time() item_count = 0 # Current timestamp for updated now = datetime.now(timezone.utc) # Yield XML declaration yield '\n' # Yield feed opening with namespace yield f'\n' # Yield feed metadata yield f' {_escape_xml(site_url)}/\n' yield f' {_escape_xml(site_name)}\n' yield f' {_format_atom_date(now)}\n' # Links yield f' \n' yield f' \n' # Optional subtitle if site_description: yield f' {_escape_xml(site_description)}\n' # Generator yield ' StarPunk\n' # Yield entries (newest first) # Notes from database are already in DESC order (newest first) for note in notes[:limit]: item_count += 1 # Build permalink URL permalink = f"{site_url}{note.permalink}" yield ' \n' # Required elements yield f' {_escape_xml(permalink)}\n' yield f' {_escape_xml(note.title)}\n' # Use created_at for both published and updated # (Note model doesn't have updated_at tracking yet) yield f' {_format_atom_date(note.created_at)}\n' yield f' {_format_atom_date(note.created_at)}\n' # Link to entry yield f' \n' # Media enclosures (v1.2.0 Phase 3, per Q24 and ADR-057) if hasattr(note, 'media') and note.media: for item in note.media: media_url = f"{site_url}/media/{item['path']}" mime_type = item.get('mime_type', 'image/jpeg') size = item.get('size', 0) yield f' \n' # Content - include media as HTML (per Q24) if note.html: # Build HTML content with media at top html_content = "" # Add media at top if present if hasattr(note, 'media') and note.media: html_content += '
' for item in note.media: media_url = f"{site_url}/media/{item['path']}" caption = item.get('caption', '') html_content += f'{caption}' html_content += '
' # Add text content below media html_content += note.html # HTML content - escaped yield ' ' yield _escape_xml(html_content) yield '\n' else: # Plain text content yield ' ' yield _escape_xml(note.content) yield '\n' yield '
\n' # Yield closing tag yield '
\n' # Track feed generation metrics duration_ms = (time.time() - start_time) * 1000 track_feed_generated( format='atom', item_count=item_count, duration_ms=duration_ms, cached=False ) def _escape_xml(text: str) -> str: """ Escape special XML characters for safe inclusion in XML elements Escapes the five predefined XML entities: &, <, >, ", ' Args: text: Text to escape Returns: XML-safe text with escaped entities Examples: >>> _escape_xml("Hello & goodbye") 'Hello & goodbye' >>> _escape_xml('

HTML

') '<p>HTML</p>' """ if not text: return "" # Escape in order: & first (to avoid double-escaping), then < > " ' text = text.replace("&", "&") text = text.replace("<", "<") text = text.replace(">", ">") text = text.replace('"', """) text = text.replace("'", "'") return text def _format_atom_date(dt: datetime) -> str: """ Format datetime to RFC 3339 format for ATOM ATOM 1.0 requires RFC 3339 date format for published and updated elements. RFC 3339 is a profile of ISO 8601. Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset) Args: dt: Datetime object to format (naive datetime assumed to be UTC) Returns: RFC 3339 formatted date string Examples: >>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc) >>> _format_atom_date(dt) '2024-11-25T12:00:00Z' """ # Ensure datetime has timezone (assume UTC if naive) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) # Format to RFC 3339 # Use 'Z' suffix for UTC, otherwise include offset if dt.tzinfo == timezone.utc: return dt.strftime("%Y-%m-%dT%H:%M:%SZ") else: # Format with timezone offset return dt.isoformat()