"""
RSS feed generation for StarPunk

This module provides RSS 2.0 feed generation from published notes using the
feedgen library. Feeds include RFC-822 dates, HTML item content made safe
for CDATA wrapping, and the required RSS channel/item elements.

Functions:
    generate_feed: Generate RSS 2.0 XML feed from notes
    format_rfc822_date: Format datetime to RFC-822 for RSS
    get_note_title: Extract title from note (first line or timestamp)
    clean_html_for_rss: Clean HTML for CDATA safety

Standards:
    - RSS 2.0 specification
    - RFC-822 date format
    - Atom self-link for feed discovery
    - CDATA-safe HTML content
"""

# Standard library imports
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Optional

# Third-party imports
from feedgen.feed import FeedGenerator

# Local imports
from starpunk.models import Note


def generate_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Builds the channel metadata (title, link, description, language,
    self-link, last build date) and one item per note, in the same order
    as the notes are supplied.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com').
            Trailing slashes are removed before URLs are built.
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel; site_name is
            used instead when this is empty
        notes: List of Note objects to include (should be published only).
            Each note must provide permalink, created_at, html and title.
        limit: Maximum number of items to include (default: 50)

    Returns:
        Pretty-printed RSS 2.0 XML as a str

    Raises:
        ValueError: If site_url or site_name is empty or whitespace-only

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url so joined URLs have a single "/"
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now
    fg.lastBuildDate(datetime.now(timezone.utc))

    # Add items (limit to configured maximum)
    for note in notes[:limit]:
        # feedgen's add_entry() defaults to order="prepend", which would emit
        # the items in the reverse of the order given (newest-first input
        # would be published oldest-first). Append to keep the caller's order.
        fe = fg.add_entry(order="append")

        # Build permalink URL (note.permalink is joined directly onto site_url)
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date; a naive datetime is assumed to be UTC
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Item body: rendered HTML with any "]]>" sequence neutralised.
        # Serialisation of the text (escaping / CDATA) is left to feedgen.
        html_content = clean_html_for_rss(note.html)
        fe.description(html_content)

    # Generate RSS 2.0 XML (pretty-printed); rss_str returns bytes
    return fg.rss_str(pretty=True).decode("utf-8")


def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # email.utils.format_datetime always emits English day/month
    # abbreviations, whereas strftime("%a"/"%b") follows the process locale
    # and could produce names that are not valid in an RFC-822 date.
    return format_datetime(dt)


def get_note_title(note: Note) -> str:
    """
    Extract title from note content

    Uses the note's ``title`` property and falls back to a formatted
    timestamp when reading the title raises one of the handled errors.

    Algorithm:
        1. Try note.title property
        2. Fall back to the note's created_at timestamp if that fails

    Args:
        note: Note object

    Returns:
        Title string. A title longer than 100 characters is cut to its
        first 100 characters (surrounding whitespace stripped) and
        suffixed with "...", so the result may be up to 103 characters.

    Examples:
        >>> # Note with heading
        >>> note = Note(...)  # content: "# My First Note\\n\\n..."
        >>> get_note_title(note)
        'My First Note'

        >>> # Timestamp fallback (title could not be read)
        >>> note = Note(...)
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    try:
        # Use Note's title property (handles extraction logic)
        title = note.title

        # Truncate long titles and mark the cut with an ellipsis
        if len(title) > 100:
            title = title[:100].strip() + "..."

        return title

    except (FileNotFoundError, OSError, AttributeError):
        # If title extraction fails, use timestamp
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")


def clean_html_for_rss(html: str) -> str:
    """
    Ensure HTML is safe for RSS CDATA wrapping

    A CDATA section ends at the first "]]>" sequence, so HTML containing
    that sequence would terminate the section early. This function breaks
    up every such sequence by inserting a space before the ">".

    This is primarily defensive - markdown-rendered HTML should not
    normally contain CDATA end markers.

    Args:
        html: Rendered HTML content from markdown

    Returns:
        The HTML with every "]]>" replaced by "]] >"; unchanged otherwise

    Examples:
        >>> html = "<p>Hello world</p>"
        >>> clean_html_for_rss(html)
        '<p>Hello world</p>'

        >>> # Edge case: HTML containing CDATA end marker
        >>> html = "<p>Example: ]]></p>"
        >>> clean_html_for_rss(html)
        '<p>Example: ]] ></p>'
    """
    # str.replace is a no-op when the marker is absent, so no pre-check needed
    return html.replace("]]>", "]] >")