feat: add RSS feed generation module

Implements RSS 2.0 feed generation using feedgen library. Features: - generate_feed() creates standards-compliant RSS 2.0 XML - RFC-822 date formatting for pubDate elements - Title extraction from note content (first line or timestamp) - CDATA safety for HTML content - Configurable feed item limits Follows ADR-014 RSS implementation strategy. Related: docs/decisions/ADR-014-rss-feed-implementation.md
2025-11-19 08:40:46 -07:00
parent b02df151a1
commit 856148209a
1 changed files with 229 additions and 0 deletions
--- a/starpunk/feed.py
+++ b/starpunk/feed.py
@@ -0,0 +1,229 @@
 """
 RSS feed generation for StarPunk
 This module provides RSS 2.0 feed generation from published notes using the
 feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
 content, and all required RSS elements.
 Functions:
    generate_feed: Generate RSS 2.0 XML feed from notes
    format_rfc822_date: Format datetime to RFC-822 for RSS
    get_note_title: Extract title from note (first line or timestamp)
    clean_html_for_rss: Clean HTML for CDATA safety
 Standards:
    - RSS 2.0 specification compliant
    - RFC-822 date format
    - Atom self-link for feed discovery
    - CDATA wrapping for HTML content
 """
 # Standard library imports
 from datetime import datetime, timezone
 from typing import Optional
 # Third-party imports
 from feedgen.feed import FeedGenerator
 # Local imports
 from starpunk.models import Note
 def generate_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
 ) -> str:
    """
    Generate RSS 2.0 XML feed from published notes
    Builds the channel metadata (title, link, description, language, Atom
    self-link and lastBuildDate) and one item per note. Items are emitted in
    the same order as ``notes``, so pass the notes newest-first to get a
    conventional newest-first feed.
    Args:
        site_url: Base URL of the site (e.g., 'https://example.com'); a
            trailing slash is stripped before URLs are built
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel; when empty,
            site_name is used as the channel description instead
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50); applied
            as a slice over the start of ``notes``
    Returns:
        RSS 2.0 XML document as a str (pretty-printed, decoded from the
        UTF-8 bytes produced by feedgen)
    Raises:
        ValueError: If site_url or site_name is empty
    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters (whitespace-only counts as empty)
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")
    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")
    # Remove trailing slash so permalinks and the self-link never contain "//"
    site_url = site_url.rstrip("/")
    # Create feed generator
    fg = FeedGenerator()
    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    # RSS requires a channel description, so fall back to the site name
    fg.description(site_description or site_name)
    fg.language("en")
    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")
    # Set last build date to now (timezone-aware UTC)
    fg.lastBuildDate(datetime.now(timezone.utc))
    # Add items (limit to configured maximum)
    for note in notes[:limit]:
        # feedgen's add_entry() defaults to order="prepend", which would emit
        # the items in the reverse of the order supplied by the caller.
        # Appending keeps the feed in the same order as ``notes``.
        fe = fg.add_entry(order="append")
        # Build permalink URL (note.permalink is joined directly onto site_url)
        permalink = f"{site_url}{note.permalink}"
        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)
        # Set publication date (ensure it carries a timezone)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            # If naive datetime, assume UTC
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)
        # Set description to the note's rendered HTML.
        # NOTE(review): this module assumes feedgen wraps the description in
        # CDATA; feedgen may instead XML-escape description text - confirm
        # against feedgen's RSS entry serialization.
        html_content = clean_html_for_rss(note.html)
        fe.description(html_content)
    # Generate RSS 2.0 XML (pretty-printed) and decode feedgen's bytes to str
    return fg.rss_str(pretty=True).decode("utf-8")
 def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS
    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"
    The day and month abbreviations are always English, regardless of the
    process locale, as RFC-822 requires.
    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)
    Returns:
        RFC-822 formatted date string with a numeric timezone offset
    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Imported locally so this function carries its own dependency
    from email.utils import format_datetime
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    # strftime("%a ... %b") follows the active LC_TIME locale and could emit
    # non-English names; format_datetime uses fixed English abbreviations
    # and yields the same "Mon, 18 Nov 2024 12:00:00 +0000" layout.
    return format_datetime(dt)
 def get_note_title(note: Note) -> str:
    """
    Extract title from note content
    Reads the note's ``title`` attribute and shortens it for use as an RSS
    item title. Falls back to a formatted creation timestamp when the title
    cannot be read or is empty.
    Algorithm:
        1. Try note.title
        2. Fall back to note.created_at formatted as a timestamp if reading
           the title raises FileNotFoundError, OSError or AttributeError,
           or if the title is None or empty
        3. Truncate to 100 characters in total, ending in "..." when cut
    Args:
        note: Note object (must provide ``created_at``; ``title`` is optional)
    Returns:
        Title string (max 100 chars including the "..." suffix when truncated)
    Examples:
        >>> # Note with heading
        >>> note = Note(...)  # content: "# My First Note\\n\\n..."
        >>> get_note_title(note)
        'My First Note'
        >>> # Note whose title is unavailable (timestamp fallback)
        >>> note = Note(...)
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    max_length = 100
    suffix = "..."
    # Keep the try body to the one access that can fail
    try:
        title = note.title
    except (FileNotFoundError, OSError, AttributeError):
        title = None
    if not title:
        # Title missing, unreadable or empty: use the creation timestamp
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")
    if len(title) > max_length:
        # Reserve room for the suffix so the result never exceeds max_length
        title = title[: max_length - len(suffix)].strip() + suffix
    return title
 def clean_html_for_rss(html: str) -> str:
    """
    Neutralize CDATA terminators in rendered HTML
    A CDATA section ends at the first "]]>" sequence, so any occurrence
    inside the content would end the section early. Each occurrence is
    rewritten as "]] >" (a space inserted before the ">"). Input without
    the sequence is returned unchanged.
    Args:
        html: Rendered HTML content from markdown
    Returns:
        The HTML with every "]]>" replaced by "]] >"
    Examples:
        >>> clean_html_for_rss("<p>Hello world</p>")
        '<p>Hello world</p>'
        >>> clean_html_for_rss("<p>Example: ]]></p>")
        '<p>Example: ]] ></p>'
    """
    cdata_end = "]]>"
    # Common case: nothing to neutralize, hand the input straight back
    if cdata_end not in html:
        return html
    # Insert a space so the sequence no longer reads as a CDATA terminator
    broken_marker = "]] >"
    return html.replace(cdata_end, broken_marker)