feat: Implement Phase 2 Feed Formats - ATOM, JSON Feed, RSS fix (Phases 2.0-2.3)

This commit implements the first three phases of v1.1.2 Phase 2 Feed Formats, adding ATOM 1.0 and JSON Feed 1.1 support alongside the existing RSS feed.

CRITICAL BUG FIX:
- Fixed RSS streaming feed ordering (was showing oldest-first instead of newest-first)
- Streaming RSS removed incorrect reversed() call at line 198
- Feedgen RSS kept correct reversed() to compensate for library behavior

NEW FEATURES:
- ATOM 1.0 feed generation (RFC 4287 compliant)
  - Proper XML namespacing and RFC 3339 dates
  - Streaming and non-streaming methods
  - 11 comprehensive tests
- JSON Feed 1.1 generation (JSON Feed spec compliant)
  - RFC 3339 dates and UTF-8 JSON output
  - Custom _starpunk extension with permalink_path and word_count
  - 13 comprehensive tests

REFACTORING:
- Restructured feed code into starpunk/feeds/ module
  - feeds/rss.py - RSS 2.0 (moved from feed.py)
  - feeds/atom.py - ATOM 1.0 (new)
  - feeds/json_feed.py - JSON Feed 1.1 (new)
- Backward compatible feed.py shim for existing imports
- Business metrics integrated into all feed generators

TESTING:
- Created shared test helper tests/helpers/feed_ordering.py
- Helper validates newest-first ordering across all formats
- 48 total feed tests, all passing
  - RSS: 24 tests
  - ATOM: 11 tests
  - JSON Feed: 13 tests

FILES CHANGED:
- Modified: starpunk/feed.py (now compatibility shim)
- New: starpunk/feeds/ module with rss.py, atom.py, json_feed.py
- New: tests/helpers/feed_ordering.py (shared test helper)
- New: tests/test_feeds_atom.py, tests/test_feeds_json.py
- Modified: CHANGELOG.md (Phase 2 entries)
- New: docs/reports/2025-11-26-v1.1.2-phase2-feed-formats-partial.md

NEXT STEPS:
Phase 2.4 (Content Negotiation) pending - will add /feed endpoint with Accept header negotiation and explicit format endpoints.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 14:54:52 -07:00
parent b0230b1233
commit 59e9d402c6
14 changed files with 2663 additions and 637 deletions
--- a/starpunk/feeds/rss.py
+++ b/starpunk/feeds/rss.py
@@ -0,0 +1,397 @@
+"""
+RSS 2.0 feed generation for StarPunk
+
+This module provides RSS 2.0 feed generation from published notes using the
+feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
+content, and all required RSS elements.
+
+Functions:
+    generate_rss: Generate RSS 2.0 XML feed from notes
+    generate_rss_streaming: Memory-efficient streaming RSS generation
+    format_rfc822_date: Format datetime to RFC-822 for RSS
+    get_note_title: Extract title from note (first line or timestamp)
+    clean_html_for_rss: Clean HTML for CDATA safety
+
+Standards:
+    - RSS 2.0 specification compliant
+    - RFC-822 date format
+    - Atom self-link for feed discovery
+    - CDATA wrapping for HTML content
+"""
+
+# Standard library imports
+from datetime import datetime, timezone
+from typing import Optional
+import time
+
+# Third-party imports
+from feedgen.feed import FeedGenerator
+
+# Local imports
+from starpunk.models import Note
+from starpunk.monitoring.business import track_feed_generated
+
+
+def generate_rss(
+    site_url: str,
+    site_name: str,
+    site_description: str,
+    notes: list[Note],
+    limit: int = 50,
+) -> str:
+    """
+    Build a complete RSS 2.0 document from published notes with feedgen
+
+    The whole feed is assembled in memory and returned as one string. The
+    channel gets an id, title, alternate link, description, language
+    ("en"), lastBuildDate (current UTC time) and a rel="self" link to
+    "<site_url>/feed.xml". Each note becomes one entry with id, title,
+    link, guid, pubDate and description.
+
+    NOTE: generate_rss_streaming() yields the feed chunk by chunk instead
+    of building one string; this function remains for callers that want
+    the finished document (backwards compatibility, caching).
+
+    Args:
+        site_url: Base URL of the site (e.g., 'https://example.com');
+            trailing slashes are removed before use
+        site_name: Site title for RSS channel
+        site_description: Site description for RSS channel; site_name is
+            used when this is empty
+        notes: List of Note objects to include (should be published only,
+            newest first)
+        limit: Maximum number of items to include (default: 50)
+
+    Returns:
+        RSS 2.0 XML string (decoded from UTF-8, pretty-printed)
+
+    Raises:
+        ValueError: If site_url or site_name is empty or only whitespace
+
+    Examples:
+        >>> notes = list_notes(published_only=True, limit=50)
+        >>> feed_xml = generate_rss(
+        ...     site_url='https://example.com',
+        ...     site_name='My Blog',
+        ...     site_description='My personal notes',
+        ...     notes=notes
+        ... )
+        >>> print(feed_xml[:38])
+        <?xml version='1.0' encoding='UTF-8'?>
+    """
+    # Both values are mandatory; blank strings count as missing
+    if not (site_url and site_url.strip()):
+        raise ValueError("site_url is required and cannot be empty")
+    if not (site_name and site_name.strip()):
+        raise ValueError("site_name is required and cannot be empty")
+
+    # Drop trailing slashes so URLs are built from a consistent base
+    base_url = site_url.rstrip("/")
+
+    # Channel-level metadata
+    feed = FeedGenerator()
+    feed.id(base_url)
+    feed.title(site_name)
+    feed.link(href=base_url, rel="alternate")
+    feed.description(site_description or site_name)
+    feed.language("en")
+
+    # Self-link for feed discovery
+    feed.link(
+        href=f"{base_url}/feed.xml",
+        rel="self",
+        type="application/rss+xml",
+    )
+    feed.lastBuildDate(datetime.now(timezone.utc))
+
+    # Timing covers entry creation and serialisation only
+    started_at = time.time()
+
+    # The notes arrive newest first. They are added oldest first because,
+    # per the original implementation note, feedgen reverses entry order
+    # on output (not verified here against the feedgen documentation).
+    selected = notes[:limit]
+    for note in reversed(selected):
+        entry = feed.add_entry()
+
+        item_url = f"{base_url}{note.permalink}"
+        entry.id(item_url)
+        entry.title(get_note_title(note))
+        entry.link(href=item_url)
+        entry.guid(item_url, permalink=True)
+
+        # Naive timestamps are taken to be UTC
+        published = note.created_at
+        if published.tzinfo is None:
+            published = published.replace(tzinfo=timezone.utc)
+        entry.pubDate(published)
+
+        # "]]>" is defused before the HTML is handed to feedgen
+        entry.description(clean_html_for_rss(note.html))
+
+    rendered = feed.rss_str(pretty=True).decode("utf-8")
+
+    # Report generation metrics
+    elapsed_ms = (time.time() - started_at) * 1000
+    track_feed_generated(
+        format='rss',
+        item_count=min(len(notes), limit),
+        duration_ms=elapsed_ms,
+        cached=False
+    )
+
+    return rendered
+
+
+def generate_rss_streaming(
+    site_url: str,
+    site_name: str,
+    site_description: str,
+    notes: list[Note],
+    limit: int = 50,
+):
+    """
+    Yield an RSS 2.0 feed for published notes as a sequence of XML chunks
+
+    Generator counterpart of generate_rss(): the XML is written by hand
+    and handed out piece by piece (declaration, channel metadata, one
+    chunk per item, closing tags), so the full document is never joined
+    into a single string here.
+
+    Because this is a generator function, nothing runs until the first
+    chunk is requested: the ValueError below surfaces on first iteration,
+    and the metrics call only happens once the generator is exhausted.
+
+    Args:
+        site_url: Base URL of the site (e.g., 'https://example.com');
+            trailing slashes are removed before use
+        site_name: Site title for RSS channel
+        site_description: Site description for RSS channel; site_name is
+            used when this is empty
+        notes: List of Note objects to include (should be published only,
+            newest first - items are emitted in list order)
+        limit: Maximum number of items to include (default: 50)
+
+    Yields:
+        XML chunks as strings (the declaration advertises UTF-8)
+
+    Raises:
+        ValueError: If site_url or site_name is empty or only whitespace
+
+    Examples:
+        >>> from flask import Response
+        >>> notes = list_notes(published_only=True, limit=100)
+        >>> generator = generate_rss_streaming(
+        ...     site_url='https://example.com',
+        ...     site_name='My Blog',
+        ...     site_description='My personal notes',
+        ...     notes=notes
+        ... )
+        >>> return Response(generator, mimetype='application/rss+xml')
+    """
+    # Both values are mandatory; blank strings count as missing
+    if not (site_url and site_url.strip()):
+        raise ValueError("site_url is required and cannot be empty")
+    if not (site_name and site_name.strip()):
+        raise ValueError("site_name is required and cannot be empty")
+
+    # Drop trailing slashes so URLs are built from a consistent base
+    base_url = site_url.rstrip("/")
+
+    # Metrics bookkeeping
+    started_at = time.time()
+    item_count = 0
+
+    # lastBuildDate is the moment generation starts
+    build_stamp = format_rfc822_date(datetime.now(timezone.utc))
+
+    # XML declaration, root element and channel metadata
+    yield '<?xml version="1.0" encoding="UTF-8"?>\n'
+    yield '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">\n'
+    yield "  <channel>\n"
+    yield f"    <title>{_escape_xml(site_name)}</title>\n"
+    yield f"    <link>{_escape_xml(base_url)}</link>\n"
+    yield f"    <description>{_escape_xml(site_description or site_name)}</description>\n"
+    yield "    <language>en</language>\n"
+    yield f"    <lastBuildDate>{build_stamp}</lastBuildDate>\n"
+    yield (
+        f'    <atom:link href="{_escape_xml(base_url)}/feed.xml" '
+        'rel="self" type="application/rss+xml"/>\n'
+    )
+
+    # Items are emitted in list order; the caller is expected to pass
+    # notes newest first, so no reversal is applied here
+    for item_count, note in enumerate(notes[:limit], start=1):
+        item_url = _escape_xml(f"{base_url}{note.permalink}")
+        item_title = _escape_xml(get_note_title(note))
+
+        # Naive timestamps are taken to be UTC
+        published = note.created_at
+        if published.tzinfo is None:
+            published = published.replace(tzinfo=timezone.utc)
+        published_text = format_rfc822_date(published)
+
+        # "]]>" must not appear inside the CDATA section below
+        body_html = clean_html_for_rss(note.html)
+
+        # One chunk per item
+        yield (
+            "    <item>\n"
+            f"      <title>{item_title}</title>\n"
+            f"      <link>{item_url}</link>\n"
+            f'      <guid isPermaLink="true">{item_url}</guid>\n'
+            f"      <pubDate>{published_text}</pubDate>\n"
+            f"      <description><![CDATA[{body_html}]]></description>\n"
+            "    </item>\n"
+        )
+
+    # Close the channel and the document
+    yield "  </channel>\n"
+    yield "</rss>\n"
+
+    # Reached only after the consumer has drained every chunk, so an
+    # abandoned stream records no metrics
+    track_feed_generated(
+        format='rss',
+        item_count=item_count,
+        duration_ms=(time.time() - started_at) * 1000,
+        cached=False
+    )
+
+
+def _escape_xml(text: str) -> str:
+    """
+    Return text with the five predefined XML entities escaped
+
+    Replaces &, <, >, " and ' with &amp;, &lt;, &gt;, &quot; and &apos;
+    so the result can be placed in element content or attribute values.
+
+    Args:
+        text: Text to escape (any falsy value yields an empty string)
+
+    Returns:
+        XML-safe text with escaped entities
+
+    Examples:
+        >>> _escape_xml("Hello & goodbye")
+        'Hello &amp; goodbye'
+        >>> _escape_xml('<tag>')
+        '&lt;tag&gt;'
+    """
+    if not text:
+        return ""
+
+    # One pass over the input: each character is mapped independently, so
+    # the "&" inside an inserted entity is never escaped a second time
+    entity_table = str.maketrans(
+        {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&apos;"}
+    )
+
+    return text.translate(entity_table)
+
+
+def format_rfc822_date(dt: datetime) -> str:
+    """
+    Format datetime to RFC-822 format for RSS
+
+    RSS 2.0 requires RFC-822 dates for pubDate and lastBuildDate, with
+    English day and month names: "Mon, 18 Nov 2024 12:00:00 +0000".
+    strftime("%a"/"%b") follows the process locale, so the string is built
+    with email.utils.format_datetime, which always emits English names.
+
+    Args:
+        dt: Datetime object to format (naive datetime assumed to be UTC)
+
+    Returns:
+        RFC-822 formatted date string
+
+    Examples:
+        >>> format_rfc822_date(datetime(2024, 11, 18, 12, 0, 0))
+        'Mon, 18 Nov 2024 12:00:00 +0000'
+    """
+    # Local import: only this helper needs the stdlib email utilities
+    from email.utils import format_datetime
+
+    # A naive value is stamped as UTC so the offset renders as +0000
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return format_datetime(dt)
+
+
+def get_note_title(note: Note) -> str:
+    """
+    Return a display title for a note, with a timestamp fallback
+
+    Algorithm:
+        1. Read note.title
+        2. If it is longer than 100 characters, keep the first 100
+           (trailing whitespace stripped) and append "..."
+        3. If the title is None or blank, or reading it raises OSError or
+           AttributeError, format note.created_at instead
+
+    Args:
+        note: Note object
+
+    Returns:
+        Title string: at most 100 characters of the title, plus "..."
+        when it was truncated, or a timestamp such as
+        'November 18, 2024 at 12:00 PM'
+
+    Examples:
+        >>> note = Note(...)  # title: "My First Note"
+        >>> get_note_title(note)
+        'My First Note'
+
+        >>> note = Note(...)  # title unavailable
+        >>> get_note_title(note)
+        'November 18, 2024 at 12:00 PM'
+    """
+    try:
+        # How note.title is derived is defined by the Note model
+        title = note.title
+        # None or whitespace-only counts as "no title" instead of crashing
+        # on len(None) or producing an empty <title> element
+        has_title = title is not None and bool(title.strip())
+    except (FileNotFoundError, OSError, AttributeError):
+        has_title = False
+
+    if not has_title:
+        return note.created_at.strftime("%B %d, %Y at %I:%M %p")
+    if len(title) > 100:
+        return title[:100].strip() + "..."
+    return title
+
+
+def clean_html_for_rss(html: str) -> str:
+    """
+    Neutralise CDATA end markers in HTML destined for an RSS description
+
+    A literal "]]>" inside the content would terminate a CDATA section
+    early (generate_rss_streaming wraps the description in CDATA), so
+    every occurrence is rewritten to "]] >" with a space before the ">".
+
+    This is defensive only: markdown-rendered HTML is not expected to
+    contain the marker, and input without it is returned unchanged.
+
+    Args:
+        html: Rendered HTML content from markdown
+
+    Returns:
+        The HTML with each "]]>" replaced by "]] >"
+
+    Examples:
+        >>> html = "<p>Hello world</p>"
+        >>> clean_html_for_rss(html)
+        '<p>Hello world</p>'
+
+        >>> # Edge case: HTML containing CDATA end marker
+        >>> html = "<p>Example: ]]></p>"
+        >>> clean_html_for_rss(html)
+        '<p>Example: ]] ></p>'
+    """
+    cdata_end = "]]>"
+
+    # Only touch the string when the marker is actually present
+    if cdata_end not in html:
+        return html
+    return html.replace(cdata_end, "]] >")