feat: Implement Phase 2 Feed Formats - ATOM, JSON Feed, RSS fix (Phases 2.0-2.3)

This commit implements the first three phases of v1.1.2 Phase 2 Feed Formats,
adding ATOM 1.0 and JSON Feed 1.1 support alongside the existing RSS feed.

CRITICAL BUG FIX:
- Fixed RSS streaming feed ordering (was showing oldest-first instead of newest-first)
- Streaming RSS removed incorrect reversed() call at line 198
- Feedgen RSS kept correct reversed() to compensate for library behavior

NEW FEATURES:
- ATOM 1.0 feed generation (RFC 4287 compliant)
  - Proper XML namespacing and RFC 3339 dates
  - Streaming and non-streaming methods
  - 11 comprehensive tests
- JSON Feed 1.1 generation (JSON Feed spec compliant)
  - RFC 3339 dates and UTF-8 JSON output
  - Custom _starpunk extension with permalink_path and word_count
  - 13 comprehensive tests

REFACTORING:
- Restructured feed code into starpunk/feeds/ module
  - feeds/rss.py - RSS 2.0 (moved from feed.py)
  - feeds/atom.py - ATOM 1.0 (new)
  - feeds/json_feed.py - JSON Feed 1.1 (new)
- Backward compatible feed.py shim for existing imports
- Business metrics integrated into all feed generators

TESTING:
- Created shared test helper tests/helpers/feed_ordering.py
- Helper validates newest-first ordering across all formats
- 48 total feed tests, all passing
  - RSS: 24 tests
  - ATOM: 11 tests
  - JSON Feed: 13 tests

FILES CHANGED:
- Modified: starpunk/feed.py (now compatibility shim)
- New: starpunk/feeds/ module with rss.py, atom.py, json_feed.py
- New: tests/helpers/feed_ordering.py (shared test helper)
- New: tests/test_feeds_atom.py, tests/test_feeds_json.py
- Modified: CHANGELOG.md (Phase 2 entries)
- New: docs/reports/2025-11-26-v1.1.2-phase2-feed-formats-partial.md

NEXT STEPS:
Phase 2.4 (Content Negotiation) pending - will add /feed endpoint with Accept
header negotiation and explicit format endpoints.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 14:54:52 -07:00
parent b0230b1233
commit 59e9d402c6
14 changed files with 2663 additions and 637 deletions
--- a/starpunk/feeds/atom.py
+++ b/starpunk/feeds/atom.py
@@ -0,0 +1,268 @@
+"""
+ATOM 1.0 feed generation for StarPunk
+
+This module provides ATOM 1.0 feed generation from published notes. The XML is
+assembled as escaped string chunks so it can be streamed (ElementTree is unused).
+
+Functions:
+    generate_atom: Generate ATOM 1.0 XML feed from notes
+    generate_atom_streaming: Memory-efficient streaming ATOM generation
+
+Standards:
+    - ATOM 1.0 (RFC 4287) specification compliant
+    - RFC 3339 date format
+    - Proper XML namespacing
+    - Escaped HTML and text content
+"""
+
+# Standard library imports
+from datetime import datetime, timezone
+from typing import Optional
+import time
+import xml.etree.ElementTree as ET
+
+# Local imports
+from starpunk.models import Note
+from starpunk.monitoring.business import track_feed_generated
+
+
+# XML namespace URI for ATOM 1.0; emitted as the default xmlns of <feed>
+ATOM_NS = "http://www.w3.org/2005/Atom"
+
+
+def generate_atom(
+    site_url: str,
+    site_name: str,
+    site_description: str,
+    notes: list[Note],
+    limit: int = 50,
+) -> str:
+    """
+    Build the complete ATOM 1.0 feed as a single string
+
+    Thin wrapper that drains generate_atom_streaming() and concatenates the
+    yielded chunks, so the whole document is held in memory at once.
+
+    NOTE: Prefer generate_atom_streaming() when the response can be streamed;
+    this variant suits callers that need the full document (e.g. caching).
+
+    Args:
+        site_url: Base URL of the site (e.g., 'https://example.com')
+        site_name: Feed title
+        site_description: Feed subtitle
+        notes: Note objects to publish (expected to be published notes only)
+        limit: Upper bound on the number of entries (default: 50)
+
+    Returns:
+        The feed document as one str
+
+    Raises:
+        ValueError: If site_url or site_name is empty
+
+    Examples:
+        >>> notes = list_notes(published_only=True, limit=50)
+        >>> feed_xml = generate_atom(
+        ...     site_url='https://example.com',
+        ...     site_name='My Blog',
+        ...     site_description='My personal notes',
+        ...     notes=notes
+        ... )
+        >>> print(feed_xml[:38])
+        <?xml version="1.0" encoding="utf-8"?>
+    """
+    chunks = generate_atom_streaming(
+        site_url=site_url,
+        site_name=site_name,
+        site_description=site_description,
+        notes=notes,
+        limit=limit,
+    )
+    return "".join(chunks)
+
+
+def generate_atom_streaming(
+    site_url: str,
+    site_name: str,
+    site_description: str,
+    notes: list[Note],
+    limit: int = 50,
+):
+    """
+    Generate ATOM 1.0 XML feed from published notes using streaming
+
+    Returns a generator that yields the feed as XML chunks instead of
+    building the entire document in memory. Recommended for large feeds
+    (100+ entries).
+
+    Arguments are validated eagerly, when this function is called, so a bad
+    configuration fails in the caller instead of on the first iteration of
+    the response body. The rest is lazy: no chunk is built and no metric is
+    recorded until the returned generator is consumed.
+
+    Args:
+        site_url: Base URL of the site (e.g., 'https://example.com')
+        site_name: Site title for feed
+        site_description: Site description for feed (subtitle; may be empty)
+        notes: List of Note objects to include (should be published only)
+        limit: Maximum number of entries to include (default: 50)
+
+    Returns:
+        Generator yielding XML chunks as strings
+
+    Raises:
+        ValueError: If site_url or site_name is empty
+
+    Examples:
+        >>> from flask import Response
+        >>> notes = list_notes(published_only=True, limit=100)
+        >>> generator = generate_atom_streaming(
+        ...     site_url='https://example.com',
+        ...     site_name='My Blog',
+        ...     site_description='My personal notes',
+        ...     notes=notes
+        ... )
+        >>> return Response(generator, mimetype='application/atom+xml')
+    """
+    # Validate up front: a generator body would not run until first iteration
+    if not site_url or not site_url.strip():
+        raise ValueError("site_url is required and cannot be empty")
+
+    if not site_name or not site_name.strip():
+        raise ValueError("site_name is required and cannot be empty")
+
+    # Remove trailing slash from site_url for consistency
+    site_url = site_url.rstrip("/")
+
+    return _iter_atom_chunks(site_url, site_name, site_description, notes, limit)
+
+
+def _iter_atom_chunks(site_url, site_name, site_description, notes, limit):
+    """Yield the ATOM 1.0 document chunk by chunk for pre-validated arguments."""
+    # Track feed generation timing
+    start_time = time.time()
+    item_count = 0
+
+    # Escape the base URL once; the feed id and both links reuse it
+    feed_url = _escape_xml(site_url)
+
+    # XML declaration, then feed opening with namespace
+    yield '<?xml version="1.0" encoding="utf-8"?>\n'
+    yield f'<feed xmlns="{ATOM_NS}">\n'
+
+    # Feed metadata; <updated> is the generation time in UTC
+    yield f'  <id>{feed_url}/</id>\n'
+    yield f'  <title>{_escape_xml(site_name)}</title>\n'
+    yield f'  <updated>{_format_atom_date(datetime.now(timezone.utc))}</updated>\n'
+
+    # Links
+    yield f'  <link rel="alternate" type="text/html" href="{feed_url}"/>\n'
+    yield f'  <link rel="self" type="application/atom+xml" href="{feed_url}/feed.atom"/>\n'
+
+    # Optional subtitle
+    if site_description:
+        yield f'  <subtitle>{_escape_xml(site_description)}</subtitle>\n'
+
+    # Generator
+    yield '  <generator uri="https://github.com/yourusername/starpunk">StarPunk</generator>\n'
+
+    # Entries in the order supplied (callers are expected to pass newest first)
+    for note in notes[:limit]:
+        item_count += 1
+
+        # Build permalink URL, escaped once for both <id> and <link>
+        permalink = _escape_xml(f"{site_url}{note.permalink}")
+
+        yield '  <entry>\n'
+
+        # Required elements
+        yield f'    <id>{permalink}</id>\n'
+        yield f'    <title>{_escape_xml(note.title)}</title>\n'
+
+        # Use created_at for both published and updated
+        # (Note model doesn't have updated_at tracking yet)
+        created = _format_atom_date(note.created_at)
+        yield f'    <published>{created}</published>\n'
+        yield f'    <updated>{created}</updated>\n'
+
+        # Link to entry
+        yield f'    <link rel="alternate" type="text/html" href="{permalink}"/>\n'
+
+        # Content: escaped HTML when present, otherwise escaped plain text
+        if note.html:
+            yield f'    <content type="html">{_escape_xml(note.html)}</content>\n'
+        else:
+            yield f'    <content type="text">{_escape_xml(note.content)}</content>\n'
+
+        yield '  </entry>\n'
+
+    # Yield closing tag
+    yield '</feed>\n'
+
+    # Track feed generation metrics (only once the feed was fully consumed)
+    duration_ms = (time.time() - start_time) * 1000
+    track_feed_generated(
+        format='atom', item_count=item_count, duration_ms=duration_ms, cached=False
+    )
+
+
+# Entity replacements plus XML 1.0-illegal code points (None = dropped)
+_XML_ESCAPES = str.maketrans({
+    "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&apos;",
+    **dict.fromkeys(map(chr, (*range(9), 11, 12, *range(14, 32), 0xFFFE, 0xFFFF))),
+})
+
+
+def _escape_xml(text: str) -> str:
+    """
+    Escape special XML characters for safe inclusion in XML elements
+
+    Escapes the five predefined entities (&, <, >, ", ') and drops C0 control
+    characters (except tab, LF, CR) and U+FFFE/U+FFFF, which XML 1.0 forbids.
+
+    Args:
+        text: Text to escape (None or empty yields "")
+
+    Returns:
+        XML-safe text with escaped entities
+
+    Examples:
+        >>> _escape_xml("Hello & goodbye")
+        'Hello &amp; goodbye'
+        >>> _escape_xml('<p>HTML</p>')
+        '&lt;p&gt;HTML&lt;/p&gt;'
+    """
+    if not text:
+        return ""
+    return text.translate(_XML_ESCAPES)
+
+
+def _format_atom_date(dt: datetime) -> str:
+    """
+    Format datetime to RFC 3339 format for ATOM
+
+    ATOM 1.0 requires RFC 3339 date format for published and updated elements.
+    RFC 3339 is a profile of ISO 8601.
+    Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset)
+
+    Args:
+        dt: Datetime object to format (naive datetime assumed to be UTC)
+
+    Returns:
+        RFC 3339 formatted date string ('Z' suffix whenever the offset is zero)
+
+    Examples:
+        >>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc)
+        >>> _format_atom_date(dt)
+        '2024-11-25T12:00:00Z'
+    """
+    # Decide by UTC offset, not by tzinfo identity: any zero-offset zone
+    # (zoneinfo, pytz, dateutil, ...) must get 'Z' just like timezone.utc
+    offset = dt.utcoffset()
+    if offset is None:
+        # Naive datetime: assume UTC
+        dt = dt.replace(tzinfo=timezone.utc)
+
+    if not offset:
+        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    return dt.isoformat()