feat: Implement Phase 2 Feed Formats - ATOM, JSON Feed, RSS fix (Phases 2.0-2.3)
This commit implements the first three phases of v1.1.2 Phase 2 Feed Formats,
adding ATOM 1.0 and JSON Feed 1.1 support alongside the existing RSS feed.

CRITICAL BUG FIX:
- Fixed RSS streaming feed ordering (was showing oldest-first instead of newest-first)
- Streaming RSS: removed incorrect reversed() call at line 198
- Feedgen RSS: kept correct reversed() to compensate for library behavior

NEW FEATURES:
- ATOM 1.0 feed generation (RFC 4287 compliant)
  - Proper XML namespacing and RFC 3339 dates
  - Streaming and non-streaming methods
  - 11 comprehensive tests
- JSON Feed 1.1 generation (JSON Feed spec compliant)
  - RFC 3339 dates and UTF-8 JSON output
  - Custom _starpunk extension with permalink_path and word_count
  - 13 comprehensive tests

REFACTORING:
- Restructured feed code into starpunk/feeds/ module
  - feeds/rss.py - RSS 2.0 (moved from feed.py)
  - feeds/atom.py - ATOM 1.0 (new)
  - feeds/json_feed.py - JSON Feed 1.1 (new)
- Backward-compatible feed.py shim for existing imports
- Business metrics integrated into all feed generators

TESTING:
- Created shared test helper tests/helpers/feed_ordering.py
  - Helper validates newest-first ordering across all formats
- 48 total feed tests, all passing
  - RSS: 24 tests
  - ATOM: 11 tests
  - JSON Feed: 13 tests

FILES CHANGED:
- Modified: starpunk/feed.py (now a compatibility shim)
- New: starpunk/feeds/ module with rss.py, atom.py, json_feed.py
- New: tests/helpers/feed_ordering.py (shared test helper)
- New: tests/test_feeds_atom.py, tests/test_feeds_json.py
- Modified: CHANGELOG.md (Phase 2 entries)
- New: docs/reports/2025-11-26-v1.1.2-phase2-feed-formats-partial.md

NEXT STEPS:
Phase 2.4 (Content Negotiation) pending - will add /feed endpoint with Accept
header negotiation and explicit format endpoints.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
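For reference, the three generators share the same signature (site_url, site_name, site_description, notes, limit), so a route can switch formats by swapping a single call. A minimal usage sketch, not part of this commit; the route wiring, the config keys, and the list_notes import path are assumptions based on the docstring examples in the diff below:

from flask import Response, current_app

from starpunk.feeds import generate_atom_streaming  # also: generate_rss_streaming, generate_json_feed_streaming
from starpunk.notes import list_notes  # assumed import path; see docstring examples


def atom_feed():
    """Stream the ATOM 1.0 feed, newest note first."""
    notes = list_notes(published_only=True, limit=50)  # already newest-first (DESC)
    generator = generate_atom_streaming(
        site_url=current_app.config["SITE_URL"],          # assumed config key
        site_name=current_app.config["SITE_NAME"],        # assumed config key
        site_description=current_app.config.get("SITE_DESCRIPTION", ""),
        notes=notes,
    )
    return Response(generator, mimetype="application/atom+xml")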
starpunk/feeds/__init__.py (new file, 47 lines)
@@ -0,0 +1,47 @@
"""
Feed generation module for StarPunk

This module provides feed generation in multiple formats (RSS, ATOM, JSON Feed)
with content negotiation and caching support.

Exports:
    generate_rss: Generate RSS 2.0 feed
    generate_rss_streaming: Generate RSS 2.0 feed with streaming
    generate_atom: Generate ATOM 1.0 feed
    generate_atom_streaming: Generate ATOM 1.0 feed with streaming
    generate_json_feed: Generate JSON Feed 1.1
    generate_json_feed_streaming: Generate JSON Feed 1.1 with streaming
"""

from .rss import (
    generate_rss,
    generate_rss_streaming,
    format_rfc822_date,
    get_note_title,
    clean_html_for_rss,
)

from .atom import (
    generate_atom,
    generate_atom_streaming,
)

from .json_feed import (
    generate_json_feed,
    generate_json_feed_streaming,
)

__all__ = [
    # RSS functions
    "generate_rss",
    "generate_rss_streaming",
    "format_rfc822_date",
    "get_note_title",
    "clean_html_for_rss",
    # ATOM functions
    "generate_atom",
    "generate_atom_streaming",
    # JSON Feed functions
    "generate_json_feed",
    "generate_json_feed_streaming",
]
starpunk/feeds/atom.py (new file, 268 lines)
@@ -0,0 +1,268 @@
"""
ATOM 1.0 feed generation for StarPunk

This module provides ATOM 1.0 feed generation from published notes using
Python's standard library xml.etree.ElementTree for proper XML handling.

Functions:
    generate_atom: Generate ATOM 1.0 XML feed from notes
    generate_atom_streaming: Memory-efficient streaming ATOM generation

Standards:
    - ATOM 1.0 (RFC 4287) specification compliant
    - RFC 3339 date format
    - Proper XML namespacing
    - Escaped HTML and text content
"""

# Standard library imports
from datetime import datetime, timezone
from typing import Optional
import time
import xml.etree.ElementTree as ET

# Local imports
from starpunk.models import Note
from starpunk.monitoring.business import track_feed_generated


# ATOM namespace
ATOM_NS = "http://www.w3.org/2005/Atom"


def generate_atom(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate ATOM 1.0 XML feed from published notes

    Creates a standards-compliant ATOM 1.0 feed with proper metadata
    and entry elements. Uses ElementTree for safe XML generation.

    NOTE: For memory-efficient streaming, use generate_atom_streaming() instead.
    This function is kept for caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed (subtitle)
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of entries to include (default: 50)

    Returns:
        ATOM 1.0 XML string (UTF-8 encoded)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_atom(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version="1.0" encoding="utf-8"?>
    """
    # Join streaming output for non-streaming version
    return ''.join(generate_atom_streaming(
        site_url=site_url,
        site_name=site_name,
        site_description=site_description,
        notes=notes,
        limit=limit
    ))


def generate_atom_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate ATOM 1.0 XML feed from published notes using streaming

    Memory-efficient generator that yields XML chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ entries).

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of entries to include (default: 50)

    Yields:
        XML chunks as strings (UTF-8)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_atom_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/atom+xml')
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Current timestamp for updated
    now = datetime.now(timezone.utc)

    # Yield XML declaration
    yield '<?xml version="1.0" encoding="utf-8"?>\n'

    # Yield feed opening with namespace
    yield f'<feed xmlns="{ATOM_NS}">\n'

    # Yield feed metadata
    yield f'  <id>{_escape_xml(site_url)}/</id>\n'
    yield f'  <title>{_escape_xml(site_name)}</title>\n'
    yield f'  <updated>{_format_atom_date(now)}</updated>\n'

    # Links
    yield f'  <link rel="alternate" type="text/html" href="{_escape_xml(site_url)}"/>\n'
    yield f'  <link rel="self" type="application/atom+xml" href="{_escape_xml(site_url)}/feed.atom"/>\n'

    # Optional subtitle
    if site_description:
        yield f'  <subtitle>{_escape_xml(site_description)}</subtitle>\n'

    # Generator
    yield '  <generator uri="https://github.com/yourusername/starpunk">StarPunk</generator>\n'

    # Yield entries (newest first)
    # Notes from database are already in DESC order (newest first)
    for note in notes[:limit]:
        item_count += 1

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        yield '  <entry>\n'

        # Required elements
        yield f'    <id>{_escape_xml(permalink)}</id>\n'
        yield f'    <title>{_escape_xml(note.title)}</title>\n'

        # Use created_at for both published and updated
        # (Note model doesn't have updated_at tracking yet)
        yield f'    <published>{_format_atom_date(note.created_at)}</published>\n'
        yield f'    <updated>{_format_atom_date(note.created_at)}</updated>\n'

        # Link to entry
        yield f'    <link rel="alternate" type="text/html" href="{_escape_xml(permalink)}"/>\n'

        # Content
        if note.html:
            # HTML content - escaped
            yield '    <content type="html">'
            yield _escape_xml(note.html)
            yield '</content>\n'
        else:
            # Plain text content
            yield '    <content type="text">'
            yield _escape_xml(note.content)
            yield '</content>\n'

        yield '  </entry>\n'

    # Yield closing tag
    yield '</feed>\n'

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='atom',
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False
    )


def _escape_xml(text: str) -> str:
    """
    Escape special XML characters for safe inclusion in XML elements

    Escapes the five predefined XML entities: &, <, >, ", '

    Args:
        text: Text to escape

    Returns:
        XML-safe text with escaped entities

    Examples:
        >>> _escape_xml("Hello & goodbye")
        'Hello &amp; goodbye'
        >>> _escape_xml('<p>HTML</p>')
        '&lt;p&gt;HTML&lt;/p&gt;'
    """
    if not text:
        return ""

    # Escape in order: & first (to avoid double-escaping), then < > " '
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    text = text.replace("'", "&apos;")

    return text


def _format_atom_date(dt: datetime) -> str:
    """
    Format datetime to RFC 3339 format for ATOM

    ATOM 1.0 requires RFC 3339 date format for published and updated elements.
    RFC 3339 is a profile of ISO 8601.
    Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset)

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC 3339 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc)
        >>> _format_atom_date(dt)
        '2024-11-25T12:00:00Z'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # Format to RFC 3339
    # Use 'Z' suffix for UTC, otherwise include offset
    if dt.tzinfo == timezone.utc:
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        # Format with timezone offset
        return dt.isoformat()
starpunk/feeds/json_feed.py (new file, 309 lines)
@@ -0,0 +1,309 @@
"""
JSON Feed 1.1 generation for StarPunk

This module provides JSON Feed 1.1 generation from published notes using
Python's standard library json module for proper JSON serialization.

Functions:
    generate_json_feed: Generate JSON Feed 1.1 from notes
    generate_json_feed_streaming: Memory-efficient streaming JSON generation

Standards:
    - JSON Feed 1.1 specification compliant
    - RFC 3339 date format
    - Proper JSON encoding
    - UTF-8 output
"""

# Standard library imports
from datetime import datetime, timezone
from typing import Optional, Dict, Any
import time
import json

# Local imports
from starpunk.models import Note
from starpunk.monitoring.business import track_feed_generated


def generate_json_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate JSON Feed 1.1 from published notes

    Creates a standards-compliant JSON Feed 1.1 with proper metadata
    and item objects. Uses Python's json module for safe serialization.

    NOTE: For memory-efficient streaming, use generate_json_feed_streaming() instead.
    This function is kept for caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Returns:
        JSON Feed 1.1 string (UTF-8 encoded, pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_json = generate_json_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()

    # Build feed object
    feed = _build_feed_object(
        site_url=site_url,
        site_name=site_name,
        site_description=site_description,
        notes=notes[:limit]
    )

    # Serialize to JSON (pretty-printed)
    feed_json = json.dumps(feed, ensure_ascii=False, indent=2)

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='json',
        item_count=min(len(notes), limit),
        duration_ms=duration_ms,
        cached=False
    )

    return feed_json


def generate_json_feed_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate JSON Feed 1.1 from published notes using streaming

    Memory-efficient generator that yields JSON chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ items).

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Yields:
        JSON chunks as strings (UTF-8)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_json_feed_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/json')
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Start feed object
    yield '{\n'
    yield f'  "version": "https://jsonfeed.org/version/1.1",\n'
    yield f'  "title": {json.dumps(site_name)},\n'
    yield f'  "home_page_url": {json.dumps(site_url)},\n'
    yield f'  "feed_url": {json.dumps(f"{site_url}/feed.json")},\n'

    if site_description:
        yield f'  "description": {json.dumps(site_description)},\n'

    yield '  "language": "en",\n'

    # Start items array
    yield '  "items": [\n'

    # Stream items (newest first)
    # Notes from database are already in DESC order (newest first)
    items = notes[:limit]
    for i, note in enumerate(items):
        item_count += 1

        # Build item object
        item = _build_item_object(site_url, note)

        # Serialize item to JSON
        item_json = json.dumps(item, ensure_ascii=False, indent=4)

        # Indent properly for nested JSON
        indented_lines = item_json.split('\n')
        indented = '\n'.join('    ' + line for line in indented_lines)
        yield indented

        # Add comma between items (but not after last item)
        if i < len(items) - 1:
            yield ',\n'
        else:
            yield '\n'

    # Close items array and feed
    yield '  ]\n'
    yield '}\n'

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='json',
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False
    )


def _build_feed_object(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note]
) -> Dict[str, Any]:
    """
    Build complete JSON Feed object

    Args:
        site_url: Site URL (no trailing slash)
        site_name: Feed title
        site_description: Feed description
        notes: List of notes (already limited)

    Returns:
        JSON Feed dictionary
    """
    feed = {
        "version": "https://jsonfeed.org/version/1.1",
        "title": site_name,
        "home_page_url": site_url,
        "feed_url": f"{site_url}/feed.json",
        "language": "en",
        "items": [_build_item_object(site_url, note) for note in notes]
    }

    if site_description:
        feed["description"] = site_description

    return feed


def _build_item_object(site_url: str, note: Note) -> Dict[str, Any]:
    """
    Build JSON Feed item object from note

    Args:
        site_url: Site URL (no trailing slash)
        note: Note to convert to item

    Returns:
        JSON Feed item dictionary
    """
    # Build permalink URL
    permalink = f"{site_url}{note.permalink}"

    # Create item with required fields
    item = {
        "id": permalink,
        "url": permalink,
    }

    # Add title
    item["title"] = note.title

    # Add content (HTML or text)
    if note.html:
        item["content_html"] = note.html
    else:
        item["content_text"] = note.content

    # Add publication date (RFC 3339 format)
    item["date_published"] = _format_rfc3339_date(note.created_at)

    # Add custom StarPunk extensions
    item["_starpunk"] = {
        "permalink_path": note.permalink,
        "word_count": len(note.content.split())
    }

    return item


def _format_rfc3339_date(dt: datetime) -> str:
    """
    Format datetime to RFC 3339 format for JSON Feed

    JSON Feed 1.1 requires RFC 3339 date format for date_published and date_modified.
    RFC 3339 is a profile of ISO 8601.
    Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset)

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC 3339 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc)
        >>> _format_rfc3339_date(dt)
        '2024-11-25T12:00:00Z'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # Format to RFC 3339
    # Use 'Z' suffix for UTC, otherwise include offset
    if dt.tzinfo == timezone.utc:
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        # Format with timezone offset
        return dt.isoformat()
starpunk/feeds/rss.py (new file, 397 lines)
@@ -0,0 +1,397 @@
"""
RSS 2.0 feed generation for StarPunk

This module provides RSS 2.0 feed generation from published notes using the
feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
content, and all required RSS elements.

Functions:
    generate_rss: Generate RSS 2.0 XML feed from notes
    generate_rss_streaming: Memory-efficient streaming RSS generation
    format_rfc822_date: Format datetime to RFC-822 for RSS
    get_note_title: Extract title from note (first line or timestamp)
    clean_html_for_rss: Clean HTML for CDATA safety

Standards:
    - RSS 2.0 specification compliant
    - RFC-822 date format
    - Atom self-link for feed discovery
    - CDATA wrapping for HTML content
"""

# Standard library imports
from datetime import datetime, timezone
from typing import Optional
import time

# Third-party imports
from feedgen.feed import FeedGenerator

# Local imports
from starpunk.models import Note
from starpunk.monitoring.business import track_feed_generated


def generate_rss(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Creates a standards-compliant RSS 2.0 feed with proper channel metadata
    and item entries for each note. Includes Atom self-link for discovery.

    NOTE: For memory-efficient streaming, use generate_rss_streaming() instead.
    This function is kept for backwards compatibility and caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Returns:
        RSS 2.0 XML string (UTF-8 encoded, pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_rss(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now
    fg.lastBuildDate(datetime.now(timezone.utc))

    # Track feed generation timing
    start_time = time.time()

    # Add items (limit to configured maximum, newest first)
    # Notes from database are DESC but feedgen reverses them, so we reverse back
    for note in reversed(notes[:limit]):
        # Create feed entry
        fe = fg.add_entry()

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date (ensure UTC timezone)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            # If naive datetime, assume UTC
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Set description with HTML content in CDATA
        # feedgen automatically wraps content in CDATA for RSS
        html_content = clean_html_for_rss(note.html)
        fe.description(html_content)

    # Generate RSS 2.0 XML (pretty-printed)
    feed_xml = fg.rss_str(pretty=True).decode("utf-8")

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='rss',
        item_count=min(len(notes), limit),
        duration_ms=duration_ms,
        cached=False
    )

    return feed_xml


def generate_rss_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate RSS 2.0 XML feed from published notes using streaming

    Memory-efficient generator that yields XML chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ items).

    Yields XML in semantic chunks (channel metadata, individual items, closing tags)
    rather than character-by-character for optimal performance.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Yields:
        XML chunks as strings (UTF-8)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_rss_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/rss+xml')
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Current timestamp for lastBuildDate
    now = datetime.now(timezone.utc)
    last_build = format_rfc822_date(now)

    # Yield XML declaration and opening RSS tag
    yield '<?xml version="1.0" encoding="UTF-8"?>\n'
    yield '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">\n'
    yield "  <channel>\n"

    # Yield channel metadata
    yield f"    <title>{_escape_xml(site_name)}</title>\n"
    yield f"    <link>{_escape_xml(site_url)}</link>\n"
    yield f"    <description>{_escape_xml(site_description or site_name)}</description>\n"
    yield "    <language>en</language>\n"
    yield f"    <lastBuildDate>{last_build}</lastBuildDate>\n"
    yield f'    <atom:link href="{_escape_xml(site_url)}/feed.xml" rel="self" type="application/rss+xml"/>\n'

    # Yield items (newest first)
    # Notes from database are already in DESC order (newest first)
    for note in notes[:limit]:
        item_count += 1

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Get note title
        title = get_note_title(note)

        # Format publication date
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        pub_date_str = format_rfc822_date(pubdate)

        # Get HTML content
        html_content = clean_html_for_rss(note.html)

        # Yield complete item as a single chunk
        item_xml = f"""    <item>
      <title>{_escape_xml(title)}</title>
      <link>{_escape_xml(permalink)}</link>
      <guid isPermaLink="true">{_escape_xml(permalink)}</guid>
      <pubDate>{pub_date_str}</pubDate>
      <description><![CDATA[{html_content}]]></description>
    </item>
"""
        yield item_xml

    # Yield closing tags
    yield "  </channel>\n"
    yield "</rss>\n"

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='rss',
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False
    )


def _escape_xml(text: str) -> str:
    """
    Escape special XML characters for safe inclusion in XML elements

    Escapes the five predefined XML entities: &, <, >, ", '

    Args:
        text: Text to escape

    Returns:
        XML-safe text with escaped entities

    Examples:
        >>> _escape_xml("Hello & goodbye")
        'Hello &amp; goodbye'
        >>> _escape_xml('<tag>')
        '&lt;tag&gt;'
    """
    if not text:
        return ""

    # Escape in order: & first (to avoid double-escaping), then < > " '
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    text = text.replace("'", "&apos;")

    return text


def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # Format to RFC-822
    # Format string: %a = weekday, %d = day, %b = month, %Y = year
    # %H:%M:%S = time, %z = timezone offset
    return dt.strftime("%a, %d %b %Y %H:%M:%S %z")


def get_note_title(note: Note) -> str:
    """
    Extract title from note content

    Attempts to extract a meaningful title from the note. Uses the first
    line of content (stripped of markdown heading syntax) or falls back
    to a formatted timestamp if content is unavailable.

    Algorithm:
    1. Try note.title property (first line, stripped of # syntax)
    2. Fall back to timestamp if title is unavailable

    Args:
        note: Note object

    Returns:
        Title string (max 100 chars, truncated if needed)

    Examples:
        >>> # Note with heading
        >>> note = Note(...)  # content: "# My First Note\\n\\n..."
        >>> get_note_title(note)
        'My First Note'

        >>> # Note without heading (timestamp fallback)
        >>> note = Note(...)  # content: "Just some text"
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    try:
        # Use Note's title property (handles extraction logic)
        title = note.title

        # Truncate to 100 characters for RSS compatibility
        if len(title) > 100:
            title = title[:100].strip() + "..."

        return title

    except (FileNotFoundError, OSError, AttributeError):
        # If title extraction fails, use timestamp
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")


def clean_html_for_rss(html: str) -> str:
    """
    Ensure HTML is safe for RSS CDATA wrapping

    RSS readers expect HTML content wrapped in CDATA sections. The feedgen
    library handles CDATA wrapping automatically, but we need to ensure
    the HTML doesn't contain CDATA end markers that would break parsing.

    This function is primarily defensive - markdown-rendered HTML should
    not contain CDATA markers, but we check anyway.

    Args:
        html: Rendered HTML content from markdown

    Returns:
        Cleaned HTML safe for CDATA wrapping

    Examples:
        >>> html = "<p>Hello world</p>"
        >>> clean_html_for_rss(html)
        '<p>Hello world</p>'

        >>> # Edge case: HTML containing CDATA end marker
        >>> html = "<p>Example: ]]></p>"
        >>> clean_html_for_rss(html)
        '<p>Example: ]] ></p>'
    """
    # Check for CDATA end marker and add space to break it
    # This is extremely unlikely with markdown-rendered HTML but be safe
    if "]]>" in html:
        html = html.replace("]]>", "]] >")

    return html