## Added - Feed Media Enhancement with Media RSS namespace support - RSS enclosure, media:content, media:thumbnail elements - JSON Feed image field for first image - ADR-059: Full feed media standardization roadmap ## Fixed - Media display on homepage (was only showing on note pages) - Responsive image sizing with CSS constraints - Caption display (now alt text only, not visible) - Logging correlation ID crash in non-request contexts ## Documentation - Feed media design documents and implementation reports - Media display fixes design and validation reports - Updated ROADMAP with v1.3.0/v1.4.0 media plans 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
546 lines
18 KiB
Python
546 lines
18 KiB
Python
"""
|
|
RSS 2.0 feed generation for StarPunk
|
|
|
|
This module provides RSS 2.0 feed generation from published notes using the
|
|
feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
|
|
content, and all required RSS elements.
|
|
|
|
Functions:
|
|
generate_rss: Generate RSS 2.0 XML feed from notes
|
|
generate_rss_streaming: Memory-efficient streaming RSS generation
|
|
format_rfc822_date: Format datetime to RFC-822 for RSS
|
|
get_note_title: Extract title from note (first line or timestamp)
|
|
clean_html_for_rss: Clean HTML for CDATA safety
|
|
|
|
Standards:
|
|
- RSS 2.0 specification compliant
|
|
- RFC-822 date format
|
|
- Atom self-link for feed discovery
|
|
- CDATA wrapping for HTML content
|
|
"""
|
|
|
|
# Standard library imports
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
import time
|
|
|
|
# Third-party imports
|
|
from feedgen.feed import FeedGenerator
|
|
|
|
# Local imports
|
|
from starpunk.models import Note
|
|
from starpunk.monitoring.business import track_feed_generated
|
|
|
|
|
|
def generate_rss(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Builds the whole feed in memory with feedgen: channel metadata, an Atom
    self-link pointing at ``{site_url}/feed.xml``, and one item per note.
    Attached media is embedded as ``<img>`` tags at the top of each item's
    description, the first media item is also emitted as an RSS enclosure,
    and Media RSS elements are injected into the serialized XML afterwards.

    NOTE: For memory-efficient streaming, use generate_rss_streaming() instead.
    This function is kept for backwards compatibility and caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            a trailing slash is removed
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel (site_name is
            used when this is empty)
        notes: List of Note objects to include (should be published only).
            Each note is read for ``permalink``, ``created_at``, ``html`` and,
            when present, ``media`` (a sequence of dicts with a ``path`` key
            and optional ``caption``, ``size`` and ``mime_type`` keys)
        limit: Maximum number of items to include (default: 50)

    Returns:
        RSS 2.0 XML string (decoded from UTF-8, pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_rss(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Function-scope import: only this block needs HTML attribute escaping
    from html import escape as html_escape

    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now
    fg.lastBuildDate(datetime.now(timezone.utc))

    # Track feed generation timing
    start_time = time.time()

    # Slice once so the items, the Media RSS injection and the metric agree
    selected_notes = notes[:limit]

    # Add items (limit to configured maximum, newest first)
    # Notes from database are DESC but feedgen reverses them, so we reverse back
    for note in reversed(selected_notes):
        fe = fg.add_entry()

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date (naive datetimes are assumed to be UTC)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Notes without a media attribute (or with an empty one) get no media
        media_items = getattr(note, "media", None) or []

        # Per Q24 and ADR-057: embed media as HTML at the top of the description
        html_parts = []
        if media_items:
            html_parts.append('<div class="media">')
            for item in media_items:
                media_url = f"{site_url}/media/{item['path']}"
                # A missing or None caption becomes an empty alt attribute
                caption = item.get("caption") or ""
                # Escape both values: a quote or angle bracket in a caption or
                # path must not break out of the attribute
                src_attr = html_escape(media_url, quote=True)
                alt_attr = html_escape(str(caption), quote=True)
                html_parts.append(f'<img src="{src_attr}" alt="{alt_attr}" />')
            html_parts.append("</div>")

        # Add text content below media
        html_parts.append(clean_html_for_rss(note.html))

        # feedgen is responsible for XML-encoding the description on output
        fe.description("".join(html_parts))

        # Add RSS enclosure element (first media item only)
        if media_items:
            first_media = media_items[0]
            fe.enclosure(
                url=f"{site_url}/media/{first_media['path']}",
                length=str(first_media.get("size", 0)),
                type=first_media.get("mime_type", "image/jpeg"),
            )

    # Generate RSS 2.0 XML (pretty-printed)
    feed_xml = fg.rss_str(pretty=True).decode("utf-8")

    # Add Media RSS elements manually (feedgen's media extension has issues):
    # media:content and media:thumbnail are injected after each enclosure
    feed_xml = _inject_media_rss_elements(feed_xml, site_url, selected_notes)

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='rss',
        item_count=len(selected_notes),
        duration_ms=duration_ms,
        cached=False
    )

    return feed_xml
|
|
|
|
|
|
def generate_rss_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate RSS 2.0 XML feed from published notes using streaming

    Memory-efficient generator that yields XML chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ items).

    Yields XML in semantic chunks (channel metadata, individual items, closing
    tags) rather than character-by-character.

    Because this is a generator function, nothing runs until the first chunk
    is requested: parameter validation happens on first iteration, and the
    feed-generation metric is recorded only after the final chunk has been
    consumed.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            a trailing slash is removed
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel (site_name is
            used when this is empty)
        notes: List of Note objects to include (should be published only).
            Each note is read for ``permalink``, ``created_at``, ``html`` and,
            when present, ``media`` (a sequence of dicts with a ``path`` key
            and optional ``caption``, ``size`` and ``mime_type`` keys)
        limit: Maximum number of items to include (default: 50)

    Yields:
        XML chunks as strings

    Raises:
        ValueError: If site_url or site_name is empty (raised when the
            generator is first advanced)

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_rss_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/rss+xml')
    """
    # Function-scope import: only this block needs HTML attribute escaping
    from html import escape as html_escape

    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Current timestamp for lastBuildDate
    last_build = format_rfc822_date(datetime.now(timezone.utc))

    # Yield XML declaration and opening RSS tag with Media RSS namespace
    yield '<?xml version="1.0" encoding="UTF-8"?>\n'
    yield '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/">\n'
    yield "  <channel>\n"

    # Yield channel metadata
    yield f"    <title>{_escape_xml(site_name)}</title>\n"
    yield f"    <link>{_escape_xml(site_url)}</link>\n"
    yield f"    <description>{_escape_xml(site_description or site_name)}</description>\n"
    yield "    <language>en</language>\n"
    yield f"    <lastBuildDate>{last_build}</lastBuildDate>\n"
    yield f'    <atom:link href="{_escape_xml(site_url)}/feed.xml" rel="self" type="application/rss+xml"/>\n'

    # Yield items (newest first)
    # Notes from database are already in DESC order (newest first)
    for note in notes[:limit]:
        item_count += 1

        permalink = _escape_xml(f"{site_url}{note.permalink}")
        title = get_note_title(note)

        # Format publication date (naive datetimes are assumed to be UTC)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        pub_date_str = format_rfc822_date(pubdate)

        # Notes without a media attribute (or with an empty one) get no media
        media_items = getattr(note, "media", None) or []
        media_urls = [f"{site_url}/media/{item['path']}" for item in media_items]

        # Build HTML content with media at the top (per Q24 and ADR-057)
        html_parts = []
        if media_items:
            html_parts.append('<div class="media">')
            for item, media_url in zip(media_items, media_urls):
                # A missing or None caption becomes an empty alt attribute
                caption = item.get("caption") or ""
                # Escaping keeps quotes/brackets inside the attribute and also
                # prevents a caption from smuggling "]]>" into the CDATA block
                src_attr = html_escape(media_url, quote=True)
                alt_attr = html_escape(str(caption), quote=True)
                html_parts.append(f'<img src="{src_attr}" alt="{alt_attr}" />')
            html_parts.append("</div>")

        # Add text content below media
        html_parts.append(clean_html_for_rss(note.html))
        html_content = "".join(html_parts)

        # Build item XML line by line
        item_lines = [
            "    <item>",
            f"      <title>{_escape_xml(title)}</title>",
            f"      <link>{permalink}</link>",
            f'      <guid isPermaLink="true">{permalink}</guid>',
            f"      <pubDate>{pub_date_str}</pubDate>",
        ]

        # Add enclosure element (first media item only)
        if media_items:
            first_media = media_items[0]
            first_url = _escape_xml(media_urls[0])
            first_size = first_media.get("size", 0)
            first_type = _escape_xml(str(first_media.get("mime_type", "image/jpeg")))
            item_lines.append(
                f'      <enclosure url="{first_url}" length="{first_size}" type="{first_type}"/>'
            )

        # Add description with HTML content
        item_lines.append(f"      <description><![CDATA[{html_content}]]></description>")

        if media_items:
            # Add media:content elements (all media items)
            for item, media_url in zip(media_items, media_urls):
                content_url = _escape_xml(media_url)
                content_type = _escape_xml(str(item.get("mime_type", "image/jpeg")))
                content_size = item.get("size", 0)
                item_lines.append(
                    f'      <media:content url="{content_url}" type="{content_type}" medium="image" fileSize="{content_size}"/>'
                )

            # Add media:thumbnail (first media item only)
            item_lines.append(
                f'      <media:thumbnail url="{_escape_xml(media_urls[0])}"/>'
            )

        # Close item
        item_lines.append("    </item>")

        yield "\n".join(item_lines) + "\n"

    # Yield closing tags
    yield "  </channel>\n"
    yield "</rss>\n"

    # Track feed generation metrics (reached only when fully consumed)
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='rss',
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False
    )
|
|
|
|
|
|
def _inject_media_rss_elements(feed_xml: str, site_url: str, notes: list[Note]) -> str:
    """
    Inject Media RSS elements into generated RSS feed

    Adds media:content and media:thumbnail elements for notes with media using
    string manipulation on the already-serialized feed. Each note's elements
    are inserted directly after the ``<enclosure>`` tag whose URL is that
    note's first media item.

    Enclosures are matched by URL, not by position, so the order of ``notes``
    does not need to match the order of items in the feed. When several notes
    share the same first media URL, each note is attached to the next matching
    enclosure that has not yet received media elements.

    Args:
        feed_xml: Generated RSS XML string
        site_url: Base site URL (no trailing slash)
        notes: Notes included in the feed; notes without a non-empty
            ``media`` attribute are skipped

    Returns:
        Modified RSS XML with Media RSS elements
    """
    media_namespace = 'xmlns:media="http://search.yahoo.com/mrss/"'

    # Step 1: Add Media RSS namespace to <rss> tag.
    # Only the opening tag is inspected, so note text that merely mentions
    # "xmlns:media=" cannot suppress the declaration.
    rss_start = feed_xml.find("<rss")
    rss_end = feed_xml.find(">", rss_start) if rss_start != -1 else -1
    rss_open_tag = feed_xml[rss_start:rss_end] if rss_end != -1 else ""

    if "xmlns:media=" not in rss_open_tag:
        # Handle both possible attribute orderings from feedgen
        if '<rss xmlns:atom' in feed_xml:
            feed_xml = feed_xml.replace(
                '<rss xmlns:atom',
                f'<rss {media_namespace} xmlns:atom',
                1,  # Only replace first occurrence
            )
        elif '<rss version="2.0"' in feed_xml:
            feed_xml = feed_xml.replace(
                '<rss version="2.0"',
                f'<rss version="2.0" {media_namespace}',
                1,
            )
        else:
            # Fallback: put the declaration first on whatever <rss ...> tag exists
            feed_xml = feed_xml.replace('<rss ', f'<rss {media_namespace} ', 1)

    # Step 2: Inject media elements for each note with media
    for note in notes:
        media_items = getattr(note, "media", None)
        if not media_items:
            continue

        # media:content for every media item
        media_elements = []
        for media_item in media_items:
            content_url = _escape_xml(f"{site_url}/media/{media_item['path']}")
            mime_type = _escape_xml(str(media_item.get('mime_type', 'image/jpeg')))
            file_size = media_item.get('size', 0)
            media_elements.append(
                f'<media:content url="{content_url}" type="{mime_type}" medium="image" fileSize="{file_size}"/>'
            )

        # media:thumbnail for the first media item
        first_url = _escape_xml(f"{site_url}/media/{media_items[0]['path']}")
        media_elements.append(f'<media:thumbnail url="{first_url}"/>')
        media_xml = ''.join(media_elements)

        # The enclosure for this note carries the first media item's URL
        enclosure_pattern = f'<enclosure url="{first_url}"'

        position = feed_xml.find(enclosure_pattern)
        while position != -1:
            enclosure_end = feed_xml.find('/>', position)
            if enclosure_end == -1:
                break

            insertion_point = enclosure_end + 2
            # An enclosure already followed by <media:content was handled for
            # an earlier note with the same first image; move on to the next
            if not feed_xml.startswith('<media:content', insertion_point):
                feed_xml = feed_xml[:insertion_point] + media_xml + feed_xml[insertion_point:]
                break

            position = feed_xml.find(enclosure_pattern, insertion_point)

    return feed_xml
|
|
|
|
|
|
# Translation table for the five predefined XML entities. Using a single
# translate() pass means "&" produced by one replacement is never re-escaped.
_XML_ESCAPE_TABLE = str.maketrans({
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
})


def _escape_xml(text: str) -> str:
    """
    Escape special XML characters for safe inclusion in XML elements

    Escapes the five predefined XML entities: & < > " '
    (as &amp; &lt; &gt; &quot; &apos;). The result is safe to place in
    element text or inside a quoted attribute value.

    Args:
        text: Text to escape; None or an empty string yields ""

    Returns:
        XML-safe text with escaped entities

    Examples:
        >>> _escape_xml("Hello & goodbye")
        'Hello &amp; goodbye'
        >>> _escape_xml('<tag>')
        '&lt;tag&gt;'
    """
    if not text:
        return ""

    return text.translate(_XML_ESCAPE_TABLE)
|
|
|
|
|
|
def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Day and month names are always English regardless of the process locale,
    as the RFC requires.

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string with a numeric UTC offset

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Function-scope import keeps this block self-contained
    from email.utils import format_datetime

    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # format_datetime uses fixed English names, unlike strftime("%a ... %b"),
    # whose output follows the current locale
    return format_datetime(dt)
|
|
|
|
|
|
def get_note_title(note: "Note") -> str:
    """
    Extract title from note content

    Uses the note's ``title`` attribute when it yields usable text and falls
    back to a formatted timestamp of ``note.created_at`` otherwise.

    Algorithm:
        1. Try note.title
        2. Fall back to the timestamp if reading the title raises
           FileNotFoundError, OSError or AttributeError, or if the title is
           not a string or is blank
        3. Truncate long titles

    Args:
        note: Note object (must provide ``created_at`` for the fallback)

    Returns:
        Title string. Titles longer than 100 characters are cut to their
        first 100 characters (stripped) with "..." appended.

    Examples:
        >>> # Note with heading
        >>> note = Note(...)  # content: "# My First Note\\n\\n..."
        >>> get_note_title(note)
        'My First Note'

        >>> # Note whose title is unavailable (timestamp fallback)
        >>> note = Note(...)
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    try:
        # Use Note's title property (handles extraction logic)
        title = note.title
    except (FileNotFoundError, OSError, AttributeError):
        title = None

    # A missing, non-string or whitespace-only title would otherwise produce
    # a TypeError or an empty <title>; use the timestamp instead
    if not isinstance(title, str) or not title.strip():
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")

    # Truncate to 100 characters (plus ellipsis) for RSS compatibility
    if len(title) > 100:
        title = title[:100].strip() + "..."

    return title
|
|
|
|
|
|
def clean_html_for_rss(html: str) -> str:
    """
    Ensure HTML is safe for RSS CDATA wrapping

    A CDATA section ends at the first "]]>", so that sequence must not appear
    inside content that will be wrapped in CDATA. Every occurrence is
    rewritten to "]] >" (a space is inserted), which renders the same in HTML
    but no longer terminates the section.

    This function is primarily defensive - markdown-rendered HTML should
    not contain CDATA markers, but we check anyway.

    Args:
        html: Rendered HTML content from markdown; None or an empty string
            (e.g. a note with no rendered text) yields ""

    Returns:
        Cleaned HTML safe for CDATA wrapping

    Examples:
        >>> html = "<p>Hello world</p>"
        >>> clean_html_for_rss(html)
        '<p>Hello world</p>'

        >>> # Edge case: HTML containing CDATA end marker
        >>> html = "<p>Example: ]]></p>"
        >>> clean_html_for_rss(html)
        '<p>Example: ]] ></p>'
    """
    # Missing content must not abort generation of the whole feed
    if not html:
        return ""

    # replace() is a no-op when the marker is absent, so no pre-check is needed
    return html.replace("]]>", "]] >")
|