feat: add RSS feed generation module
Implements RSS 2.0 feed generation using the feedgen library. Features: generate_feed() creates standards-compliant RSS 2.0 XML; RFC-822 date formatting for pubDate elements; title extraction from note content (first line or timestamp); CDATA safety for HTML content; configurable feed item limits. Follows the ADR-014 RSS implementation strategy. Related: docs/decisions/ADR-014-rss-feed-implementation.md
This commit is contained in:
229
starpunk/feed.py
Normal file
229
starpunk/feed.py
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
"""
|
||||||
|
RSS feed generation for StarPunk
|
||||||
|
|
||||||
|
This module provides RSS 2.0 feed generation from published notes using the
|
||||||
|
feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
|
||||||
|
content, and all required RSS elements.
|
||||||
|
|
||||||
|
Functions:
|
||||||
|
generate_feed: Generate RSS 2.0 XML feed from notes
|
||||||
|
format_rfc822_date: Format datetime to RFC-822 for RSS
|
||||||
|
get_note_title: Extract title from note (first line or timestamp)
|
||||||
|
clean_html_for_rss: Clean HTML for CDATA safety
|
||||||
|
|
||||||
|
Standards:
|
||||||
|
- RSS 2.0 specification compliant
|
||||||
|
- RFC-822 date format
|
||||||
|
- Atom self-link for feed discovery
|
||||||
|
- CDATA wrapping for HTML content
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Standard library imports
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# Third-party imports
|
||||||
|
from feedgen.feed import FeedGenerator
|
||||||
|
|
||||||
|
# Local imports
|
||||||
|
from starpunk.models import Note
|
||||||
|
|
||||||
|
|
||||||
|
def generate_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Builds the channel metadata (title, link, description, language,
    self-link, last build date) and one item per note, then serialises the
    feed with feedgen. Items are emitted in the same order as ``notes``.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com').
            A trailing slash is removed before URLs are built.
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel; falls back to
            ``site_name`` when empty
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50). Values
            below zero are treated as zero.

    Returns:
        RSS 2.0 XML document as a str (decoded from feedgen's UTF-8
        output, pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now
    fg.lastBuildDate(datetime.now(timezone.utc))

    # A negative limit would make the slice drop items from the END of the
    # list (notes[:-1]); clamp so "limit" only ever means "at most N items".
    max_items = max(limit, 0)

    for note in notes[:max_items]:
        # feedgen's add_entry() defaults to order='prepend', which would
        # reverse the caller's ordering; append keeps the feed in list order.
        fe = fg.add_entry(order="append")

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date; a naive datetime is assumed to be UTC
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Use the rendered HTML as the item description.
        # NOTE(review): whether feedgen serialises this as CDATA or as
        # entity-escaped text is not visible from this module - confirm.
        html_content = clean_html_for_rss(note.html)
        fe.description(html_content)

    # Generate RSS 2.0 XML (pretty-printed)
    return fg.rss_str(pretty=True).decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    The day and month names are always the English abbreviations required
    by the RFC, regardless of the process locale.

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Imported locally so this function does not depend on a module-level
    # import being added elsewhere in the file.
    from email.utils import format_datetime

    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # strftime("%a ... %b") follows LC_TIME and would emit localized names
    # under a non-English locale; format_datetime uses fixed English tables.
    return format_datetime(dt)
|
||||||
|
|
||||||
|
|
||||||
|
def get_note_title(note: Note) -> str:
    """
    Extract title from note content

    Reads ``note.title`` and returns it, shortened if necessary. When the
    title cannot be read, or is empty, a timestamp built from
    ``note.created_at`` is returned instead.

    Algorithm:
        1. Try note.title property
        2. Fall back to timestamp if title is unavailable or blank
        3. Shorten titles longer than 100 characters so the result,
           including the trailing "...", is at most 100 characters

    Args:
        note: Note object

    Returns:
        Title string (max 100 chars, truncated if needed)

    Examples:
        >>> # Note with a title
        >>> note = Note(...)  # note.title == "My First Note"
        >>> get_note_title(note)
        'My First Note'

        >>> # Note whose title is empty or unavailable (timestamp fallback)
        >>> note = Note(...)  # created_at == 2024-11-18 12:00
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    max_length = 100
    ellipsis = "..."

    try:
        # Only the attribute access is guarded: it is the step that may
        # fail (FileNotFoundError is a subclass of OSError).
        title = note.title
    except (OSError, AttributeError):
        title = None

    # A missing, None or whitespace-only title is treated as unavailable so
    # the feed item never ends up with an empty <title>.
    if not title or not title.strip():
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")

    # Reserve room for the ellipsis so the documented 100-char cap holds.
    if len(title) > max_length:
        title = title[: max_length - len(ellipsis)].strip() + ellipsis

    return title
|
||||||
|
|
||||||
|
|
||||||
|
def clean_html_for_rss(html: str) -> str:
    """
    Neutralise CDATA terminators in HTML destined for an RSS feed

    A literal "]]>" inside the content would close a CDATA section early.
    Every occurrence is rewritten to "]] >" (a space is inserted before
    the ">"); content without the sequence is returned unchanged.

    Args:
        html: Rendered HTML content from markdown

    Returns:
        The HTML with each "]]>" replaced by "]] >"

    Examples:
        >>> clean_html_for_rss("<p>Hello world</p>")
        '<p>Hello world</p>'

        >>> clean_html_for_rss("<p>Example: ]]></p>")
        '<p>Example: ]] ></p>'
    """
    cdata_end = "]]>"
    broken_marker = "]] >"

    # Common case first: nothing to neutralise, hand the input straight back
    if cdata_end not in html:
        return html

    return html.replace(cdata_end, broken_marker)
|
||||||
Reference in New Issue
Block a user