"""
RSS feed generation for StarPunk

This module provides RSS 2.0 feed generation from published notes using the
feedgen library. Feeds include proper RFC-822 dates, HTML item descriptions,
and all required RSS elements.

Functions:
    generate_feed: Generate RSS 2.0 XML feed from notes
    format_rfc822_date: Format datetime to RFC-822 for RSS
    get_note_title: Extract title from note (first line or timestamp)
    clean_html_for_rss: Clean HTML for CDATA safety

Standards:
    - RSS 2.0 specification compliant
    - RFC-822 date format
    - Atom self-link for feed discovery
"""

# Standard library imports
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Optional

# Third-party imports
from feedgen.feed import FeedGenerator

# Local imports
from starpunk.models import Note

# Upper bound for an item title, *including* the truncation marker
_MAX_TITLE_LENGTH = 100
_TRUNCATION_MARKER = "..."


def _assume_utc_if_naive(dt: datetime) -> datetime:
    """Return ``dt`` unchanged if it has a tzinfo, otherwise tag it as UTC."""
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt


def generate_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Builds the channel metadata (title, link, description, language, Atom
    self-link, lastBuildDate) and one item per note. Items appear in the
    feed in the same order as they appear in ``notes``.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are removed before URLs are built
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel; when empty,
            site_name is used instead
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Returns:
        RSS 2.0 XML document as a str (decoded from feedgen's UTF-8 bytes,
        pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty or whitespace-only

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash so f"{site_url}{path}" never yields "//"
    site_url = site_url.rstrip("/")

    fg = FeedGenerator()

    # Channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    fg.lastBuildDate(datetime.now(timezone.utc))

    for note in notes[:limit]:
        # feedgen's add_entry() defaults to order="prepend", which would emit
        # the items in the reverse of the caller's order; append preserves it.
        fe = fg.add_entry(order="append")

        permalink = f"{site_url}{note.permalink}"

        # Required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # feedgen requires timezone-aware datetimes; naive values are
        # treated as UTC
        fe.pubDate(_assume_utc_if_naive(note.created_at))

        # NOTE(review): the original comment stated that feedgen wraps the
        # description in CDATA automatically - confirm against feedgen's RSS
        # serialization. The "]]>" neutralization is kept either way.
        fe.description(clean_html_for_rss(note.html))

    # Generate RSS 2.0 XML (pretty-printed)
    return fg.rss_str(pretty=True).decode("utf-8")


def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Day and month names are always English regardless of the process
    locale, as the RFC requires.

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # email.utils.format_datetime uses fixed English names, whereas
    # strftime("%a"/"%b") follows the current locale.
    return format_datetime(_assume_utc_if_naive(dt))


def get_note_title(note: Note) -> str:
    """
    Extract title from note content

    Uses the note's ``title`` attribute and falls back to a formatted
    creation timestamp when the title cannot be read or is empty.

    Algorithm:
        1. Try note.title
        2. Fall back to timestamp if title is unavailable or blank
        3. Truncate to 100 characters (ellipsis included) if longer

    Args:
        note: Note object

    Returns:
        Title string (max 100 chars, truncated with "..." if needed)

    Examples:
        >>> # Note with heading
        >>> note = Note(...)  # content: "# My First Note\\n\\n..."
        >>> get_note_title(note)
        'My First Note'

        >>> # Title unavailable (timestamp fallback)
        >>> note = Note(...)
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    try:
        title = note.title
    except (FileNotFoundError, OSError, AttributeError):
        # Title extraction failed; use the timestamp below
        title = None

    if not title or not title.strip():
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")

    if len(title) > _MAX_TITLE_LENGTH:
        # Reserve room for the marker so the result never exceeds the limit
        keep = _MAX_TITLE_LENGTH - len(_TRUNCATION_MARKER)
        title = title[:keep].strip() + _TRUNCATION_MARKER

    return title


def clean_html_for_rss(html: str) -> str:
    """
    Ensure HTML is safe for RSS CDATA wrapping

    A CDATA section is terminated by the sequence "]]>", so any occurrence
    inside the content would end the section early. Each occurrence is
    rewritten as "]] >" to break the marker.

    This function is primarily defensive - markdown-rendered HTML should
    not contain CDATA markers, but we check anyway.

    Args:
        html: Rendered HTML content from markdown; None or empty input
            yields an empty string

    Returns:
        Cleaned HTML safe for CDATA wrapping

    Examples:
        >>> html = "<p>Hello world</p>"
        >>> clean_html_for_rss(html)
        '<p>Hello world</p>'

        >>> # Edge case: HTML containing CDATA end marker
        >>> html = "<p>Example: ]]></p>"
        >>> clean_html_for_rss(html)
        '<p>Example: ]] ></p>'
    """
    if not html:
        return ""

    # str.replace is a no-op when the marker is absent
    return html.replace("]]>", "]] >")