# StarPunk/starpunk/feed.py

"""
RSS feed generation for StarPunk

This module provides RSS 2.0 feed generation from published notes using the
feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
content, and all required RSS elements.

Functions:
    generate_feed: Generate RSS 2.0 XML feed from notes
    format_rfc822_date: Format datetime to RFC-822 for RSS
    get_note_title: Extract title from note (first line or timestamp)
    clean_html_for_rss: Clean HTML for CDATA safety

Standards:
    - RSS 2.0 specification compliant
    - RFC-822 date format
    - Atom self-link for feed discovery
    - CDATA wrapping for HTML content
"""

# Standard library imports
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Optional

# Third-party imports
from feedgen.feed import FeedGenerator

# Local imports
from starpunk.models import Note


def generate_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Sets the channel metadata (id, title, alternate link, description,
    language, self link, last build date) and then adds one item per note.
    Items are emitted in the same order as ``notes``, so callers should pass
    the notes in the order they want readers to see them.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are removed before URLs are built
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel; when empty,
            site_name is used as the description instead
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50); only the
            first ``limit`` notes are used

    Returns:
        RSS 2.0 XML string (pretty-printed, decoded from UTF-8 bytes)

    Raises:
        ValueError: If site_url or site_name is empty or only whitespace

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters before touching the feed library
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash so f"{site_url}{path}" never produces "//"
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now (timezone-aware UTC)
    fg.lastBuildDate(datetime.now(timezone.utc))

    # Add items (limit to configured maximum)
    for note in notes[:limit]:
        # feedgen's add_entry() defaults to order="prepend", which would
        # emit the items in the reverse of the input order. Appending keeps
        # the feed in the order the caller supplied.
        fe = fg.add_entry(order="append")

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date; a naive datetime is assumed to be UTC
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Set description to the note's rendered HTML.
        # NOTE(review): whether feedgen serializes this as a CDATA section
        # or as XML-escaped text is not visible from this module - confirm
        # against the generated output.
        html_content = clean_html_for_rss(note.html)
        fe.description(html_content)

    # Generate RSS 2.0 XML (pretty-printed); rss_str() returns bytes
    return fg.rss_str(pretty=True).decode("utf-8")


def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Day and month names are always the English abbreviations, regardless
    of the process locale.

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # strftime's %a / %b follow the active locale and would produce
    # non-English names under e.g. de_DE; format_datetime() always emits
    # the fixed English abbreviations with a numeric UTC offset.
    return format_datetime(dt)


def get_note_title(note: Note) -> str:
    """
    Extract title from note

    Reads the note's ``title`` attribute and uses it when it is a
    non-blank string. Otherwise falls back to a human-readable timestamp
    built from ``note.created_at``.

    Algorithm:
        1. Read note.title
        2. If reading it fails, or the value is missing/blank, return the
           formatted creation timestamp
        3. If the title is longer than 100 characters, cut it so that the
           result including the trailing "..." is at most 100 characters

    Args:
        note: Note object

    Returns:
        Title string (max 100 chars, ending in "..." when truncated)

    Examples:
        >>> # Note with a short title
        >>> note = Note(...)  # note.title == "My First Note"
        >>> get_note_title(note)
        'My First Note'

        >>> # Note whose title is unavailable (timestamp fallback)
        >>> note = Note(...)  # created_at == datetime(2024, 11, 18, 12, 0)
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    max_length = 100
    ellipsis = "..."

    try:
        # Only the attribute access is guarded: these are the errors the
        # title lookup is expected to raise when it cannot produce a title.
        title = note.title
    except (FileNotFoundError, OSError, AttributeError):
        title = None

    # A missing, non-string, or blank title would give an empty/invalid RSS
    # <title>, so use the creation timestamp instead.
    if not isinstance(title, str) or not title.strip():
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")

    # Reserve room for the ellipsis so the documented 100-char limit holds
    if len(title) > max_length:
        title = title[: max_length - len(ellipsis)].strip() + ellipsis

    return title


def clean_html_for_rss(html: str) -> str:
    """
    Neutralise CDATA end markers in rendered HTML

    A literal "]]>" inside a CDATA section terminates that section early.
    This function rewrites every occurrence as "]] >" so the returned text
    can be placed inside a CDATA section without closing it. Input that
    contains no such marker is returned unchanged.

    Args:
        html: Rendered HTML content

    Returns:
        The HTML with each "]]>" replaced by "]] >"

    Examples:
        >>> clean_html_for_rss("<p>Hello world</p>")
        '<p>Hello world</p>'

        >>> # Edge case: HTML containing CDATA end marker
        >>> clean_html_for_rss("<p>Example: ]]></p>")
        '<p>Example: ]] ></p>'
    """
    cdata_end = "]]>"

    # Common case: nothing to rewrite, hand the input straight back
    if cdata_end not in html:
        return html

    # Insert a space before ">" so the sequence no longer closes a CDATA block
    return html.replace(cdata_end, "]] >")