StarPunk/starpunk/feed.py
Phil Skentelbery 07fff01fab feat: Complete v1.1.1 Phases 2 & 3 - Enhancements and Polish
Phase 2 - Enhancements:
- Add performance monitoring infrastructure with MetricsBuffer
- Implement three-tier health checks (/health, /health?detailed, /admin/health)
- Enhance search with FTS5 fallback and XSS-safe highlighting
- Add Unicode slug generation with timestamp fallback
- Expose database pool statistics via /admin/metrics
- Create missing error templates (400, 401, 403, 405, 503)

Phase 3 - Polish:
- Implement RSS streaming optimization (memory O(n) → O(1))
- Add admin metrics dashboard with htmx and Chart.js
- Fix flaky migration race condition tests
- Create comprehensive operational documentation
- Add upgrade guide and troubleshooting guide

Testing: 632 tests passing, zero flaky tests
Documentation: Complete operational guides
Security: All security reviews passed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 20:10:41 -07:00


"""
RSS feed generation for StarPunk
This module provides RSS 2.0 feed generation from published notes using the
feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
content, and all required RSS elements.
Functions:
generate_feed: Generate RSS 2.0 XML feed from notes
format_rfc822_date: Format datetime to RFC-822 for RSS
get_note_title: Extract title from note (first line or timestamp)
clean_html_for_rss: Clean HTML for CDATA safety
Standards:
- RSS 2.0 specification compliant
- RFC-822 date format
- Atom self-link for feed discovery
- CDATA wrapping for HTML content
"""

# Standard library imports
from datetime import datetime, timezone
from typing import Optional

# Third-party imports
from feedgen.feed import FeedGenerator

# Local imports
from starpunk.models import Note


def generate_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Creates a standards-compliant RSS 2.0 feed with proper channel metadata
    and item entries for each note. Includes Atom self-link for discovery.

    NOTE: For memory-efficient streaming, use generate_feed_streaming() instead.
    This function is kept for backwards compatibility and caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Returns:
        RSS 2.0 XML string (UTF-8 encoded, pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")
    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now
    fg.lastBuildDate(datetime.now(timezone.utc))

    # Add items (limit to configured maximum, newest first)
    # Notes from database are DESC but feedgen reverses them, so we reverse back
    for note in reversed(notes[:limit]):
        # Create feed entry
        fe = fg.add_entry()

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date (ensure UTC timezone)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            # If naive datetime, assume UTC
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Set description with HTML content in CDATA
        # feedgen automatically wraps content in CDATA for RSS
        html_content = clean_html_for_rss(note.html)
        fe.description(html_content)

    # Generate RSS 2.0 XML (pretty-printed)
    return fg.rss_str(pretty=True).decode("utf-8")
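

# Illustrative sketch (not part of this module): one way the buffered generator
# above could back the "caching use case" mentioned in its docstring. The route
# function, config keys, cache object, and the import path for list_notes are
# assumptions for illustration, not StarPunk's actual wiring.
#
#     from flask import Response, current_app
#
#     def feed_route():
#         cached = cache.get("feed_xml")  # hypothetical cache object
#         if cached is None:
#             notes = list_notes(published_only=True, limit=50)
#             cached = generate_feed(
#                 site_url=current_app.config["SITE_URL"],
#                 site_name=current_app.config["SITE_NAME"],
#                 site_description=current_app.config["SITE_DESCRIPTION"],
#                 notes=notes,
#             )
#             cache.set("feed_xml", cached, timeout=300)
#         return Response(cached, mimetype="application/rss+xml")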


def generate_feed_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate RSS 2.0 XML feed from published notes using streaming

    Memory-efficient generator that yields XML chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ items).

    Yields XML in semantic chunks (channel metadata, individual items,
    closing tags) rather than character-by-character for optimal performance.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Yields:
        XML chunks as strings (UTF-8)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_feed_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> response = Response(generator, mimetype='application/rss+xml')
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")
    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Current timestamp for lastBuildDate
    now = datetime.now(timezone.utc)
    last_build = format_rfc822_date(now)

    # Yield XML declaration and opening RSS tag
    yield '<?xml version="1.0" encoding="UTF-8"?>\n'
    yield '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">\n'
    yield "  <channel>\n"

    # Yield channel metadata
    yield f"    <title>{_escape_xml(site_name)}</title>\n"
    yield f"    <link>{_escape_xml(site_url)}</link>\n"
    yield f"    <description>{_escape_xml(site_description or site_name)}</description>\n"
    yield "    <language>en</language>\n"
    yield f"    <lastBuildDate>{last_build}</lastBuildDate>\n"
    yield f'    <atom:link href="{_escape_xml(site_url)}/feed.xml" rel="self" type="application/rss+xml"/>\n'
    # Yield items (newest first)
    # Notes from the database are already newest-first (DESC) and are emitted
    # in iteration order, so no reversal is needed here (unlike the
    # feedgen-based generate_feed above).
    for note in notes[:limit]:
        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Get note title
        title = get_note_title(note)

        # Format publication date
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        pub_date_str = format_rfc822_date(pubdate)

        # Get HTML content
        html_content = clean_html_for_rss(note.html)

        # Yield complete item as a single chunk
        item_xml = f"""    <item>
      <title>{_escape_xml(title)}</title>
      <link>{_escape_xml(permalink)}</link>
      <guid isPermaLink="true">{_escape_xml(permalink)}</guid>
      <pubDate>{pub_date_str}</pubDate>
      <description><![CDATA[{html_content}]]></description>
    </item>
"""
        yield item_xml

    # Yield closing tags
    yield "  </channel>\n"
    yield "</rss>\n"


def _escape_xml(text: str) -> str:
    """
    Escape special XML characters for safe inclusion in XML elements

    Escapes the five predefined XML entities: &, <, >, ", '

    Args:
        text: Text to escape

    Returns:
        XML-safe text with escaped entities

    Examples:
        >>> _escape_xml("Hello & goodbye")
        'Hello &amp; goodbye'
        >>> _escape_xml('<tag>')
        '&lt;tag&gt;'
    """
    if not text:
        return ""

    # Escape in order: & first (to avoid double-escaping), then < > " '
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    text = text.replace("'", "&apos;")

    return text
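

# Note (illustrative aside): the standard library offers equivalent escaping via
# xml.sax.saxutils.escape(text, {'"': "&quot;", "'": "&apos;"}), which handles
# &, <, > plus the two quote entities. The explicit replace() chain above
# achieves the same result without an extra import.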


def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # Format to RFC-822:
    # %a = weekday, %d = day, %b = month, %Y = year,
    # %H:%M:%S = time, %z = timezone offset
    return dt.strftime("%a, %d %b %Y %H:%M:%S %z")


def get_note_title(note: Note) -> str:
    """
    Extract title from note content

    Attempts to extract a meaningful title from the note. Uses the first
    line of content (stripped of markdown heading syntax) or falls back
    to a formatted timestamp if content is unavailable.

    Algorithm:
        1. Try note.title property (first line, stripped of # syntax)
        2. Fall back to timestamp if title is unavailable

    Args:
        note: Note object

    Returns:
        Title string (max 100 chars, truncated if needed)

    Examples:
        >>> # Note with heading
        >>> note = Note(...)  # content: "# My First Note\\n\\n..."
        >>> get_note_title(note)
        'My First Note'

        >>> # Note without heading (timestamp fallback)
        >>> note = Note(...)  # content: "Just some text"
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    try:
        # Use Note's title property (handles extraction logic)
        title = note.title

        # Truncate to 100 characters for RSS compatibility
        if len(title) > 100:
            title = title[:100].strip() + "..."

        return title
    except (FileNotFoundError, OSError, AttributeError):
        # If title extraction fails, use timestamp
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")


def clean_html_for_rss(html: str) -> str:
    """
    Ensure HTML is safe for RSS CDATA wrapping

    RSS readers expect HTML content wrapped in CDATA sections. The feedgen
    library handles CDATA wrapping automatically, but we need to ensure
    the HTML doesn't contain CDATA end markers that would break parsing.

    This function is primarily defensive - markdown-rendered HTML should
    not contain CDATA markers, but we check anyway.

    Args:
        html: Rendered HTML content from markdown

    Returns:
        Cleaned HTML safe for CDATA wrapping

    Examples:
        >>> html = "<p>Hello world</p>"
        >>> clean_html_for_rss(html)
        '<p>Hello world</p>'

        >>> # Edge case: HTML containing CDATA end marker
        >>> html = "<p>Example: ]]></p>"
        >>> clean_html_for_rss(html)
        '<p>Example: ]] ></p>'
    """
    # Check for CDATA end marker and add space to break it
    # This is extremely unlikely with markdown-rendered HTML but be safe
    if "]]>" in html:
        html = html.replace("]]>", "]] >")

    return html