# StarPunk/starpunk/feeds/rss.py

"""
RSS 2.0 feed generation for StarPunk

This module provides RSS 2.0 feed generation from published notes using the
feedgen library. Feeds include proper RFC-822 dates, CDATA-wrapped HTML
content, and all required RSS elements.

Functions:
    generate_rss: Generate RSS 2.0 XML feed from notes
    generate_rss_streaming: Memory-efficient streaming RSS generation
    format_rfc822_date: Format datetime to RFC-822 for RSS
    get_note_title: Extract title from note (first line or timestamp)
    clean_html_for_rss: Clean HTML for CDATA safety

Standards:
    - RSS 2.0 specification compliant
    - RFC-822 date format
    - Atom self-link for feed discovery
    - CDATA wrapping for HTML content
"""

# Standard library imports
import time
from datetime import datetime, timezone
from email.utils import format_datetime
from html import escape as html_escape
from typing import Optional

# Third-party imports
from feedgen.feed import FeedGenerator

# Local imports
from starpunk.models import Note
from starpunk.monitoring.business import track_feed_generated


def generate_rss(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate RSS 2.0 XML feed from published notes

    Builds the channel metadata and one item per note with feedgen, then
    post-processes the serialized XML to add Media RSS elements. Includes an
    Atom self-link for discovery.

    NOTE: For memory-efficient streaming, use generate_rss_streaming() instead.
    This function is kept for backwards compatibility and caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are removed before use
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel (falls back to
            site_name when empty)
        notes: List of Note objects to include (should be published only),
            expected newest first
        limit: Maximum number of items to include (default: 50)

    Returns:
        RSS 2.0 XML string (decoded from UTF-8, pretty-printed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_rss(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version='1.0' encoding='UTF-8'?>
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Create feed generator
    fg = FeedGenerator()

    # Set channel metadata (required elements)
    fg.id(site_url)
    fg.title(site_name)
    fg.link(href=site_url, rel="alternate")
    fg.description(site_description or site_name)
    fg.language("en")

    # Add self-link for feed discovery (Atom namespace)
    fg.link(href=f"{site_url}/feed.xml", rel="self", type="application/rss+xml")

    # Set last build date to now
    fg.lastBuildDate(datetime.now(timezone.utc))

    # Track feed generation timing
    start_time = time.time()

    # Slice once so the item loop, the Media RSS injection and the metrics
    # all agree on exactly which notes are in the feed.
    selected_notes = notes[:limit]

    # Notes from database are DESC but feedgen reverses them, so we reverse back
    for note in reversed(selected_notes):
        fe = fg.add_entry()

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Set required item elements
        fe.id(permalink)
        fe.title(get_note_title(note))
        fe.link(href=permalink)
        fe.guid(permalink, permalink=True)

        # Set publication date (naive datetimes are assumed to be UTC)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        fe.pubDate(pubdate)

        # Not every note object carries a media attribute
        media_items = getattr(note, "media", None)

        # Per Q24 and ADR-057: embed media as HTML at the top of the description
        html_parts = []
        if media_items:
            html_parts.append('<div class="media">')
            for item in media_items:
                # Escape attribute values: a caption or path containing quotes
                # or angle brackets must not break out of the <img> tag.
                src = html_escape(f"{site_url}/media/{item['path']}", quote=True)
                alt = html_escape(str(item.get("caption") or ""), quote=True)
                html_parts.append(f'<img src="{src}" alt="{alt}" />')
            html_parts.append("</div>")

        # Add text content below media
        html_parts.append(clean_html_for_rss(note.html))
        fe.description("".join(html_parts))

        # Add RSS enclosure element (first image only, per RSS 2.0 spec)
        if media_items:
            first_media = media_items[0]
            fe.enclosure(
                url=f"{site_url}/media/{first_media['path']}",
                length=str(first_media.get("size", 0)),
                type=first_media.get("mime_type", "image/jpeg"),
            )

    # Generate RSS 2.0 XML (pretty-printed)
    feed_xml = fg.rss_str(pretty=True).decode("utf-8")

    # Add Media RSS elements manually (feedgen's media extension has issues)
    feed_xml = _inject_media_rss_elements(feed_xml, site_url, selected_notes)

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format="rss",
        item_count=len(selected_notes),
        duration_ms=duration_ms,
        cached=False,
    )

    return feed_xml


def generate_rss_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate RSS 2.0 XML feed from published notes using streaming

    Memory-efficient alternative to generate_rss(): returns a generator that
    yields XML chunks instead of building the entire feed in memory.

    Yields XML in semantic chunks (channel metadata, individual items, closing tags)
    rather than character-by-character.

    Arguments are validated immediately when this function is called, not on
    first iteration, so an invalid configuration fails before a streaming
    response has started. Feed metrics are recorded once the generator has
    been fully consumed.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are removed before use
        site_name: Site title for RSS channel
        site_description: Site description for RSS channel (falls back to
            site_name when empty)
        notes: List of Note objects to include (should be published only),
            expected newest first
        limit: Maximum number of items to include (default: 50)

    Returns:
        Generator yielding XML chunks as strings

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_rss_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/rss+xml')
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    return _stream_rss_chunks(
        site_url.rstrip("/"),
        site_name,
        site_description,
        notes[:limit],
    )


def _stream_rss_chunks(site_url: str, site_name: str, site_description: str, notes: list[Note]):
    """Yield the RSS document chunk by chunk for already-validated arguments."""
    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Current timestamp for lastBuildDate
    last_build = format_rfc822_date(datetime.now(timezone.utc))

    # Yield XML declaration and opening RSS tag with Media RSS namespace
    yield '<?xml version="1.0" encoding="UTF-8"?>\n'
    yield '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/">\n'
    yield "  <channel>\n"

    # Yield channel metadata
    yield f"    <title>{_escape_xml(site_name)}</title>\n"
    yield f"    <link>{_escape_xml(site_url)}</link>\n"
    yield f"    <description>{_escape_xml(site_description or site_name)}</description>\n"
    yield "    <language>en</language>\n"
    yield f"    <lastBuildDate>{last_build}</lastBuildDate>\n"
    yield f'    <atom:link href="{_escape_xml(site_url)}/feed.xml" rel="self" type="application/rss+xml"/>\n'

    # Notes are emitted in the order given (expected newest first)
    for note in notes:
        item_count += 1

        permalink = _escape_xml(f"{site_url}{note.permalink}")
        title = _escape_xml(get_note_title(note))

        # Format publication date (naive datetimes are assumed to be UTC)
        pubdate = note.created_at
        if pubdate.tzinfo is None:
            pubdate = pubdate.replace(tzinfo=timezone.utc)
        pub_date_str = format_rfc822_date(pubdate)

        # Not every note object carries a media attribute
        media_items = getattr(note, "media", None)

        # Build HTML content with media at the top (per Q24 and ADR-057)
        html_parts = []
        if media_items:
            html_parts.append('<div class="media">')
            for item in media_items:
                # Escaping keeps captions/paths from breaking the <img> tag
                # and also prevents them from smuggling "]]>" into the CDATA.
                src = html_escape(f"{site_url}/media/{item['path']}", quote=True)
                alt = html_escape(str(item.get("caption") or ""), quote=True)
                html_parts.append(f'<img src="{src}" alt="{alt}" />')
            html_parts.append("</div>")
        html_parts.append(clean_html_for_rss(note.html))
        html_content = "".join(html_parts)

        item_lines = [
            "    <item>",
            f"      <title>{title}</title>",
            f"      <link>{permalink}</link>",
            f'      <guid isPermaLink="true">{permalink}</guid>',
            f"      <pubDate>{pub_date_str}</pubDate>",
        ]

        # Add enclosure element (first image only, per RSS 2.0 spec)
        if media_items:
            first_media = media_items[0]
            first_url = _escape_xml(f"{site_url}/media/{first_media['path']}")
            first_size = _escape_xml(str(first_media.get("size", 0)))
            first_type = _escape_xml(str(first_media.get("mime_type", "image/jpeg")))
            item_lines.append(
                f'      <enclosure url="{first_url}" length="{first_size}" type="{first_type}"/>'
            )

        # Add description with HTML content
        item_lines.append(f"      <description><![CDATA[{html_content}]]></description>")

        if media_items:
            # media:content for every image
            for media_item in media_items:
                media_url = _escape_xml(f"{site_url}/media/{media_item['path']}")
                media_type = _escape_xml(str(media_item.get("mime_type", "image/jpeg")))
                media_size = _escape_xml(str(media_item.get("size", 0)))
                item_lines.append(
                    f'      <media:content url="{media_url}" type="{media_type}" medium="image" fileSize="{media_size}"/>'
                )

            # media:thumbnail for the first image only
            item_lines.append(f'      <media:thumbnail url="{first_url}"/>')

        item_lines.append("    </item>")
        yield "\n".join(item_lines) + "\n"

    # Yield closing tags
    yield "  </channel>\n"
    yield "</rss>\n"

    # Track feed generation metrics (only reached when fully consumed)
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format="rss",
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False,
    )


def _inject_media_rss_elements(feed_xml: str, site_url: str, notes: list[Note]) -> str:
    """
    Inject Media RSS elements into generated RSS feed

    Adds the Media RSS namespace to the <rss> tag and, for every note with
    media, inserts media:content (one per media item) and media:thumbnail
    (first item) directly after that note's <enclosure> element. Works by
    string manipulation so the formatting produced by feedgen is preserved.

    Enclosures are located by the escaped URL of the note's first media item.
    The search advances past each insertion so that two notes sharing the
    same first media URL are matched to successive enclosures, provided
    `notes` is in the same order as the items in `feed_xml`; if nothing is
    found past the current position, the whole document is searched instead.

    Args:
        feed_xml: Generated RSS XML string
        site_url: Base site URL (no trailing slash)
        notes: Notes included in the feed, ideally in feed item order

    Returns:
        Modified RSS XML with Media RSS elements
    """
    media_ns = 'xmlns:media="http://search.yahoo.com/mrss/"'

    # Step 1: Add Media RSS namespace to <rss> tag
    # Handle both possible attribute orderings from feedgen
    if '<rss xmlns:atom' in feed_xml:
        feed_xml = feed_xml.replace('<rss xmlns:atom', f'<rss {media_ns} xmlns:atom', 1)
    elif '<rss version="2.0"' in feed_xml:
        feed_xml = feed_xml.replace('<rss version="2.0"', f'<rss version="2.0" {media_ns}', 1)
    else:
        # Fallback
        feed_xml = feed_xml.replace('<rss ', f'<rss {media_ns} ', 1)

    # Step 2: Inject media elements after each note's <enclosure> element
    search_from = 0
    for note in notes:
        media_items = getattr(note, 'media', None)
        if not media_items:
            continue

        # media:content for each item; attribute values are escaped so an
        # unusual mime type or path cannot produce malformed XML.
        media_elements = []
        for media_item in media_items:
            item_url = _escape_xml(f"{site_url}/media/{media_item['path']}")
            mime_type = _escape_xml(str(media_item.get('mime_type', 'image/jpeg')))
            file_size = _escape_xml(str(media_item.get('size', 0)))
            media_elements.append(
                f'<media:content url="{item_url}" type="{mime_type}" medium="image" fileSize="{file_size}"/>'
            )

        # media:thumbnail for the first item
        first_url = _escape_xml(f"{site_url}/media/{media_items[0]['path']}")
        media_elements.append(f'<media:thumbnail url="{first_url}"/>')

        # Locate this note's enclosure, preferring matches after the previous
        # insertion so duplicate URLs do not all resolve to the first item.
        enclosure_pattern = f'<enclosure url="{first_url}"'
        enclosure_pos = feed_xml.find(enclosure_pattern, search_from)
        if enclosure_pos == -1:
            enclosure_pos = feed_xml.find(enclosure_pattern)
        if enclosure_pos == -1:
            continue

        # Find the end of the (self-closing) enclosure tag
        enclosure_end = feed_xml.find('/>', enclosure_pos)
        if enclosure_end == -1:
            continue

        insertion_point = enclosure_end + 2
        media_xml = ''.join(media_elements)
        feed_xml = feed_xml[:insertion_point] + media_xml + feed_xml[insertion_point:]
        search_from = insertion_point + len(media_xml)

    return feed_xml


def _escape_xml(text: str) -> str:
    """
    Escape special XML characters for safe inclusion in XML elements

    Escapes the five predefined XML entities: &, <, >, ", '

    Args:
        text: Text to escape

    Returns:
        XML-safe text with escaped entities

    Examples:
        >>> _escape_xml("Hello & goodbye")
        'Hello &amp; goodbye'
        >>> _escape_xml('<tag>')
        '&lt;tag&gt;'
    """
    if not text:
        return ""

    # Escape in order: & first (to avoid double-escaping), then < > " '
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    text = text.replace("'", "&apos;")

    return text


def format_rfc822_date(dt: datetime) -> str:
    """
    Format datetime to RFC-822 format for RSS

    RSS 2.0 requires RFC-822 date format for pubDate and lastBuildDate.
    Format: "Mon, 18 Nov 2024 12:00:00 +0000"

    Day and month names are always English regardless of the process
    locale; strftime's %a/%b would follow LC_TIME and could emit names
    that are not valid in an RFC-822 date.

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC-822 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 18, 12, 0, 0)
        >>> format_rfc822_date(dt)
        'Mon, 18 Nov 2024 12:00:00 +0000'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # email.utils produces the fixed English RFC 2822 layout with a numeric
    # zone offset for timezone-aware datetimes.
    return format_datetime(dt)


def get_note_title(note: Note) -> str:
    """
    Return a title for the note suitable for an RSS <title> element

    Algorithm:
        1. Read note.title
        2. If reading it fails (FileNotFoundError, OSError, AttributeError),
           or the value is None, not a string, or blank, fall back to the
           note's creation timestamp
        3. Titles longer than 100 characters are cut to their first 100
           characters (trailing whitespace removed) and suffixed with "..."

    Args:
        note: Note object (must provide created_at; title is optional)

    Returns:
        Title string; at most 103 characters when truncation applies
        (100 characters plus the "..." suffix)

    Examples:
        >>> # Note whose title property yields 'My First Note'
        >>> get_note_title(note)
        'My First Note'

        >>> # Note whose title is missing or empty (timestamp fallback)
        >>> get_note_title(note)
        'November 18, 2024 at 12:00 PM'
    """
    try:
        # Keep the try body minimal: only the property access can hit the
        # file-backed errors handled here.
        title = note.title
    except (FileNotFoundError, OSError, AttributeError):
        title = None

    # A missing or blank title would otherwise produce an empty <title>
    # element (or a TypeError on len(None)); use the timestamp instead.
    if not isinstance(title, str) or not title.strip():
        return note.created_at.strftime("%B %d, %Y at %I:%M %p")

    # Truncate long titles for RSS compatibility
    if len(title) > 100:
        title = title[:100].strip() + "..."

    return title


def clean_html_for_rss(html: str) -> str:
    """
    Neutralize CDATA terminators in HTML destined for an RSS description

    A literal "]]>" inside the content would close a surrounding CDATA
    section early. Every occurrence is rewritten to "]] >" (a space is
    inserted before the ">"); content without the marker is returned
    unchanged.

    Args:
        html: Rendered HTML content

    Returns:
        HTML with every "]]>" replaced by "]] >"

    Examples:
        >>> clean_html_for_rss("<p>Hello world</p>")
        '<p>Hello world</p>'

        >>> clean_html_for_rss("<p>Example: ]]></p>")
        '<p>Example: ]] ></p>'
    """
    cdata_end = "]]>"
    broken_marker = "]] >"

    # Only rewrite when the terminator is actually present
    return html.replace(cdata_end, broken_marker) if cdata_end in html else html