StarPunk/starpunk/feeds/atom.py

"""
ATOM 1.0 feed generation for StarPunk

This module provides ATOM 1.0 feed generation from published notes. The XML
is assembled from explicitly escaped string chunks so that a feed can be
streamed; xml.etree.ElementTree is imported but not used by the functions below.

Functions:
    generate_atom: Generate ATOM 1.0 XML feed from notes
    generate_atom_streaming: Memory-efficient streaming ATOM generation

Standards:
    - ATOM 1.0 (RFC 4287) specification compliant
    - RFC 3339 date format
    - Proper XML namespacing
    - Escaped HTML and text content
"""

# Standard library imports
from datetime import datetime, timezone
from typing import Optional
import time
import xml.etree.ElementTree as ET

# Local imports
from starpunk.models import Note
from starpunk.monitoring.business import track_feed_generated


# ATOM 1.0 XML namespace URI; emitted as the default xmlns of the <feed> root
ATOM_NS = "http://www.w3.org/2005/Atom"


def generate_atom(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Build a complete ATOM 1.0 feed document as a single string

    Thin non-streaming wrapper: it drains generate_atom_streaming() and
    concatenates every chunk it yields. Prefer the streaming variant when
    the response can be sent incrementally; this form is convenient when
    the whole document is needed at once (e.g. for caching).

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed (subtitle)
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of entries to include (default: 50)

    Returns:
        The full ATOM 1.0 XML document as a str (its XML declaration
        names utf-8 as the encoding)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_atom(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version="1.0" encoding="utf-8"?>
    """
    # Exhaust the streaming generator, then glue its chunks together
    chunks = generate_atom_streaming(
        site_url=site_url,
        site_name=site_name,
        site_description=site_description,
        notes=notes,
        limit=limit,
    )
    feed_xml = ''.join(chunks)
    return feed_xml


def generate_atom_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate ATOM 1.0 XML feed from published notes using streaming

    Memory-efficient generator that yields XML chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ entries).

    Because this is a generator function, nothing runs until iteration
    starts: parameter validation happens on the first ``next()``, and the
    feed-generation metric is recorded only after the final chunk has been
    consumed.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are stripped before use
        site_name: Site title for feed
        site_description: Site description for feed; emitted as <subtitle>
            only when non-empty
        notes: List of Note objects to include (should be published only).
            Entries are emitted in the order given, so pass them newest first.
        limit: Maximum number of entries to include (default: 50)

    Yields:
        XML chunks as strings; concatenated they form the full document

    Raises:
        ValueError: If site_url or site_name is empty (raised when
            iteration begins)

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_atom_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/atom+xml')
    """
    # Function-scope import: only the embedded <img> markup needs HTML
    # attribute escaping, and this keeps the module's import block untouched.
    from html import escape as escape_html

    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Current timestamp for the feed-level <updated>
    now = datetime.now(timezone.utc)

    # XML declaration and feed opening with namespace
    yield '<?xml version="1.0" encoding="utf-8"?>\n'
    yield f'<feed xmlns="{ATOM_NS}">\n'

    # Feed metadata
    yield f'  <id>{_escape_xml(site_url)}/</id>\n'
    yield f'  <title>{_escape_xml(site_name)}</title>\n'
    yield f'  <updated>{_format_atom_date(now)}</updated>\n'

    # Links
    yield f'  <link rel="alternate" type="text/html" href="{_escape_xml(site_url)}"/>\n'
    yield f'  <link rel="self" type="application/atom+xml" href="{_escape_xml(site_url)}/feed.atom"/>\n'

    # Optional subtitle
    if site_description:
        yield f'  <subtitle>{_escape_xml(site_description)}</subtitle>\n'

    # Generator
    yield '  <generator uri="https://github.com/yourusername/starpunk">StarPunk</generator>\n'

    # Entries are emitted in input order (callers pass newest first)
    for note in notes[:limit]:
        item_count += 1

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Resolve each media item's absolute URL once; it is needed both for
        # the enclosure links and for the inline <img> tags further down.
        media = [
            (item, f"{site_url}/media/{item['path']}")
            for item in (getattr(note, 'media', None) or [])
        ]

        yield '  <entry>\n'

        # Required elements
        yield f'    <id>{_escape_xml(permalink)}</id>\n'
        yield f'    <title>{_escape_xml(note.title)}</title>\n'

        # Use created_at for both published and updated
        # (Note model doesn't have updated_at tracking yet)
        yield f'    <published>{_format_atom_date(note.created_at)}</published>\n'
        yield f'    <updated>{_format_atom_date(note.created_at)}</updated>\n'

        # Link to entry
        yield f'    <link rel="alternate" type="text/html" href="{_escape_xml(permalink)}"/>\n'

        # Category elements for tags (v1.3.1)
        for tag in getattr(note, 'tags', None) or []:
            yield f'    <category term="{_escape_xml(tag["name"])}" label="{_escape_xml(tag["display_name"])}"/>\n'

        # Media enclosures (v1.2.0 Phase 3, per Q24 and ADR-057)
        # Enhanced with title attribute for captions (v1.4.0 Phase 4)
        for item, media_url in media:
            mime_type = item.get('mime_type', 'image/jpeg')
            size = item.get('size', 0)
            caption = item.get('caption') or ''

            # Include title attribute only when there is a caption
            title_attr = f' title="{_escape_xml(caption)}"' if caption else ''

            yield f'    <link rel="enclosure" type="{_escape_xml(mime_type)}" href="{_escape_xml(media_url)}" length="{size}"{title_attr}/>\n'

        # Content - include media as HTML (per Q24)
        if note.html:
            html_parts = []

            # Media goes at the top of the HTML body when present
            if media:
                html_parts.append('<div class="media">')
                for item, media_url in media:
                    caption = item.get('caption') or ''
                    # Two escaping layers apply here: these values are first
                    # escaped as HTML attribute text, and the finished HTML
                    # is XML-escaped below. Without the HTML layer a caption
                    # containing a quote or angle bracket could break out of
                    # the attribute once a feed reader decodes the content.
                    html_parts.append(
                        f'<img src="{escape_html(media_url)}" alt="{escape_html(caption)}" />'
                    )
                html_parts.append('</div>')

            # Text content below media
            html_parts.append(note.html)

            # HTML content - XML-escaped as required for type="html"
            yield '    <content type="html">'
            yield _escape_xml(''.join(html_parts))
            yield '</content>\n'
        else:
            # Plain text content
            yield '    <content type="text">'
            yield _escape_xml(note.content)
            yield '</content>\n'

        yield '  </entry>\n'

    # Closing tag
    yield '</feed>\n'

    # Track feed generation metrics (reached only when fully consumed)
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='atom',
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False
    )


def _escape_xml(text: str) -> str:
    """
    Escape special XML characters for safe inclusion in XML elements

    Escapes the five predefined XML entities: &, <, >, ", '

    Args:
        text: Text to escape

    Returns:
        XML-safe text with escaped entities

    Examples:
        >>> _escape_xml("Hello & goodbye")
        'Hello &amp; goodbye'
        >>> _escape_xml('<p>HTML</p>')
        '&lt;p&gt;HTML&lt;/p&gt;'
    """
    if not text:
        return ""

    # Escape in order: & first (to avoid double-escaping), then < > " '
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace('"', "&quot;")
    text = text.replace("'", "&apos;")

    return text


def _format_atom_date(dt: datetime) -> str:
    """
    Format datetime to RFC 3339 format for ATOM

    ATOM 1.0 requires RFC 3339 date format for published and updated elements.
    RFC 3339 is a profile of ISO 8601.
    Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset)

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC 3339 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc)
        >>> _format_atom_date(dt)
        '2024-11-25T12:00:00Z'
    """
    # Ensure datetime has timezone (assume UTC if naive)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # Format to RFC 3339
    # Use 'Z' suffix for UTC, otherwise include offset
    if dt.tzinfo == timezone.utc:
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        # Format with timezone offset
        return dt.isoformat()