# StarPunk/starpunk/feeds/json_feed.py

"""
JSON Feed 1.1 generation for StarPunk

This module provides JSON Feed 1.1 generation from published notes using
Python's standard library json module for proper JSON serialization.

Functions:
    generate_json_feed: Generate JSON Feed 1.1 from notes
    generate_json_feed_streaming: Memory-efficient streaming JSON generation

Standards:
    - JSON Feed 1.1 specification compliant
    - RFC 3339 date format
    - Proper JSON encoding
    - UTF-8 output
"""

# Standard library imports
from datetime import datetime, timezone
from typing import Optional, Dict, Any
import time
import json

# Local imports
from starpunk.models import Note
from starpunk.monitoring.business import track_feed_generated


def generate_json_feed(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate JSON Feed 1.1 from published notes

    Builds the complete feed dictionary in memory and serializes it with
    Python's json module (pretty-printed, non-ASCII characters preserved).

    NOTE: For memory-efficient streaming, use generate_json_feed_streaming() instead.
    This function is kept for caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are stripped before use
        site_name: Site title for feed
        site_description: Site description for feed (omitted from the feed
            when empty)
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Returns:
        JSON Feed 1.1 document as a pretty-printed str (non-ASCII characters
        are emitted as-is, not escaped)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_json = generate_json_feed(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing with a monotonic clock so that system
    # clock adjustments cannot produce negative or skewed durations
    start_time = time.perf_counter()

    # Slice once so the serialized items and the reported item count
    # always describe the same list
    items = notes[:limit]

    # Build feed object
    feed = _build_feed_object(
        site_url=site_url,
        site_name=site_name,
        site_description=site_description,
        notes=items,
    )

    # Serialize to JSON (pretty-printed)
    feed_json = json.dumps(feed, ensure_ascii=False, indent=2)

    # Track feed generation metrics
    duration_ms = (time.perf_counter() - start_time) * 1000
    track_feed_generated(
        format='json',
        item_count=len(items),
        duration_ms=duration_ms,
        cached=False,
    )

    return feed_json


def generate_json_feed_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate JSON Feed 1.1 from published notes using streaming

    Returns a generator that yields JSON chunks instead of building the
    entire serialized feed in memory. Recommended for large feeds (100+ items).

    Parameters are validated when this function is called, not when the
    returned generator is first iterated, so an invalid configuration fails
    before a streaming response has started.

    Feed generation metrics are recorded once the generator has been
    fully consumed.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com');
            trailing slashes are stripped before use
        site_name: Site title for feed
        site_description: Site description for feed (omitted from the feed
            when empty)
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of items to include (default: 50)

    Returns:
        Generator yielding JSON chunks as strings (non-ASCII characters are
        emitted as-is, matching generate_json_feed)

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_json_feed_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/json')
    """
    # Validate required parameters eagerly (this outer function is a plain
    # function; only the inner _stream() is a generator)
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")
    feed_url = f"{site_url}/feed.json"

    # Notes from database are already in DESC order (newest first)
    items = notes[:limit]
    last_index = len(items) - 1

    def _stream():
        # Timing starts when the consumer begins pulling chunks
        start_time = time.perf_counter()
        item_count = 0

        # Start feed object; ensure_ascii=False keeps feed-level strings
        # encoded the same way as the items below
        yield '{\n'
        yield '  "version": "https://jsonfeed.org/version/1.1",\n'
        yield f'  "title": {json.dumps(site_name, ensure_ascii=False)},\n'
        yield f'  "home_page_url": {json.dumps(site_url, ensure_ascii=False)},\n'
        yield f'  "feed_url": {json.dumps(feed_url, ensure_ascii=False)},\n'

        if site_description:
            yield f'  "description": {json.dumps(site_description, ensure_ascii=False)},\n'

        yield '  "language": "en",\n'

        # Start items array
        yield '  "items": [\n'

        for index, note in enumerate(items):
            item_count += 1

            # Serialize one item at a time so only a single item's JSON
            # is held in memory
            item_json = json.dumps(
                _build_item_object(site_url, note),
                ensure_ascii=False,
                indent=4,
            )

            # Shift the item four spaces right so it nests inside "items"
            yield '\n'.join('    ' + line for line in item_json.split('\n'))

            # Comma between items, but not after the last one
            yield ',\n' if index < last_index else '\n'

        # Close items array and feed
        yield '  ]\n'
        yield '}\n'

        # Track feed generation metrics (runs only after full consumption)
        duration_ms = (time.perf_counter() - start_time) * 1000
        track_feed_generated(
            format='json',
            item_count=item_count,
            duration_ms=duration_ms,
            cached=False,
        )

    return _stream()


def _build_feed_object(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note]
) -> Dict[str, Any]:
    """
    Assemble the top-level JSON Feed 1.1 dictionary

    Every note is converted with _build_item_object(). The optional
    "description" key is added last, and only when a description is given.

    Args:
        site_url: Site URL (no trailing slash)
        site_name: Feed title
        site_description: Feed description (falsy value omits the key)
        notes: List of notes (already limited)

    Returns:
        JSON Feed dictionary
    """
    # Convert notes to feed items up front
    entries = []
    for note in notes:
        entries.append(_build_item_object(site_url, note))

    feed_document: Dict[str, Any] = {}
    feed_document["version"] = "https://jsonfeed.org/version/1.1"
    feed_document["title"] = site_name
    feed_document["home_page_url"] = site_url
    feed_document["feed_url"] = f"{site_url}/feed.json"
    feed_document["language"] = "en"
    feed_document["items"] = entries

    # Nothing more to add without a description
    if not site_description:
        return feed_document

    feed_document["description"] = site_description
    return feed_document


def _build_item_object(site_url: str, note: Note) -> Dict[str, Any]:
    """
    Build JSON Feed item object from note

    Reads note.permalink, note.title, note.html, note.content,
    note.created_at and (when present) note.media. Each media entry is
    accessed as a mapping with a required 'path' key and optional
    'caption', 'mime_type' and 'size' keys.

    Media URLs and captions are HTML-escaped before being embedded in
    content_html, so captions containing quotes or angle brackets cannot
    break out of the generated <img> attributes.

    Args:
        site_url: Site URL (no trailing slash)
        note: Note to convert to item

    Returns:
        JSON Feed item dictionary
    """
    # Function-scope import: stdlib helper needed only by this block
    from html import escape

    # Build permalink URL
    permalink = f"{site_url}{note.permalink}"

    # Create item with required fields, then the title
    item = {
        "id": permalink,
        "url": permalink,
    }
    item["title"] = note.title

    # Resolve media once; notes without a media attribute (or with an
    # empty/None value) are treated as having no media
    media = getattr(note, 'media', None) or []
    media_urls = [f"{site_url}/media/{media_item['path']}" for media_item in media]

    # Add image field (URL of first/main image) - per JSON Feed 1.1 spec
    # Per Q7: Field should be absent (not null) when no media
    if media:
        item["image"] = media_urls[0]

    # Add content (HTML or text)
    # Per Q24: Include media as HTML in content_html
    if note.html:
        html_parts = []

        # Add media at top if present (v1.2.0 Phase 3)
        if media:
            html_parts.append('<div class="media">')
            for media_item, media_url in zip(media, media_urls):
                # A missing or None caption becomes an empty alt text
                caption = media_item.get('caption') or ''
                html_parts.append(
                    f'<img src="{escape(media_url, quote=True)}" '
                    f'alt="{escape(caption, quote=True)}" />'
                )
            html_parts.append('</div>')

        # Add text content below media
        html_parts.append(note.html)
        item["content_html"] = ''.join(html_parts)
    else:
        item["content_text"] = note.content

    # Add publication date (RFC 3339 format)
    item["date_published"] = _format_rfc3339_date(note.created_at)

    # Add attachments array (v1.2.0 Phase 3, per Q24 and ADR-057)
    # JSON Feed 1.1 native support for attachments
    if media:
        attachments = []
        for media_item, media_url in zip(media, media_urls):
            attachment = {
                'url': media_url,
                'mime_type': media_item.get('mime_type', 'image/jpeg'),
                'size_in_bytes': media_item.get('size', 0)
            }
            # Add title (caption) if present
            if media_item.get('caption'):
                attachment['title'] = media_item['caption']

            attachments.append(attachment)

        item["attachments"] = attachments

    # Add custom StarPunk extensions
    item["_starpunk"] = {
        "permalink_path": note.permalink,
        "word_count": len(note.content.split())
    }

    return item


def _format_rfc3339_date(dt: datetime) -> str:
    """
    Format datetime to RFC 3339 format for JSON Feed

    JSON Feed 1.1 requires RFC 3339 date format for date_published and date_modified.
    RFC 3339 is a profile of ISO 8601.
    Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset)

    Any datetime whose UTC offset is zero is rendered with the 'Z' suffix,
    regardless of which tzinfo implementation it carries. Output always has
    whole-second precision; sub-second components are dropped.

    Args:
        dt: Datetime object to format (naive datetime assumed to be UTC)

    Returns:
        RFC 3339 formatted date string

    Examples:
        >>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc)
        >>> _format_rfc3339_date(dt)
        '2024-11-25T12:00:00Z'
    """
    offset = dt.utcoffset()

    # utcoffset() is None for naive datetimes (and for tzinfo objects that
    # report no offset); treat both as UTC
    if offset is None:
        dt = dt.replace(tzinfo=timezone.utc)
        offset = dt.utcoffset()

    # A zero timedelta is falsy: compare by offset rather than by tzinfo
    # identity so every zero-offset zone gets the 'Z' suffix
    if not offset:
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    # Non-UTC: include the numeric offset, at the same whole-second
    # precision as the UTC branch
    return dt.isoformat(timespec="seconds")