Implement Phase 4 of v1.4.0 Media release - Enhanced Feed Media support. RSS Feed Enhancements (starpunk/feeds/rss.py): - Wrap size variants in <media:group> elements - Add <media:content> for large/medium/small variants with attributes: url, type, medium, isDefault, width, height, fileSize - Add <media:thumbnail> for thumb variant with dimensions - Add <media:title type="plain"> for image captions - Implement isDefault logic: largest available variant (large→medium→small fallback) - Maintain backwards compatibility for media without variants (legacy fallback) JSON Feed Enhancements (starpunk/feeds/json_feed.py): - Add _starpunk.about URL (configurable via STARPUNK_ABOUT_URL config) - Add _starpunk.media_variants array with variant data when variants exist - Each variant entry includes: url, width, height, size_in_bytes, mime_type ATOM Feed Enhancements (starpunk/feeds/atom.py): - Add title attribute to enclosure links for captions - Keep simple (no variants in ATOM per design decision) Test Updates (tests/test_feeds_rss.py): - Update streaming media test to search descendants for media:content - Now inside media:group for images with variants (v1.4.0 behavior) Per design document: /docs/design/v1.4.0/media-implementation-design.md Following ADR-059: Full Feed Media Standardization Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
303 lines
9.5 KiB
Python
303 lines
9.5 KiB
Python
"""
|
|
ATOM 1.0 feed generation for StarPunk
|
|
|
|
This module provides ATOM 1.0 feed generation from published notes using
|
|
Python's standard library xml.etree.ElementTree for proper XML handling.
|
|
|
|
Functions:
|
|
generate_atom: Generate ATOM 1.0 XML feed from notes
|
|
generate_atom_streaming: Memory-efficient streaming ATOM generation
|
|
|
|
Standards:
|
|
- ATOM 1.0 (RFC 4287) specification compliant
|
|
- RFC 3339 date format
|
|
- Proper XML namespacing
|
|
- Escaped HTML and text content
|
|
"""
|
|
|
|
# Standard library imports
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
import time
|
|
import xml.etree.ElementTree as ET
|
|
|
|
# Local imports
|
|
from starpunk.models import Note
|
|
from starpunk.monitoring.business import track_feed_generated
|
|
|
|
|
|
# ATOM namespace
|
|
ATOM_NS = "http://www.w3.org/2005/Atom"
|
|
|
|
|
|
def generate_atom(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
) -> str:
    """
    Generate a complete ATOM 1.0 XML feed as a single string

    Thin wrapper around generate_atom_streaming(): every chunk produced by
    the streaming generator is concatenated and returned in one piece.
    Validation, entry rendering and metrics tracking all happen inside the
    streaming generator.

    NOTE: For memory-efficient streaming, use generate_atom_streaming() instead.
    This function is kept for caching use cases.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed (subtitle)
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of entries to include (default: 50)

    Returns:
        The whole ATOM 1.0 document as a str. The XML declaration announces
        utf-8; encoding the text to bytes is left to the caller.

    Raises:
        ValueError: If site_url or site_name is empty

    Examples:
        >>> notes = list_notes(published_only=True, limit=50)
        >>> feed_xml = generate_atom(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> print(feed_xml[:38])
        <?xml version="1.0" encoding="utf-8"?>
    """
    # Join streaming output for non-streaming version
    return "".join(
        generate_atom_streaming(
            site_url=site_url,
            site_name=site_name,
            site_description=site_description,
            notes=notes,
            limit=limit,
        )
    )
|
|
|
|
|
|
def generate_atom_streaming(
    site_url: str,
    site_name: str,
    site_description: str,
    notes: list[Note],
    limit: int = 50,
):
    """
    Generate ATOM 1.0 XML feed from published notes using streaming

    Memory-efficient generator that yields XML chunks instead of building
    the entire feed in memory. Recommended for large feeds (100+ entries).

    After the closing </feed> chunk has been consumed, the generator
    reports the entry count and elapsed time through
    track_feed_generated(). A consumer that stops iterating early never
    reaches that call.

    Args:
        site_url: Base URL of the site (e.g., 'https://example.com')
        site_name: Site title for feed
        site_description: Site description for feed; the <subtitle>
            element is omitted when this is empty
        notes: List of Note objects to include (should be published only)
        limit: Maximum number of entries to include (default: 50)

    Yields:
        XML chunks as strings (UTF-8)

    Raises:
        ValueError: If site_url or site_name is empty. Because this is a
            generator, the error surfaces when the first chunk is
            requested, not when the function is called.

    Examples:
        >>> from flask import Response
        >>> notes = list_notes(published_only=True, limit=100)
        >>> generator = generate_atom_streaming(
        ...     site_url='https://example.com',
        ...     site_name='My Blog',
        ...     site_description='My personal notes',
        ...     notes=notes
        ... )
        >>> return Response(generator, mimetype='application/atom+xml')
    """
    # Validate required parameters
    if not site_url or not site_url.strip():
        raise ValueError("site_url is required and cannot be empty")

    if not site_name or not site_name.strip():
        raise ValueError("site_name is required and cannot be empty")

    # Remove trailing slash from site_url for consistency
    site_url = site_url.rstrip("/")

    # Track feed generation timing
    start_time = time.time()
    item_count = 0

    # Current timestamp for updated
    now = datetime.now(timezone.utc)

    # Yield XML declaration
    yield '<?xml version="1.0" encoding="utf-8"?>\n'

    # Yield feed opening with namespace
    yield f'<feed xmlns="{ATOM_NS}">\n'

    # Yield feed metadata
    yield f' <id>{_escape_xml(site_url)}/</id>\n'
    yield f' <title>{_escape_xml(site_name)}</title>\n'
    yield f' <updated>{_format_atom_date(now)}</updated>\n'

    # Links
    yield f' <link rel="alternate" type="text/html" href="{_escape_xml(site_url)}"/>\n'
    yield f' <link rel="self" type="application/atom+xml" href="{_escape_xml(site_url)}/feed.atom"/>\n'

    # Optional subtitle
    if site_description:
        yield f' <subtitle>{_escape_xml(site_description)}</subtitle>\n'

    # Generator
    yield ' <generator uri="https://github.com/yourusername/starpunk">StarPunk</generator>\n'

    # Yield entries (newest first)
    # Notes from database are already in DESC order (newest first)
    for note in notes[:limit]:
        item_count += 1

        # Build permalink URL
        permalink = f"{site_url}{note.permalink}"

        # Notes may lack these attributes entirely; treat missing/empty alike
        tags = getattr(note, 'tags', None) or []
        media = getattr(note, 'media', None) or []

        yield ' <entry>\n'

        # Required elements
        yield f' <id>{_escape_xml(permalink)}</id>\n'
        yield f' <title>{_escape_xml(note.title)}</title>\n'

        # Use created_at for both published and updated
        # (Note model doesn't have updated_at tracking yet)
        yield f' <published>{_format_atom_date(note.created_at)}</published>\n'
        yield f' <updated>{_format_atom_date(note.created_at)}</updated>\n'

        # Link to entry
        yield f' <link rel="alternate" type="text/html" href="{_escape_xml(permalink)}"/>\n'

        # Add category elements for tags (v1.3.1)
        for tag in tags:
            yield f' <category term="{_escape_xml(tag["name"])}" label="{_escape_xml(tag["display_name"])}"/>\n'

        # Media enclosures (v1.2.0 Phase 3, per Q24 and ADR-057)
        # Enhanced with title attribute for captions (v1.4.0 Phase 4)
        for item in media:
            media_url = f"{site_url}/media/{item['path']}"
            # "or" (rather than a .get default) also covers keys that are
            # present but hold None, which would otherwise render as "None"
            mime_type = item.get('mime_type') or 'image/jpeg'
            size = item.get('size') or 0
            caption = item.get('caption') or ''

            # Include title attribute for caption
            title_attr = f' title="{_escape_xml(caption)}"' if caption else ''

            yield f' <link rel="enclosure" type="{_escape_xml(mime_type)}" href="{_escape_xml(media_url)}" length="{size}"{title_attr}/>\n'

        # Content - include media as HTML (per Q24)
        if note.html:
            # Media markup goes on top, the note's rendered HTML below it
            html_content = _render_media_html(site_url, media) + note.html

            # HTML content - escaped
            yield ' <content type="html">'
            yield _escape_xml(html_content)
            yield '</content>\n'
        else:
            # Plain text content
            yield ' <content type="text">'
            yield _escape_xml(note.content)
            yield '</content>\n'

        yield ' </entry>\n'

    # Yield closing tag
    yield '</feed>\n'

    # Track feed generation metrics
    duration_ms = (time.time() - start_time) * 1000
    track_feed_generated(
        format='atom',
        item_count=item_count,
        duration_ms=duration_ms,
        cached=False
    )


def _render_media_html(site_url: str, media) -> str:
    """
    Build the HTML fragment that shows a note's media above its text

    Args:
        site_url: Base URL of the site, without a trailing slash
        media: Iterable of dict-like media items; each must have a 'path'
            key and may have a 'caption' key

    Returns:
        '<div class="media">...</div>' containing one <img> per item, or an
        empty string when there is no media
    """
    # Function-scope import keeps this block self-contained
    from html import escape as html_escape

    if not media:
        return ""

    images = []
    for item in media:
        media_url = f"{site_url}/media/{item['path']}"
        caption = item.get('caption') or ''
        # Attribute values must be HTML-escaped here; the XML escaping done
        # later on the whole fragment does not protect the inner HTML
        images.append(
            f'<img src="{html_escape(media_url, quote=True)}" '
            f'alt="{html_escape(caption, quote=True)}" />'
        )

    return '<div class="media">' + ''.join(images) + '</div>'
|
|
|
|
|
|
def _escape_xml(text: str) -> str:
|
|
"""
|
|
Escape special XML characters for safe inclusion in XML elements
|
|
|
|
Escapes the five predefined XML entities: &, <, >, ", '
|
|
|
|
Args:
|
|
text: Text to escape
|
|
|
|
Returns:
|
|
XML-safe text with escaped entities
|
|
|
|
Examples:
|
|
>>> _escape_xml("Hello & goodbye")
|
|
'Hello & goodbye'
|
|
>>> _escape_xml('<p>HTML</p>')
|
|
'<p>HTML</p>'
|
|
"""
|
|
if not text:
|
|
return ""
|
|
|
|
# Escape in order: & first (to avoid double-escaping), then < > " '
|
|
text = text.replace("&", "&")
|
|
text = text.replace("<", "<")
|
|
text = text.replace(">", ">")
|
|
text = text.replace('"', """)
|
|
text = text.replace("'", "'")
|
|
|
|
return text
|
|
|
|
|
|
def _format_atom_date(dt: datetime) -> str:
|
|
"""
|
|
Format datetime to RFC 3339 format for ATOM
|
|
|
|
ATOM 1.0 requires RFC 3339 date format for published and updated elements.
|
|
RFC 3339 is a profile of ISO 8601.
|
|
Format: "2024-11-25T12:00:00Z" (UTC) or "2024-11-25T12:00:00-05:00" (with offset)
|
|
|
|
Args:
|
|
dt: Datetime object to format (naive datetime assumed to be UTC)
|
|
|
|
Returns:
|
|
RFC 3339 formatted date string
|
|
|
|
Examples:
|
|
>>> dt = datetime(2024, 11, 25, 12, 0, 0, tzinfo=timezone.utc)
|
|
>>> _format_atom_date(dt)
|
|
'2024-11-25T12:00:00Z'
|
|
"""
|
|
# Ensure datetime has timezone (assume UTC if naive)
|
|
if dt.tzinfo is None:
|
|
dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
# Format to RFC 3339
|
|
# Use 'Z' suffix for UTC, otherwise include offset
|
|
if dt.tzinfo == timezone.utc:
|
|
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
else:
|
|
# Format with timezone offset
|
|
return dt.isoformat()
|