Files
StarPunk/starpunk/author_discovery.py
Phil Skentelbery dd822a35b5 feat: v1.2.0-rc.1 - IndieWeb Features Release Candidate
Complete implementation of v1.2.0 "IndieWeb Features" release.

## Phase 1: Custom Slugs
- Optional custom slug field in note creation form
- Auto-sanitization (lowercase, hyphens only)
- Uniqueness validation with auto-numbering
- Read-only after creation to preserve permalinks
- Matches Micropub mp-slug behavior

## Phase 2: Author Discovery + Microformats2
- Automatic h-card discovery from IndieAuth identity URL
- 24-hour caching with graceful fallback
- Never blocks login (per ADR-061)
- Complete h-entry, h-card, h-feed markup
- All required Microformats2 properties
- rel-me links for identity verification
- Passes IndieWeb validation

## Phase 3: Media Upload
- Upload up to 4 images per note (JPEG, PNG, GIF, WebP)
- Automatic optimization with Pillow
  - Auto-resize to 2048px
  - EXIF orientation correction
  - 95% quality compression
- Social media-style layout (media top, text below)
- Optional captions for accessibility
- Integration with all feed formats (RSS, ATOM, JSON Feed)
- Date-organized storage with UUID filenames
- Immutable caching (1 year)

## Database Changes
- migrations/006_add_author_profile.sql - Author discovery cache
- migrations/007_add_media_support.sql - Media storage

## New Modules
- starpunk/author_discovery.py - h-card discovery and caching
- starpunk/media.py - Image upload, validation, optimization

## Documentation
- 4 new ADRs (056, 057, 058, 061)
- Complete design specifications
- Developer Q&A with 40+ questions answered
- 3 implementation reports
- 3 architect reviews (all approved)

## Testing
- 56 new tests for v1.2.0 features
- 842 total tests in suite
- All v1.2.0 feature tests passing

## Dependencies
- Added: mf2py (Microformats2 parser)
- Added: Pillow (image processing)

Version: 1.2.0-rc.1

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 15:02:20 -07:00

378 lines
11 KiB
Python

"""
Author profile discovery from IndieAuth identity
Per ADR-061 and v1.2.0 Phase 2:
- Discover h-card from user's IndieAuth 'me' URL
- Cache for 24 hours (per Q14)
- Graceful fallback if discovery fails
- Never block login functionality
Discovery Process:
1. Fetch user's profile URL
2. Parse h-card microformats using mf2py
3. Extract: name, photo, url, note (bio), rel-me links
4. Cache in author_profile table with 24-hour TTL
5. Return cached data on subsequent requests
Fallback Behavior (per Q14):
- If discovery fails, use cached data even if expired
- If no cache exists, use minimal defaults (domain as name)
- Never block or fail login due to discovery issues
"""
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, Optional
from urllib.parse import urlparse
import httpx
import mf2py
from flask import current_app
from starpunk.database import get_db
# Discovery timeout (per Q&A Q38)
DISCOVERY_TIMEOUT = 5.0
# Cache TTL (per Q&A Q14, Q19)
CACHE_TTL_HOURS = 24
class DiscoveryError(Exception):
    """Signals that author h-card discovery from a profile URL failed."""
def discover_author_profile(me_url: str) -> Optional[Dict]:
    """
    Fetch and parse the author's h-card from their IndieAuth profile URL.

    Per Q15: parse microformats with mf2py.
    Per Q16: prefer the representative h-card (URL match, then p-name).
    Per Q14: callers handle failures gracefully; login is never blocked.

    Args:
        me_url: User's IndieAuth identity URL

    Returns:
        Profile dict with keys name, photo, url, note, rel_me_links,
        or None when the fetched page contains no h-card.

    Raises:
        DiscoveryError: on timeout, HTTP error status, network failure,
            or any unexpected error during fetch/parse.
    """
    try:
        current_app.logger.info(f"Discovering author profile from {me_url}")

        # Fetch the profile page, following redirects, with a hard timeout
        # (per Q&A Q38) so discovery can never hang the login flow.
        request_headers = {
            'Accept': 'text/html,application/xhtml+xml',
            'User-Agent': f'StarPunk/{current_app.config.get("VERSION", "1.2.0")}',
        }
        response = httpx.get(
            me_url,
            timeout=DISCOVERY_TIMEOUT,
            follow_redirects=True,
            headers=request_headers,
        )
        response.raise_for_status()

        # Parse microformats2 out of the fetched HTML.
        parsed = mf2py.parse(doc=response.text, url=me_url)

        # Locate the representative h-card (per Q16).
        hcard = _find_representative_hcard(parsed, me_url)
        if hcard is None:
            current_app.logger.warning(f"No h-card found at {me_url}")
            return None

        # Pull the individual h-card properties; the canonical URL falls
        # back to the identity URL when the card carries no u-url.
        profile = {
            'name': _get_property(hcard, 'name'),
            'photo': _get_property(hcard, 'photo'),
            'url': _get_property(hcard, 'url') or me_url,
            'note': _get_property(hcard, 'note'),
        }

        # rel-me links live at the page level, not inside the h-card
        # (per Q17: stored as a list).
        rel_me_links = parsed.get('rels', {}).get('me', [])
        profile['rel_me_links'] = rel_me_links

        current_app.logger.info(
            f"Discovered author profile: name={profile.get('name')}, "
            f"photo={'yes' if profile.get('photo') else 'no'}, "
            f"rel_me_count={len(rel_me_links)}"
        )
        return profile

    except httpx.TimeoutException:
        current_app.logger.warning(f"Timeout discovering profile at {me_url}")
        raise DiscoveryError(f"Timeout fetching profile: {me_url}")
    except httpx.HTTPStatusError as e:
        current_app.logger.warning(
            f"HTTP {e.response.status_code} discovering profile at {me_url}"
        )
        raise DiscoveryError(f"HTTP error fetching profile: {e.response.status_code}")
    except httpx.RequestError as e:
        current_app.logger.warning(f"Network error discovering profile at {me_url}: {e}")
        raise DiscoveryError(f"Network error: {e}")
    except Exception as e:
        current_app.logger.error(f"Unexpected error discovering profile at {me_url}: {e}")
        raise DiscoveryError(f"Discovery failed: {e}")
def _find_representative_hcard(parsed: dict, me_url: str) -> Optional[dict]:
    """
    Select the representative h-card from parsed microformats.

    Selection order (per Q16/Q18):
      1. first h-card whose u-url matches the profile URL,
      2. first h-card carrying a p-name,
      3. any h-card at all.

    Args:
        parsed: Parsed microformats data from mf2py
        me_url: Profile URL for matching

    Returns:
        The chosen h-card item dict, or None when no h-card exists.
    """
    cards = [
        item for item in parsed.get('items', [])
        if 'h-card' in item.get('type', [])
    ]
    target = _normalize_url(me_url)

    # Pass 1: a u-url matching the identity URL is the strongest signal.
    for card in cards:
        for entry in card.get('properties', {}).get('url', []):
            # mf2py may emit URLs as plain strings or {'value': ...} dicts.
            candidate = entry.get('value', '') if isinstance(entry, dict) else entry
            if _normalize_url(candidate) == target:
                return card

    # Pass 2: an h-card with a p-name is a plausible representative card.
    for card in cards:
        if card.get('properties', {}).get('name'):
            return card

    # Pass 3: settle for whatever h-card appears first, if any.
    return cards[0] if cards else None
def _get_property(hcard: dict, prop_name: str) -> Optional[str]:
"""
Extract property value from h-card
Handles both string values and nested objects (for u-* properties)
Args:
hcard: h-card item dict
prop_name: Property name (e.g., 'name', 'photo', 'url')
Returns:
Property value as string or None
"""
properties = hcard.get('properties', {})
values = properties.get(prop_name, [])
if not values:
return None
# Get first value
value = values[0]
# Handle nested objects (e.g., u-photo might be {'value': '...', 'alt': '...'})
if isinstance(value, dict):
return value.get('value')
return value
def _normalize_url(url: str) -> str:
"""
Normalize URL for comparison
Removes trailing slash and converts to lowercase
Args:
url: URL to normalize
Returns:
Normalized URL
"""
if not url:
return ''
return url.rstrip('/').lower()
def _profile_from_row(row) -> Dict:
    """Convert an author_profile DB row into a profile dict (decodes the rel_me_links JSON column)."""
    rel_me_links = json.loads(row['rel_me_links']) if row['rel_me_links'] else []
    return {
        'me': row['me'],
        'name': row['name'],
        'photo': row['photo'],
        'url': row['url'],
        'note': row['note'],
        'rel_me_links': rel_me_links,
    }


def get_author_profile(me_url: str, refresh: bool = False) -> Dict:
    """
    Get author profile with caching.

    Per Q14: 24-hour cache, never block on failure.
    Per Q19: Use database for caching.

    Args:
        me_url: User's IndieAuth identity URL
        refresh: If True, bypass the cache and force fresh discovery

    Returns:
        Author profile dict (from cache, fresh discovery, expired-cache
        fallback, or minimal defaults). Always a dict, never None.

    Profile dict contains:
        - me: IndieAuth identity URL
        - name: Author name
        - photo: Author photo URL (may be None)
        - url: Author canonical URL
        - note: Author bio (may be None)
        - rel_me_links: List of rel-me URLs
    """
    db = get_db(current_app)

    # 1. Fresh cache hit (skipped when refresh is requested).
    if not refresh:
        cached = db.execute(
            """
            SELECT me, name, photo, url, note, rel_me_links, cached_until
            FROM author_profile
            WHERE me = ?
            """,
            (me_url,),
        ).fetchone()
        if cached:
            cached_until = datetime.fromisoformat(cached['cached_until'])
            # cached_until is stored as naive UTC (see save_author_profile),
            # so it is compared against naive utcnow().
            if datetime.utcnow() < cached_until:
                current_app.logger.debug(f"Using cached author profile for {me_url}")
                return _profile_from_row(cached)

    # 2. Attempt live discovery.
    try:
        profile = discover_author_profile(me_url)
        if profile:
            save_author_profile(me_url, profile)
            profile['me'] = me_url
            return profile
        # profile is None: page had no h-card. Fall through to the
        # stale-cache fallback below — previously this path skipped the
        # expired cache and went straight to defaults, contradicting Q14.
    except DiscoveryError as e:
        current_app.logger.warning(f"Discovery failed: {e}")

    # 3. Stale-cache fallback (per Q14): any cached row, even expired,
    #    beats synthesizing defaults. Covers both the raise path and the
    #    no-h-card path above.
    cached = db.execute(
        """
        SELECT me, name, photo, url, note, rel_me_links
        FROM author_profile
        WHERE me = ?
        """,
        (me_url,),
    ).fetchone()
    if cached:
        current_app.logger.info(f"Using expired cache as fallback for {me_url}")
        return _profile_from_row(cached)

    # 4. No cache and discovery failed — minimal defaults (per Q14, Q21).
    current_app.logger.warning(
        f"No cached profile for {me_url}, using default fallback"
    )
    try:
        parsed_url = urlparse(me_url)
        default_name = parsed_url.netloc or me_url
    except Exception:
        default_name = me_url
    return {
        'me': me_url,
        'name': default_name,
        'photo': None,
        'url': me_url,
        'note': None,
        'rel_me_links': [],
    }
def save_author_profile(me_url: str, profile: Dict) -> None:
    """
    Persist a discovered author profile to the author_profile table.

    Per Q14: cached_until is set 24 hours in the future.
    Per Q17: rel-me links are serialized as a JSON array.

    Args:
        me_url: User's IndieAuth identity URL (table key)
        profile: Author profile dict from discovery
    """
    db = get_db(current_app)

    # Expiry timestamp 24h ahead; stored as naive-UTC ISO string.
    cached_until = datetime.utcnow() + timedelta(hours=CACHE_TTL_HOURS)

    # Serialize rel-me links as JSON (per Q17).
    rel_me_json = json.dumps(profile.get('rel_me_links', []))

    row = (
        me_url,
        profile.get('name'),
        profile.get('photo'),
        profile.get('url'),
        profile.get('note'),
        rel_me_json,
        cached_until.isoformat(),
    )

    # INSERT OR REPLACE keeps one row per identity URL (upsert).
    db.execute(
        """
        INSERT OR REPLACE INTO author_profile
        (me, name, photo, url, note, rel_me_links, discovered_at, cached_until)
        VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)
        """,
        row,
    )
    db.commit()

    current_app.logger.info(f"Saved author profile for {me_url} (expires {cached_until})")