""" Author profile discovery from IndieAuth identity Per ADR-061 and v1.2.0 Phase 2: - Discover h-card from user's IndieAuth 'me' URL - Cache for 24 hours (per Q14) - Graceful fallback if discovery fails - Never block login functionality Discovery Process: 1. Fetch user's profile URL 2. Parse h-card microformats using mf2py 3. Extract: name, photo, url, note (bio), rel-me links 4. Cache in author_profile table with 24-hour TTL 5. Return cached data on subsequent requests Fallback Behavior (per Q14): - If discovery fails, use cached data even if expired - If no cache exists, use minimal defaults (domain as name) - Never block or fail login due to discovery issues """ import json import logging from datetime import datetime, timedelta from typing import Dict, Optional from urllib.parse import urlparse import httpx import mf2py from flask import current_app from starpunk.database import get_db # Discovery timeout (per Q&A Q38) DISCOVERY_TIMEOUT = 5.0 # Cache TTL (per Q&A Q14, Q19) CACHE_TTL_HOURS = 24 class DiscoveryError(Exception): """Raised when author profile discovery fails""" pass def discover_author_profile(me_url: str) -> Optional[Dict]: """ Discover author h-card from IndieAuth profile URL Per Q15: Use mf2py library (already a dependency) Per Q14: Graceful fallback, never block login Per Q16: Use first representative h-card Args: me_url: User's IndieAuth identity URL Returns: Dict with author profile data or None on failure Profile dict contains: - name: Author name (from p-name) - photo: Author photo URL (from u-photo) - url: Author canonical URL (from u-url) - note: Author bio (from p-note) - rel_me_links: List of rel-me URLs """ try: current_app.logger.info(f"Discovering author profile from {me_url}") # Fetch profile page with timeout response = httpx.get( me_url, timeout=DISCOVERY_TIMEOUT, follow_redirects=True, headers={ 'Accept': 'text/html,application/xhtml+xml', 'User-Agent': f'StarPunk/{current_app.config.get("VERSION", "1.2.0")}' } ) response.raise_for_status() # Parse microformats from HTML parsed = mf2py.parse(doc=response.text, url=me_url) # Extract h-card (per Q16: first representative h-card) hcard = _find_representative_hcard(parsed, me_url) if not hcard: current_app.logger.warning(f"No h-card found at {me_url}") return None # Extract h-card properties profile = { 'name': _get_property(hcard, 'name'), 'photo': _get_property(hcard, 'photo'), 'url': _get_property(hcard, 'url') or me_url, 'note': _get_property(hcard, 'note'), } # Extract rel-me links (per Q17: store as list) rel_me_links = parsed.get('rels', {}).get('me', []) profile['rel_me_links'] = rel_me_links current_app.logger.info( f"Discovered author profile: name={profile.get('name')}, " f"photo={'yes' if profile.get('photo') else 'no'}, " f"rel_me_count={len(rel_me_links)}" ) return profile except httpx.TimeoutException: current_app.logger.warning(f"Timeout discovering profile at {me_url}") raise DiscoveryError(f"Timeout fetching profile: {me_url}") except httpx.HTTPStatusError as e: current_app.logger.warning( f"HTTP {e.response.status_code} discovering profile at {me_url}" ) raise DiscoveryError(f"HTTP error fetching profile: {e.response.status_code}") except httpx.RequestError as e: current_app.logger.warning(f"Network error discovering profile at {me_url}: {e}") raise DiscoveryError(f"Network error: {e}") except Exception as e: current_app.logger.error(f"Unexpected error discovering profile at {me_url}: {e}") raise DiscoveryError(f"Discovery failed: {e}") def _find_representative_hcard(parsed: dict, me_url: str) -> Optional[dict]: """ Find representative h-card from parsed microformats Per Q16: First representative h-card = first h-card with p-name Per Q18: First h-card with url property matching profile URL Args: parsed: Parsed microformats data from mf2py me_url: Profile URL for matching Returns: h-card dict or None if not found """ items = parsed.get('items', []) # First try: h-card with matching URL (most specific) for item in items: if 'h-card' in item.get('type', []): properties = item.get('properties', {}) urls = properties.get('url', []) # Check if any URL matches the profile URL for url in urls: if isinstance(url, dict): url = url.get('value', '') if _normalize_url(url) == _normalize_url(me_url): # Found matching h-card return item # Second try: First h-card with p-name (representative h-card) for item in items: if 'h-card' in item.get('type', []): properties = item.get('properties', {}) if properties.get('name'): return item # Third try: Just use first h-card if any for item in items: if 'h-card' in item.get('type', []): return item return None def _get_property(hcard: dict, prop_name: str) -> Optional[str]: """ Extract property value from h-card Handles both string values and nested objects (for u-* properties) Args: hcard: h-card item dict prop_name: Property name (e.g., 'name', 'photo', 'url') Returns: Property value as string or None """ properties = hcard.get('properties', {}) values = properties.get(prop_name, []) if not values: return None # Get first value value = values[0] # Handle nested objects (e.g., u-photo might be {'value': '...', 'alt': '...'}) if isinstance(value, dict): return value.get('value') return value def _normalize_url(url: str) -> str: """ Normalize URL for comparison Removes trailing slash and converts to lowercase Args: url: URL to normalize Returns: Normalized URL """ if not url: return '' return url.rstrip('/').lower() def get_author_profile(me_url: str, refresh: bool = False) -> Dict: """ Get author profile with caching Per Q14: 24-hour cache, never block on failure Per Q19: Use database for caching Args: me_url: User's IndieAuth identity URL refresh: If True, force refresh from profile URL Returns: Author profile dict (from cache or fresh discovery) Always returns a dict, never None (uses fallback defaults) Profile dict contains: - me: IndieAuth identity URL - name: Author name - photo: Author photo URL (may be None) - url: Author canonical URL - note: Author bio (may be None) - rel_me_links: List of rel-me URLs """ db = get_db(current_app) # Check cache unless refresh requested if not refresh: cached = db.execute( """ SELECT me, name, photo, url, note, rel_me_links, cached_until FROM author_profile WHERE me = ? """, (me_url,) ).fetchone() if cached: # Check if cache is still valid cached_until = datetime.fromisoformat(cached['cached_until']) if datetime.utcnow() < cached_until: current_app.logger.debug(f"Using cached author profile for {me_url}") # Parse rel_me_links from JSON rel_me_links = json.loads(cached['rel_me_links']) if cached['rel_me_links'] else [] return { 'me': cached['me'], 'name': cached['name'], 'photo': cached['photo'], 'url': cached['url'], 'note': cached['note'], 'rel_me_links': rel_me_links, } # Attempt discovery try: profile = discover_author_profile(me_url) if profile: # Save to cache save_author_profile(me_url, profile) # Return with me_url added profile['me'] = me_url return profile except DiscoveryError as e: current_app.logger.warning(f"Discovery failed: {e}") # Try to use expired cache as fallback (per Q14) cached = db.execute( """ SELECT me, name, photo, url, note, rel_me_links FROM author_profile WHERE me = ? """, (me_url,) ).fetchone() if cached: current_app.logger.info(f"Using expired cache as fallback for {me_url}") rel_me_links = json.loads(cached['rel_me_links']) if cached['rel_me_links'] else [] return { 'me': cached['me'], 'name': cached['name'], 'photo': cached['photo'], 'url': cached['url'], 'note': cached['note'], 'rel_me_links': rel_me_links, } # No cache, discovery failed - use minimal defaults (per Q14, Q21) current_app.logger.warning( f"No cached profile for {me_url}, using default fallback" ) # Extract domain from URL for default name try: parsed_url = urlparse(me_url) default_name = parsed_url.netloc or me_url except Exception: default_name = me_url return { 'me': me_url, 'name': default_name, 'photo': None, 'url': me_url, 'note': None, 'rel_me_links': [], } def save_author_profile(me_url: str, profile: Dict) -> None: """ Save author profile to database Per Q14: Sets cached_until to 24 hours from now Per Q17: Store rel-me as JSON Args: me_url: User's IndieAuth identity URL profile: Author profile dict from discovery """ db = get_db(current_app) # Calculate cache expiry (24 hours from now) cached_until = datetime.utcnow() + timedelta(hours=CACHE_TTL_HOURS) # Convert rel_me_links to JSON (per Q17) rel_me_json = json.dumps(profile.get('rel_me_links', [])) # Upsert (insert or replace) db.execute( """ INSERT OR REPLACE INTO author_profile (me, name, photo, url, note, rel_me_links, discovered_at, cached_until) VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?) """, ( me_url, profile.get('name'), profile.get('photo'), profile.get('url'), profile.get('note'), rel_me_json, cached_until.isoformat(), ) ) db.commit() current_app.logger.info(f"Saved author profile for {me_url} (expires {cached_until})")