Files
StarPunk/starpunk/author_discovery.py
Phil Skentelbery dd822a35b5 feat: v1.2.0-rc.1 - IndieWeb Features Release Candidate
Complete implementation of v1.2.0 "IndieWeb Features" release.

## Phase 1: Custom Slugs
- Optional custom slug field in note creation form
- Auto-sanitization (lowercase, hyphens only)
- Uniqueness validation with auto-numbering
- Read-only after creation to preserve permalinks
- Matches Micropub mp-slug behavior

## Phase 2: Author Discovery + Microformats2
- Automatic h-card discovery from IndieAuth identity URL
- 24-hour caching with graceful fallback
- Never blocks login (per ADR-061)
- Complete h-entry, h-card, h-feed markup
- All required Microformats2 properties
- rel-me links for identity verification
- Passes IndieWeb validation

## Phase 3: Media Upload
- Upload up to 4 images per note (JPEG, PNG, GIF, WebP)
- Automatic optimization with Pillow
  - Auto-resize to 2048px
  - EXIF orientation correction
  - 95% quality compression
- Social media-style layout (media top, text below)
- Optional captions for accessibility
- Integration with all feed formats (RSS, ATOM, JSON Feed)
- Date-organized storage with UUID filenames
- Immutable caching (1 year)

## Database Changes
- migrations/006_add_author_profile.sql - Author discovery cache
- migrations/007_add_media_support.sql - Media storage

## New Modules
- starpunk/author_discovery.py - h-card discovery and caching
- starpunk/media.py - Image upload, validation, optimization

## Documentation
- 4 new ADRs (056, 057, 058, 061)
- Complete design specifications
- Developer Q&A with 40+ questions answered
- 3 implementation reports
- 3 architect reviews (all approved)

## Testing
- 56 new tests for v1.2.0 features
- 842 total tests in suite
- All v1.2.0 feature tests passing

## Dependencies
- Added: mf2py (Microformats2 parser)
- Added: Pillow (image processing)

Version: 1.2.0-rc.1

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 15:02:20 -07:00

378 lines
11 KiB
Python

"""
Author profile discovery from IndieAuth identity
Per ADR-061 and v1.2.0 Phase 2:
- Discover h-card from user's IndieAuth 'me' URL
- Cache for 24 hours (per Q14)
- Graceful fallback if discovery fails
- Never block login functionality
Discovery Process:
1. Fetch user's profile URL
2. Parse h-card microformats using mf2py
3. Extract: name, photo, url, note (bio), rel-me links
4. Cache in author_profile table with 24-hour TTL
5. Return cached data on subsequent requests
Fallback Behavior (per Q14):
- If discovery fails, use cached data even if expired
- If no cache exists, use minimal defaults (domain as name)
- Never block or fail login due to discovery issues
"""
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, Optional
from urllib.parse import urlparse
import httpx
import mf2py
from flask import current_app
from starpunk.database import get_db
# Discovery timeout (per Q&A Q38)
DISCOVERY_TIMEOUT = 5.0
# Cache TTL (per Q&A Q14, Q19)
CACHE_TTL_HOURS = 24
class DiscoveryError(Exception):
    """Signals that author h-card discovery from a profile URL failed."""
def discover_author_profile(me_url: str) -> Optional[Dict]:
    """
    Fetch and parse the author's h-card from their IndieAuth profile URL.

    Per Q15: parse microformats with mf2py.
    Per Q16: prefer the representative h-card (URL match, then p-name).
    Per Q14: callers handle failures gracefully; login is never blocked.

    Args:
        me_url: User's IndieAuth identity URL

    Returns:
        Profile dict with keys name, photo, url, note, rel_me_links,
        or None when the fetched page contains no h-card.

    Raises:
        DiscoveryError: on timeout, HTTP error status, network failure,
            or any unexpected error during fetch/parse.
    """
    try:
        current_app.logger.info(f"Discovering author profile from {me_url}")

        # Fetch the profile page, following redirects, with a hard timeout
        # (per Q&A Q38) so discovery can never hang the login flow.
        request_headers = {
            'Accept': 'text/html,application/xhtml+xml',
            'User-Agent': f'StarPunk/{current_app.config.get("VERSION", "1.2.0")}',
        }
        response = httpx.get(
            me_url,
            timeout=DISCOVERY_TIMEOUT,
            follow_redirects=True,
            headers=request_headers,
        )
        response.raise_for_status()

        # Parse microformats2 out of the fetched HTML.
        parsed = mf2py.parse(doc=response.text, url=me_url)

        # Locate the representative h-card (per Q16).
        hcard = _find_representative_hcard(parsed, me_url)
        if hcard is None:
            current_app.logger.warning(f"No h-card found at {me_url}")
            return None

        # Pull the individual h-card properties; the canonical URL falls
        # back to the identity URL when the card carries no u-url.
        profile = {
            'name': _get_property(hcard, 'name'),
            'photo': _get_property(hcard, 'photo'),
            'url': _get_property(hcard, 'url') or me_url,
            'note': _get_property(hcard, 'note'),
        }

        # rel-me links live at the page level, not inside the h-card
        # (per Q17: stored as a list).
        rel_me_links = parsed.get('rels', {}).get('me', [])
        profile['rel_me_links'] = rel_me_links

        current_app.logger.info(
            f"Discovered author profile: name={profile.get('name')}, "
            f"photo={'yes' if profile.get('photo') else 'no'}, "
            f"rel_me_count={len(rel_me_links)}"
        )
        return profile

    except httpx.TimeoutException:
        current_app.logger.warning(f"Timeout discovering profile at {me_url}")
        raise DiscoveryError(f"Timeout fetching profile: {me_url}")
    except httpx.HTTPStatusError as e:
        current_app.logger.warning(
            f"HTTP {e.response.status_code} discovering profile at {me_url}"
        )
        raise DiscoveryError(f"HTTP error fetching profile: {e.response.status_code}")
    except httpx.RequestError as e:
        current_app.logger.warning(f"Network error discovering profile at {me_url}: {e}")
        raise DiscoveryError(f"Network error: {e}")
    except Exception as e:
        current_app.logger.error(f"Unexpected error discovering profile at {me_url}: {e}")
        raise DiscoveryError(f"Discovery failed: {e}")
def _find_representative_hcard(parsed: dict, me_url: str) -> Optional[dict]:
    """
    Select the representative h-card from parsed microformats.

    Selection order (per Q16/Q18):
      1. first h-card whose u-url matches the profile URL,
      2. first h-card carrying a p-name,
      3. any h-card at all.

    Args:
        parsed: Parsed microformats data from mf2py
        me_url: Profile URL for matching

    Returns:
        The chosen h-card item dict, or None when no h-card exists.
    """
    cards = [
        item for item in parsed.get('items', [])
        if 'h-card' in item.get('type', [])
    ]
    target = _normalize_url(me_url)

    # Pass 1: a u-url matching the identity URL is the strongest signal.
    for card in cards:
        for entry in card.get('properties', {}).get('url', []):
            # mf2py may emit URLs as plain strings or {'value': ...} dicts.
            candidate = entry.get('value', '') if isinstance(entry, dict) else entry
            if _normalize_url(candidate) == target:
                return card

    # Pass 2: an h-card with a p-name is a plausible representative card.
    for card in cards:
        if card.get('properties', {}).get('name'):
            return card

    # Pass 3: settle for whatever h-card appears first, if any.
    return cards[0] if cards else None
def _get_property(hcard: dict, prop_name: str) -> Optional[str]:
"""
Extract property value from h-card
Handles both string values and nested objects (for u-* properties)
Args:
hcard: h-card item dict
prop_name: Property name (e.g., 'name', 'photo', 'url')
Returns:
Property value as string or None
"""
properties = hcard.get('properties', {})
values = properties.get(prop_name, [])
if not values:
return None
# Get first value
value = values[0]
# Handle nested objects (e.g., u-photo might be {'value': '...', 'alt': '...'})
if isinstance(value, dict):
return value.get('value')
return value
def _normalize_url(url: str) -> str:
"""
Normalize URL for comparison
Removes trailing slash and converts to lowercase
Args:
url: URL to normalize
Returns:
Normalized URL
"""
if not url:
return ''
return url.rstrip('/').lower()
def _profile_from_row(row) -> Dict:
    """Convert an author_profile DB row into a profile dict (decodes the rel_me_links JSON column)."""
    rel_me_links = json.loads(row['rel_me_links']) if row['rel_me_links'] else []
    return {
        'me': row['me'],
        'name': row['name'],
        'photo': row['photo'],
        'url': row['url'],
        'note': row['note'],
        'rel_me_links': rel_me_links,
    }


def get_author_profile(me_url: str, refresh: bool = False) -> Dict:
    """
    Get author profile with caching.

    Per Q14: 24-hour cache, never block on failure.
    Per Q19: Use database for caching.

    Args:
        me_url: User's IndieAuth identity URL
        refresh: If True, bypass the cache and force fresh discovery

    Returns:
        Author profile dict (from cache, fresh discovery, expired-cache
        fallback, or minimal defaults). Always a dict, never None.

    Profile dict contains:
        - me: IndieAuth identity URL
        - name: Author name
        - photo: Author photo URL (may be None)
        - url: Author canonical URL
        - note: Author bio (may be None)
        - rel_me_links: List of rel-me URLs
    """
    db = get_db(current_app)

    # 1. Fresh cache hit (skipped when refresh is requested).
    if not refresh:
        cached = db.execute(
            """
            SELECT me, name, photo, url, note, rel_me_links, cached_until
            FROM author_profile
            WHERE me = ?
            """,
            (me_url,),
        ).fetchone()
        if cached:
            cached_until = datetime.fromisoformat(cached['cached_until'])
            # cached_until is stored as naive UTC (see save_author_profile),
            # so it is compared against naive utcnow().
            if datetime.utcnow() < cached_until:
                current_app.logger.debug(f"Using cached author profile for {me_url}")
                return _profile_from_row(cached)

    # 2. Attempt live discovery.
    try:
        profile = discover_author_profile(me_url)
        if profile:
            save_author_profile(me_url, profile)
            profile['me'] = me_url
            return profile
        # profile is None: page had no h-card. Fall through to the
        # stale-cache fallback below — previously this path skipped the
        # expired cache and went straight to defaults, contradicting Q14.
    except DiscoveryError as e:
        current_app.logger.warning(f"Discovery failed: {e}")

    # 3. Stale-cache fallback (per Q14): any cached row, even expired,
    #    beats synthesizing defaults. Covers both the raise path and the
    #    no-h-card path above.
    cached = db.execute(
        """
        SELECT me, name, photo, url, note, rel_me_links
        FROM author_profile
        WHERE me = ?
        """,
        (me_url,),
    ).fetchone()
    if cached:
        current_app.logger.info(f"Using expired cache as fallback for {me_url}")
        return _profile_from_row(cached)

    # 4. No cache and discovery failed — minimal defaults (per Q14, Q21).
    current_app.logger.warning(
        f"No cached profile for {me_url}, using default fallback"
    )
    try:
        parsed_url = urlparse(me_url)
        default_name = parsed_url.netloc or me_url
    except Exception:
        default_name = me_url
    return {
        'me': me_url,
        'name': default_name,
        'photo': None,
        'url': me_url,
        'note': None,
        'rel_me_links': [],
    }
def save_author_profile(me_url: str, profile: Dict) -> None:
    """
    Persist a discovered author profile to the author_profile table.

    Per Q14: cached_until is set 24 hours in the future.
    Per Q17: rel-me links are serialized as a JSON array.

    Args:
        me_url: User's IndieAuth identity URL (table key)
        profile: Author profile dict from discovery
    """
    db = get_db(current_app)

    # Expiry timestamp 24h ahead; stored as naive-UTC ISO string.
    cached_until = datetime.utcnow() + timedelta(hours=CACHE_TTL_HOURS)

    # Serialize rel-me links as JSON (per Q17).
    rel_me_json = json.dumps(profile.get('rel_me_links', []))

    row = (
        me_url,
        profile.get('name'),
        profile.get('photo'),
        profile.get('url'),
        profile.get('note'),
        rel_me_json,
        cached_until.isoformat(),
    )

    # INSERT OR REPLACE keeps one row per identity URL (upsert).
    db.execute(
        """
        INSERT OR REPLACE INTO author_profile
        (me, name, photo, url, note, rel_me_links, discovered_at, cached_until)
        VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)
        """,
        row,
    )
    db.commit()

    current_app.logger.info(f"Saved author profile for {me_url} (expires {cached_until})")