Merges Phase 4a work, including:

Implementation:
- Metadata discovery endpoint (/api/.well-known/oauth-authorization-server)
- h-app microformat parser service
- Enhanced authorization endpoint with client info display
- Configuration management system
- Dependency injection framework

Documentation:
- Comprehensive gap analysis for v1.0.0 compliance
- Phase 4a clarifications on development approach
- Phase 4-5 critical components breakdown

Testing:
- Unit tests for h-app parser (308 lines, comprehensive coverage)
- Unit tests for metadata endpoint (134 lines)
- Unit tests for configuration system (18 lines)
- Integration test updates

All tests are passing with high coverage. Ready for Phase 4b security hardening.
154 lines
4.6 KiB
Python
154 lines
4.6 KiB
Python
"""h-app microformat parser for client metadata extraction."""
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict
|
|
from urllib.parse import urlparse
|
|
|
|
import mf2py
|
|
|
|
from gondulf.services.html_fetcher import HTMLFetcherService
|
|
|
|
logger = logging.getLogger("gondulf.happ_parser")
|
|
|
|
|
|
@dataclass
|
|
class ClientMetadata:
|
|
"""Client metadata extracted from h-app markup."""
|
|
name: str
|
|
logo: str | None = None
|
|
url: str | None = None
|
|
|
|
|
|
class HAppParser:
|
|
"""Parse h-app microformat data from client HTML."""
|
|
|
|
def __init__(self, html_fetcher: HTMLFetcherService):
|
|
"""
|
|
Initialize parser with HTML fetcher dependency.
|
|
|
|
Args:
|
|
html_fetcher: Service for fetching HTML content
|
|
"""
|
|
self.html_fetcher = html_fetcher
|
|
self.cache: Dict[str, tuple[ClientMetadata, datetime]] = {}
|
|
self.cache_ttl = timedelta(hours=24)
|
|
|
|
async def fetch_and_parse(self, client_id: str) -> ClientMetadata:
|
|
"""
|
|
Fetch client_id URL and parse h-app metadata.
|
|
|
|
Uses 24-hour caching to reduce HTTP requests.
|
|
Falls back to domain name if h-app not found.
|
|
|
|
Args:
|
|
client_id: Client application URL
|
|
|
|
Returns:
|
|
ClientMetadata with name (always populated) and optional logo/url
|
|
"""
|
|
# Check cache
|
|
if client_id in self.cache:
|
|
cached_metadata, cached_at = self.cache[client_id]
|
|
if datetime.utcnow() - cached_at < self.cache_ttl:
|
|
logger.debug(f"Returning cached metadata for {client_id}")
|
|
return cached_metadata
|
|
|
|
logger.info(f"Fetching h-app metadata from {client_id}")
|
|
|
|
# Fetch HTML
|
|
try:
|
|
html = self.html_fetcher.fetch(client_id)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to fetch {client_id}: {e}")
|
|
html = None
|
|
|
|
# Parse h-app or fallback to domain name
|
|
if html:
|
|
metadata = self._parse_h_app(html, client_id)
|
|
else:
|
|
logger.info(f"Using domain fallback for {client_id}")
|
|
metadata = ClientMetadata(
|
|
name=self._extract_domain_name(client_id)
|
|
)
|
|
|
|
# Cache result
|
|
self.cache[client_id] = (metadata, datetime.utcnow())
|
|
logger.debug(f"Cached metadata for {client_id}: {metadata.name}")
|
|
|
|
return metadata
|
|
|
|
def _parse_h_app(self, html: str, client_id: str) -> ClientMetadata:
|
|
"""
|
|
Parse h-app microformat from HTML.
|
|
|
|
Args:
|
|
html: HTML content to parse
|
|
client_id: Client URL (for resolving relative URLs)
|
|
|
|
Returns:
|
|
ClientMetadata with extracted values, or domain fallback if no h-app
|
|
"""
|
|
try:
|
|
# Parse microformats
|
|
parsed = mf2py.parse(doc=html, url=client_id)
|
|
|
|
# Find h-app items
|
|
h_apps = [
|
|
item for item in parsed.get('items', [])
|
|
if 'h-app' in item.get('type', [])
|
|
]
|
|
|
|
if not h_apps:
|
|
logger.info(f"No h-app markup found at {client_id}")
|
|
return ClientMetadata(
|
|
name=self._extract_domain_name(client_id)
|
|
)
|
|
|
|
# Use first h-app
|
|
h_app = h_apps[0]
|
|
properties = h_app.get('properties', {})
|
|
|
|
# Extract properties
|
|
name = properties.get('name', [None])[0] or self._extract_domain_name(client_id)
|
|
|
|
# Extract logo - mf2py may return dict with 'value' key or string
|
|
logo_raw = properties.get('logo', [None])[0]
|
|
if isinstance(logo_raw, dict):
|
|
logo = logo_raw.get('value')
|
|
else:
|
|
logo = logo_raw
|
|
|
|
url = properties.get('url', [None])[0] or client_id
|
|
|
|
logger.info(f"Extracted h-app metadata from {client_id}: name={name}")
|
|
|
|
return ClientMetadata(
|
|
name=name,
|
|
logo=logo,
|
|
url=url
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to parse h-app from {client_id}: {e}")
|
|
return ClientMetadata(
|
|
name=self._extract_domain_name(client_id)
|
|
)
|
|
|
|
def _extract_domain_name(self, client_id: str) -> str:
|
|
"""
|
|
Extract domain name from client_id for fallback display.
|
|
|
|
Args:
|
|
client_id: Client URL
|
|
|
|
Returns:
|
|
Domain name (e.g., "example.com")
|
|
"""
|
|
try:
|
|
parsed = urlparse(client_id)
|
|
domain = parsed.netloc or parsed.path
|
|
return domain
|
|
except Exception:
|
|
return client_id
|