Files
Gondulf/src/gondulf/services/happ_parser.py
Phil Skentelbery 115e733604 feat(phase-4a): complete Phase 3 implementation and gap analysis
Merges Phase 4a work including:

Implementation:
- Metadata discovery endpoint (/api/.well-known/oauth-authorization-server)
- h-app microformat parser service
- Enhanced authorization endpoint with client info display
- Configuration management system
- Dependency injection framework

Documentation:
- Comprehensive gap analysis for v1.0.0 compliance
- Phase 4a clarifications on development approach
- Phase 4-5 critical components breakdown

Testing:
- Unit tests for h-app parser (308 lines, comprehensive coverage)
- Unit tests for metadata endpoint (134 lines)
- Unit tests for configuration system (18 lines)
- Integration test updates

All tests passing with high coverage. Ready for Phase 4b security hardening.
2025-11-20 17:16:11 -07:00

154 lines
4.6 KiB
Python

"""h-app microformat parser for client metadata extraction."""
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict
from urllib.parse import urlparse
import mf2py
from gondulf.services.html_fetcher import HTMLFetcherService
logger = logging.getLogger("gondulf.happ_parser")
@dataclass
class ClientMetadata:
"""Client metadata extracted from h-app markup."""
name: str
logo: str | None = None
url: str | None = None
class HAppParser:
    """Parse h-app microformat data from client HTML.

    Results are kept per ``client_id`` in an in-process dict for
    ``cache_ttl`` (24 hours). Domain-name fallbacks produced when fetching
    or parsing fails are cached the same way.
    """

    def __init__(self, html_fetcher: HTMLFetcherService):
        """
        Initialize parser with HTML fetcher dependency.

        Args:
            html_fetcher: Service for fetching HTML content
        """
        self.html_fetcher = html_fetcher
        # client_id -> (metadata, naive-UTC time the entry was stored).
        # NOTE(review): entries are never evicted, only overwritten once
        # stale; if client_id values are caller-controlled this can grow
        # without bound - consider a size cap.
        self.cache: dict[str, tuple[ClientMetadata, datetime]] = {}
        self.cache_ttl = timedelta(hours=24)

    async def fetch_and_parse(self, client_id: str) -> ClientMetadata:
        """
        Fetch client_id URL and parse h-app metadata.

        Uses 24-hour caching to reduce HTTP requests.
        Falls back to domain name if h-app not found.

        Fetch errors are logged and swallowed; they result in the
        domain-name fallback rather than an exception.

        Args:
            client_id: Client application URL

        Returns:
            ClientMetadata with name (always populated) and optional logo/url
        """
        # Serve from cache while the entry is younger than cache_ttl.
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12.
        # It is kept so that timestamps stay naive-UTC and remain comparable
        # with entries other code may have placed in self.cache.
        cached = self.cache.get(client_id)
        if cached is not None:
            cached_metadata, cached_at = cached
            if datetime.utcnow() - cached_at < self.cache_ttl:
                logger.debug("Returning cached metadata for %s", client_id)
                return cached_metadata

        logger.info("Fetching h-app metadata from %s", client_id)

        # Deliberate best-effort: any fetch failure degrades to the fallback.
        # NOTE(review): fetch() is called synchronously inside a coroutine;
        # if it performs blocking network I/O it will stall the event loop -
        # confirm against HTMLFetcherService.
        try:
            html = self.html_fetcher.fetch(client_id)
        except Exception as e:
            logger.warning("Failed to fetch %s: %s", client_id, e)
            html = None

        # Parse h-app or fall back to the domain name (also for empty HTML).
        if html:
            metadata = self._parse_h_app(html, client_id)
        else:
            logger.info("Using domain fallback for %s", client_id)
            metadata = ClientMetadata(
                name=self._extract_domain_name(client_id)
            )

        # Cache result (successful parses and fallbacks alike).
        self.cache[client_id] = (metadata, datetime.utcnow())
        logger.debug("Cached metadata for %s: %s", client_id, metadata.name)
        return metadata

    @staticmethod
    def _first_property(properties: dict, key: str) -> str | None:
        """
        Return the first value of a microformats property as a string.

        mf2py represents each property as a list whose entries are either
        plain strings or dicts carrying the text under a 'value' key. This
        normalises both shapes and tolerates a missing key or an empty list.

        Args:
            properties: The 'properties' mapping of a parsed mf2 item
            key: Property name without prefix (e.g. 'name', 'logo', 'url')

        Returns:
            The first value as a string, or None if the property is absent,
            empty, or its first entry holds no string value
        """
        values = properties.get(key)
        if not values:
            return None
        first = values[0]
        if isinstance(first, dict):
            first = first.get('value')
        return first if isinstance(first, str) else None

    def _parse_h_app(self, html: str, client_id: str) -> ClientMetadata:
        """
        Parse h-app microformat from HTML.

        Only top-level items of the parsed document are inspected, and the
        first h-app found is used.

        Args:
            html: HTML content to parse
            client_id: Client URL (for resolving relative URLs)

        Returns:
            ClientMetadata with extracted values, or domain fallback if no h-app
        """
        # The broad except is intentional: the HTML comes from a remote site,
        # and a parser failure must degrade to the fallback, not propagate.
        try:
            parsed = mf2py.parse(doc=html, url=client_id)

            h_apps = [
                item for item in parsed.get('items', [])
                if 'h-app' in item.get('type', [])
            ]
            if not h_apps:
                logger.info("No h-app markup found at %s", client_id)
                return ClientMetadata(
                    name=self._extract_domain_name(client_id)
                )

            # Use first h-app
            properties = h_apps[0].get('properties') or {}

            # All three properties go through the same normalisation so a
            # dict-valued or empty entry in one of them cannot discard the
            # others or leak a non-string into ClientMetadata.
            name = (
                self._first_property(properties, 'name')
                or self._extract_domain_name(client_id)
            )
            logo = self._first_property(properties, 'logo')
            url = self._first_property(properties, 'url') or client_id

            logger.info(
                "Extracted h-app metadata from %s: name=%s", client_id, name
            )
            return ClientMetadata(name=name, logo=logo, url=url)
        except Exception as e:
            logger.error("Failed to parse h-app from %s: %s", client_id, e)
            return ClientMetadata(
                name=self._extract_domain_name(client_id)
            )

    def _extract_domain_name(self, client_id: str) -> str:
        """
        Extract domain name from client_id for fallback display.

        Returns the URL's network location (which includes any port), or the
        path component when the value has no scheme/netloc, e.g. a bare
        "example.com".

        Args:
            client_id: Client URL

        Returns:
            Domain name (e.g., "example.com"); the input unchanged if it
            cannot be parsed
        """
        try:
            parsed = urlparse(client_id)
        except (ValueError, TypeError, AttributeError):
            # urlparse rejects malformed netlocs and non-string input.
            return client_id
        return parsed.netloc or parsed.path