"""h-app microformat parser for client metadata extraction.""" import logging from dataclasses import dataclass from datetime import datetime, timedelta from typing import Dict from urllib.parse import urlparse import mf2py from gondulf.services.html_fetcher import HTMLFetcherService logger = logging.getLogger("gondulf.happ_parser") @dataclass class ClientMetadata: """Client metadata extracted from h-app markup.""" name: str logo: str | None = None url: str | None = None class HAppParser: """Parse h-app microformat data from client HTML.""" def __init__(self, html_fetcher: HTMLFetcherService): """ Initialize parser with HTML fetcher dependency. Args: html_fetcher: Service for fetching HTML content """ self.html_fetcher = html_fetcher self.cache: Dict[str, tuple[ClientMetadata, datetime]] = {} self.cache_ttl = timedelta(hours=24) async def fetch_and_parse(self, client_id: str) -> ClientMetadata: """ Fetch client_id URL and parse h-app metadata. Uses 24-hour caching to reduce HTTP requests. Falls back to domain name if h-app not found. Args: client_id: Client application URL Returns: ClientMetadata with name (always populated) and optional logo/url """ # Check cache if client_id in self.cache: cached_metadata, cached_at = self.cache[client_id] if datetime.utcnow() - cached_at < self.cache_ttl: logger.debug(f"Returning cached metadata for {client_id}") return cached_metadata logger.info(f"Fetching h-app metadata from {client_id}") # Fetch HTML try: html = self.html_fetcher.fetch(client_id) except Exception as e: logger.warning(f"Failed to fetch {client_id}: {e}") html = None # Parse h-app or fallback to domain name if html: metadata = self._parse_h_app(html, client_id) else: logger.info(f"Using domain fallback for {client_id}") metadata = ClientMetadata( name=self._extract_domain_name(client_id) ) # Cache result self.cache[client_id] = (metadata, datetime.utcnow()) logger.debug(f"Cached metadata for {client_id}: {metadata.name}") return metadata def _parse_h_app(self, html: str, client_id: str) -> ClientMetadata: """ Parse h-app microformat from HTML. Args: html: HTML content to parse client_id: Client URL (for resolving relative URLs) Returns: ClientMetadata with extracted values, or domain fallback if no h-app """ try: # Parse microformats parsed = mf2py.parse(doc=html, url=client_id) # Find h-app items h_apps = [ item for item in parsed.get('items', []) if 'h-app' in item.get('type', []) ] if not h_apps: logger.info(f"No h-app markup found at {client_id}") return ClientMetadata( name=self._extract_domain_name(client_id) ) # Use first h-app h_app = h_apps[0] properties = h_app.get('properties', {}) # Extract properties name = properties.get('name', [None])[0] or self._extract_domain_name(client_id) # Extract logo - mf2py may return dict with 'value' key or string logo_raw = properties.get('logo', [None])[0] if isinstance(logo_raw, dict): logo = logo_raw.get('value') else: logo = logo_raw url = properties.get('url', [None])[0] or client_id logger.info(f"Extracted h-app metadata from {client_id}: name={name}") return ClientMetadata( name=name, logo=logo, url=url ) except Exception as e: logger.error(f"Failed to parse h-app from {client_id}: {e}") return ClientMetadata( name=self._extract_domain_name(client_id) ) def _extract_domain_name(self, client_id: str) -> str: """ Extract domain name from client_id for fallback display. Args: client_id: Client URL Returns: Domain name (e.g., "example.com") """ try: parsed = urlparse(client_id) domain = parsed.netloc or parsed.path return domain except Exception: return client_id