"""HTML fetcher service for retrieving user homepages.""" import urllib.request from urllib.error import HTTPError, URLError class HTMLFetcherService: """Service for fetching HTML content from URLs.""" def __init__( self, timeout: int = 10, max_size: int = 1024 * 1024, # 1MB max_redirects: int = 5, user_agent: str = "Gondulf-IndieAuth/0.1" ) -> None: """ Initialize HTML fetcher service. Args: timeout: Request timeout in seconds (default: 10) max_size: Maximum response size in bytes (default: 1MB) max_redirects: Maximum number of redirects to follow (default: 5) user_agent: User-Agent header value """ self.timeout = timeout self.max_size = max_size self.max_redirects = max_redirects self.user_agent = user_agent def fetch(self, url: str) -> str | None: """ Fetch HTML content from URL. Args: url: URL to fetch (must be HTTPS) Returns: HTML content as string, or None if fetch fails Raises: ValueError: If URL is not HTTPS """ # Enforce HTTPS if not url.startswith('https://'): raise ValueError("URL must use HTTPS") try: # Create request with User-Agent header req = urllib.request.Request( url, headers={'User-Agent': self.user_agent} ) # Open URL with timeout with urllib.request.urlopen( req, timeout=self.timeout ) as response: # Check content length if provided content_length = response.headers.get('Content-Length') if content_length and int(content_length) > self.max_size: return None # Read with size limit content = response.read(self.max_size + 1) if len(content) > self.max_size: return None # Decode content charset = response.headers.get_content_charset() or 'utf-8' return content.decode(charset, errors='replace') except (URLError, HTTPError, UnicodeDecodeError, TimeoutError): return None except Exception: # Catch all other exceptions and return None return None