"""HTML fetcher service for retrieving user homepages."""
import urllib.request
from urllib.error import HTTPError, URLError
class HTMLFetcherService:
"""Service for fetching HTML content from URLs."""
def __init__(
self,
timeout: int = 10,
max_size: int = 1024 * 1024, # 1MB
max_redirects: int = 5,
user_agent: str = "Gondulf-IndieAuth/0.1"
) -> None:
"""
Initialize HTML fetcher service.
Args:
timeout: Request timeout in seconds (default: 10)
max_size: Maximum response size in bytes (default: 1MB)
max_redirects: Maximum number of redirects to follow (default: 5)
user_agent: User-Agent header value
"""
self.timeout = timeout
self.max_size = max_size
self.max_redirects = max_redirects
self.user_agent = user_agent
def fetch(self, url: str) -> str | None:
"""
Fetch HTML content from URL.
Args:
url: URL to fetch (must be HTTPS)
Returns:
HTML content as string, or None if fetch fails
Raises:
ValueError: If URL is not HTTPS
"""
# Enforce HTTPS
if not url.startswith('https://'):
raise ValueError("URL must use HTTPS")
try:
# Create request with User-Agent header
req = urllib.request.Request(
url,
headers={'User-Agent': self.user_agent}
)
# Open URL with timeout
with urllib.request.urlopen(
req,
timeout=self.timeout
) as response:
# Check content length if provided
content_length = response.headers.get('Content-Length')
if content_length and int(content_length) > self.max_size:
return None
# Read with size limit
content = response.read(self.max_size + 1)
if len(content) > self.max_size:
return None
# Decode content
charset = response.headers.get_content_charset() or 'utf-8'
return content.decode(charset, errors='replace')
except (URLError, HTTPError, UnicodeDecodeError, TimeoutError):
return None
except Exception:
# Catch all other exceptions and return None
return None