# Gondulf/src/gondulf/services/html_fetcher.py

"""HTML fetcher service for retrieving user homepages."""
import urllib.request
from urllib.error import HTTPError, URLError


class HTMLFetcherService:
    """Service for fetching HTML content from URLs."""

    def __init__(
        self,
        timeout: int = 10,
        max_size: int = 1024 * 1024,  # 1MB
        max_redirects: int = 5,
        user_agent: str = "Gondulf-IndieAuth/0.1"
    ) -> None:
        """
        Initialize HTML fetcher service.

        Args:
            timeout: Request timeout in seconds (default: 10)
            max_size: Maximum response size in bytes (default: 1MB)
            max_redirects: Maximum number of redirects to follow (default: 5)
            user_agent: User-Agent header value
        """
        self.timeout = timeout
        self.max_size = max_size
        self.max_redirects = max_redirects
        self.user_agent = user_agent

    def fetch(self, url: str) -> str | None:
        """
        Fetch HTML content from URL.

        Args:
            url: URL to fetch (must be HTTPS)

        Returns:
            HTML content as string, or None if fetch fails

        Raises:
            ValueError: If URL is not HTTPS
        """
        # Enforce HTTPS
        if not url.startswith('https://'):
            raise ValueError("URL must use HTTPS")

        try:
            # Create request with User-Agent header
            req = urllib.request.Request(
                url,
                headers={'User-Agent': self.user_agent}
            )

            # Open URL with timeout
            with urllib.request.urlopen(
                req,
                timeout=self.timeout
            ) as response:
                # Check content length if provided
                content_length = response.headers.get('Content-Length')
                if content_length and int(content_length) > self.max_size:
                    return None

                # Read with size limit
                content = response.read(self.max_size + 1)
                if len(content) > self.max_size:
                    return None

                # Decode content
                charset = response.headers.get_content_charset() or 'utf-8'
                return content.decode(charset, errors='replace')

        except (URLError, HTTPError, UnicodeDecodeError, TimeoutError):
            return None
        except Exception:
            # Catch all other exceptions and return None
            return None