"""Tests for HTML fetcher service.""" import pytest from unittest.mock import Mock, patch, MagicMock from urllib.error import URLError, HTTPError from gondulf.services.html_fetcher import HTMLFetcherService class TestHTMLFetcherService: """Tests for HTMLFetcherService.""" def test_init_default_params(self): """Test initialization with default parameters.""" fetcher = HTMLFetcherService() assert fetcher.timeout == 10 assert fetcher.max_size == 1024 * 1024 assert fetcher.max_redirects == 5 assert "Gondulf" in fetcher.user_agent def test_init_custom_params(self): """Test initialization with custom parameters.""" fetcher = HTMLFetcherService( timeout=5, max_size=512 * 1024, max_redirects=3, user_agent="TestAgent/1.0" ) assert fetcher.timeout == 5 assert fetcher.max_size == 512 * 1024 assert fetcher.max_redirects == 3 assert fetcher.user_agent == "TestAgent/1.0" def test_fetch_requires_https(self): """Test that fetch requires HTTPS URLs.""" fetcher = HTMLFetcherService() with pytest.raises(ValueError, match="must use HTTPS"): fetcher.fetch("http://example.com/") @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_success(self, mock_urlopen): """Test successful HTML fetch.""" # Mock response mock_response = MagicMock() mock_response.read.return_value = b"
Test" mock_response.headers.get_content_charset.return_value = "utf-8" mock_response.headers.get.return_value = None # No Content-Length header mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_urlopen.return_value = mock_response fetcher = HTMLFetcherService() html = fetcher.fetch("https://example.com/") assert html == "Test" mock_urlopen.assert_called_once() @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_respects_timeout(self, mock_urlopen): """Test that fetch respects timeout parameter.""" mock_response = MagicMock() mock_response.read.return_value = b"" mock_response.headers.get_content_charset.return_value = "utf-8" mock_response.headers.get.return_value = None mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_urlopen.return_value = mock_response fetcher = HTMLFetcherService(timeout=15) fetcher.fetch("https://example.com/") call_kwargs = mock_urlopen.call_args[1] assert call_kwargs['timeout'] == 15 @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_content_length_too_large(self, mock_urlopen): """Test that fetch returns None if Content-Length exceeds max_size.""" mock_response = MagicMock() mock_response.headers.get.return_value = str(2 * 1024 * 1024) # 2MB mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_urlopen.return_value = mock_response fetcher = HTMLFetcherService(max_size=1024 * 1024) # 1MB max html = fetcher.fetch("https://example.com/") assert html is None @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_response_too_large(self, mock_urlopen): """Test that fetch returns None if response exceeds max_size.""" # Create response larger than max_size large_content = b"x" * (1024 * 1024 + 1) # 1MB + 1 byte mock_response = MagicMock() mock_response.read.return_value = large_content mock_response.headers.get_content_charset.return_value = "utf-8" mock_response.headers.get.return_value = None mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_urlopen.return_value = mock_response fetcher = HTMLFetcherService(max_size=1024 * 1024) html = fetcher.fetch("https://example.com/") assert html is None @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_url_error(self, mock_urlopen): """Test that fetch returns None on URLError.""" mock_urlopen.side_effect = URLError("Connection failed") fetcher = HTMLFetcherService() html = fetcher.fetch("https://example.com/") assert html is None @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_http_error(self, mock_urlopen): """Test that fetch returns None on HTTPError.""" mock_urlopen.side_effect = HTTPError( "https://example.com/", 404, "Not Found", {}, None ) fetcher = HTMLFetcherService() html = fetcher.fetch("https://example.com/") assert html is None @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_timeout_error(self, mock_urlopen): """Test that fetch returns None on timeout.""" mock_urlopen.side_effect = TimeoutError("Request timed out") fetcher = HTMLFetcherService() html = fetcher.fetch("https://example.com/") assert html is None @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_unicode_decode_error(self, mock_urlopen): """Test that fetch returns None on Unicode decode error.""" mock_response = MagicMock() mock_response.read.return_value = b"\xff\xfe" # Invalid UTF-8 mock_response.headers.get_content_charset.return_value = "utf-8" mock_response.headers.get.return_value = None mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_urlopen.return_value = mock_response fetcher = HTMLFetcherService() # Should use 'replace' error handling and return a string html = fetcher.fetch("https://example.com/") assert html is not None # Should not fail, uses error='replace' @patch('gondulf.services.html_fetcher.urllib.request.urlopen') def test_fetch_sets_user_agent(self, mock_urlopen): """Test that fetch sets User-Agent header.""" mock_response = MagicMock() mock_response.read.return_value = b"" mock_response.headers.get_content_charset.return_value = "utf-8" mock_response.headers.get.return_value = None mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_urlopen.return_value = mock_response fetcher = HTMLFetcherService(user_agent="CustomAgent/2.0") fetcher.fetch("https://example.com/") # Check that User-Agent header was set request = mock_urlopen.call_args[0][0] assert request.get_header('User-agent') == "CustomAgent/2.0"