"""Tests for main module functionality."""

import pytest
import sys
import tempfile
from unittest.mock import patch, MagicMock
from argparse import Namespace

from main import parse_arguments, setup_logging, scrape_and_generate_rss
from src.rss_scraper.exceptions import ValidationError, NetworkError, ParseError
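
# NOTE: these imports assume the suite is run from the project root (e.g. `pytest tests/`),
# with main.py importable from the root and the src/ package reachable on the import path
# (via pytest configuration or a conftest.py); adjust if the project is laid out differently.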


class TestParseArguments:
    """Test command line argument parsing."""

    def test_parse_arguments_defaults(self):
        """Test parsing with default arguments."""
        with patch('sys.argv', ['main.py']):
            args = parse_arguments()

        assert args.url == 'https://www.warhammer-community.com/en-gb/'
        assert args.output_dir is None
        assert args.max_scroll == 5
        assert args.log_level == 'INFO'
        assert args.log_file == 'scraper.log'

    def test_parse_arguments_custom_values(self):
        """Test parsing with custom argument values."""
        test_args = [
            'main.py',
            '--url', 'https://example.com',
            '--output-dir', '/custom/path',
            '--max-scroll', '10',
            '--log-level', 'DEBUG',
            '--log-file', 'custom.log'
        ]

        with patch('sys.argv', test_args):
            args = parse_arguments()

        assert args.url == 'https://example.com'
        assert args.output_dir == '/custom/path'
        assert args.max_scroll == 10
        assert args.log_level == 'DEBUG'
        assert args.log_file == 'custom.log'

    def test_parse_arguments_invalid_max_scroll(self):
        """Test parsing fails with invalid max_scroll value."""
        test_args = ['main.py', '--max-scroll', '-1']

        with patch('sys.argv', test_args):
            with pytest.raises(SystemExit):
                parse_arguments()

    def test_parse_arguments_relative_output_dir(self):
        """Test that relative output directory is converted to absolute."""
        test_args = ['main.py', '--output-dir', 'relative/path']

        with patch('sys.argv', test_args):
            args = parse_arguments()

        assert args.output_dir.startswith('/')  # Should be absolute path
        assert args.output_dir.endswith('relative/path')
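

# ---------------------------------------------------------------------------
# For reference: the defaults and failure modes asserted above imply an
# argparse interface roughly like the sketch below. This is a hypothetical
# helper illustrating the assumed CLI contract, not the actual
# parse_arguments implementation in main.py.
# ---------------------------------------------------------------------------
def _parse_arguments_sketch():
    import argparse
    import os

    parser = argparse.ArgumentParser(description="Generate an RSS feed from Warhammer Community.")
    parser.add_argument('--url', default='https://www.warhammer-community.com/en-gb/')
    parser.add_argument('--output-dir', default=None)
    parser.add_argument('--max-scroll', type=int, default=5)
    parser.add_argument('--log-level', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])
    parser.add_argument('--log-file', default='scraper.log')
    args = parser.parse_args()

    if args.max_scroll < 0:
        parser.error('--max-scroll must be non-negative')  # exits via SystemExit
    if args.output_dir is not None:
        args.output_dir = os.path.abspath(args.output_dir)  # relative paths become absolute
    return args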


class TestSetupLogging:
    """Test logging setup functionality."""

    def test_setup_logging_info_level(self):
        """Test logging setup with INFO level."""
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            setup_logging('INFO', temp_file.name)

            import logging
            logger = logging.getLogger('test')
            logger.info("Test message")
            logger.debug("Debug message")  # Should not appear

            # Check that the log file was created and has correct level
            assert logging.getLogger().level == logging.INFO

    def test_setup_logging_debug_level(self):
        """Test logging setup with DEBUG level."""
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            setup_logging('DEBUG', temp_file.name)

            import logging
            assert logging.getLogger().level == logging.DEBUG

    def test_setup_logging_clears_existing_handlers(self):
        """Test that setup_logging clears existing handlers."""
        import logging

        # Add a dummy handler
        dummy_handler = logging.StreamHandler()
        logging.getLogger().addHandler(dummy_handler)
        initial_handler_count = len(logging.getLogger().handlers)

        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            setup_logging('INFO', temp_file.name)

            # Should have exactly 2 handlers (console + file)
            assert len(logging.getLogger().handlers) == 2
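

# ---------------------------------------------------------------------------
# For reference: the assertions above (root logger level taken from the level
# name, previously registered handlers cleared, exactly one console and one
# file handler afterwards) are satisfied by a setup roughly like this sketch.
# It is an illustrative assumption, not the actual main.py implementation.
# ---------------------------------------------------------------------------
def _setup_logging_sketch(log_level, log_file):
    import logging

    root = logging.getLogger()
    root.setLevel(getattr(logging, log_level.upper()))

    # Remove handlers left over from earlier calls (or from the test runner).
    for handler in list(root.handlers):
        root.removeHandler(handler)

    # Exactly two handlers afterwards: console + file.
    root.addHandler(logging.StreamHandler())
    root.addHandler(logging.FileHandler(log_file))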


class TestScrapeAndGenerateRss:
    """Test main scraping function."""

    @patch('main.save_debug_html')
    @patch('main.save_rss_feed')
    @patch('main.generate_rss_feed')
    @patch('main.extract_articles_from_html')
    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_success(
        self, mock_validate_url, mock_load_page, mock_extract_articles,
        mock_generate_rss, mock_save_rss, mock_save_html
    ):
        """Test successful RSS scraping and generation."""
        # Setup mocks
        mock_validate_url.return_value = True
        mock_load_page.return_value = "<html>test</html>"
        mock_extract_articles.return_value = [
            {'title': 'Test', 'link': 'http://example.com', 'date': 'date'}
        ]
        mock_generate_rss.return_value = b"<rss>feed</rss>"
        mock_save_rss.return_value = "/path/to/feed.xml"

        url = "https://www.warhammer-community.com/en-gb/"
        output_dir = "/test/output"

        # Should not raise any exception
        scrape_and_generate_rss(url, output_dir)

        # Verify all functions were called
        mock_validate_url.assert_called_once_with(url)
        mock_load_page.assert_called_once_with(url)
        mock_extract_articles.assert_called_once_with("<html>test</html>", url)
        mock_generate_rss.assert_called_once()
        mock_save_rss.assert_called_once()
        mock_save_html.assert_called_once()

    @patch('main.validate_url')
    def test_scrape_and_generate_rss_validation_error(self, mock_validate_url):
        """Test scraping fails with validation error."""
        mock_validate_url.side_effect = ValidationError("Invalid URL")

        with pytest.raises(ValidationError):
            scrape_and_generate_rss("invalid-url")

    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_network_error(
        self, mock_validate_url, mock_load_page
    ):
        """Test scraping fails with network error."""
        mock_validate_url.return_value = True
        mock_load_page.side_effect = NetworkError("Network error")

        with pytest.raises(NetworkError):
            scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")

    @patch('main.extract_articles_from_html')
    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_parse_error(
        self, mock_validate_url, mock_load_page, mock_extract_articles
    ):
        """Test scraping fails with parse error."""
        mock_validate_url.return_value = True
        mock_load_page.return_value = "<html>test</html>"
        mock_extract_articles.side_effect = ParseError("Parse error")

        with pytest.raises(ParseError):
            scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")

    @patch('main.save_debug_html')
    @patch('main.save_rss_feed')
    @patch('main.generate_rss_feed')
    @patch('main.extract_articles_from_html')
    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_default_output_dir(
        self, mock_validate_url, mock_load_page, mock_extract_articles,
        mock_generate_rss, mock_save_rss, mock_save_html
    ):
        """Test scraping uses default output directory when none provided."""
        # Setup mocks
        mock_validate_url.return_value = True
        mock_load_page.return_value = "<html>test</html>"
        mock_extract_articles.return_value = []
        mock_generate_rss.return_value = b"<rss>feed</rss>"
        mock_save_rss.return_value = "/path/to/feed.xml"

        url = "https://www.warhammer-community.com/en-gb/"

        # Call without output_dir
        scrape_and_generate_rss(url)

        # Verify functions were called (output_dir would be set to default)
        mock_validate_url.assert_called_once_with(url)
        mock_save_rss.assert_called_once_with(b"<rss>feed</rss>", ".")  # Default output dir
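

# ---------------------------------------------------------------------------
# For reference: the patched call chain above implies a pipeline of the form
# validate_url -> load_page_with_retry -> extract_articles_from_html ->
# generate_rss_feed -> save_rss_feed / save_debug_html. The sketch below is a
# hypothetical skeleton consistent with those expectations; the argument lists
# for generate_rss_feed and save_debug_html are assumptions, and the real
# scrape_and_generate_rss in main.py may differ.
# ---------------------------------------------------------------------------
def _scrape_and_generate_rss_sketch(url, output_dir=None):
    import main  # the patch targets above are attributes of the main module

    output_dir = output_dir or "."                          # default asserted in the last test
    main.validate_url(url)                                  # raises ValidationError on bad input
    html = main.load_page_with_retry(url)                   # raises NetworkError after retries
    articles = main.extract_articles_from_html(html, url)   # raises ParseError on bad markup
    rss_bytes = main.generate_rss_feed(articles, url)       # exact signature assumed
    main.save_rss_feed(rss_bytes, output_dir)
    main.save_debug_html(html, output_dir)                  # exact signature assumed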