Phil 25086fc01b Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00

202 lines
7.8 KiB
Python

"""Tests for main module functionality."""
import pytest
import sys
import tempfile
from unittest.mock import patch, MagicMock
from argparse import Namespace
from main import parse_arguments, setup_logging, scrape_and_generate_rss
from src.rss_scraper.exceptions import ValidationError, NetworkError, ParseError
class TestParseArguments:
    """Test command line argument parsing."""

    def test_parse_arguments_defaults(self):
        """Test parsing with default arguments."""
        with patch('sys.argv', ['main.py']):
            args = parse_arguments()
            # Defaults defined by main.parse_arguments.
            assert args.url == 'https://www.warhammer-community.com/en-gb/'
            assert args.output_dir is None
            assert args.max_scroll == 5
            assert args.log_level == 'INFO'
            assert args.log_file == 'scraper.log'

    def test_parse_arguments_custom_values(self):
        """Test parsing with custom argument values."""
        test_args = [
            'main.py',
            '--url', 'https://example.com',
            '--output-dir', '/custom/path',
            '--max-scroll', '10',
            '--log-level', 'DEBUG',
            '--log-file', 'custom.log'
        ]
        with patch('sys.argv', test_args):
            args = parse_arguments()
            assert args.url == 'https://example.com'
            assert args.output_dir == '/custom/path'
            assert args.max_scroll == 10
            assert args.log_level == 'DEBUG'
            assert args.log_file == 'custom.log'

    def test_parse_arguments_invalid_max_scroll(self):
        """Test parsing fails with invalid max_scroll value."""
        test_args = ['main.py', '--max-scroll', '-1']
        with patch('sys.argv', test_args):
            # argparse exits the process on invalid argument values.
            with pytest.raises(SystemExit):
                parse_arguments()

    def test_parse_arguments_relative_output_dir(self):
        """Test that relative output directory is converted to absolute."""
        import os
        test_args = ['main.py', '--output-dir', 'relative/path']
        with patch('sys.argv', test_args):
            args = parse_arguments()
            # os.path.isabs is portable; the previous startswith('/') check
            # only held on POSIX and would fail on Windows drive paths.
            assert os.path.isabs(args.output_dir)
            # Normalize separators so the suffix check is also portable.
            assert args.output_dir.replace(os.sep, '/').endswith('relative/path')
class TestSetupLogging:
    """Test logging setup functionality."""

    @staticmethod
    def _remove_log_file(path):
        """Close root file handlers and delete the temp log file.

        The original tests used NamedTemporaryFile(delete=False) and never
        unlinked the file, leaking one temp file per test run. Handlers are
        closed first so the unlink also works on platforms that forbid
        deleting open files (e.g. Windows).
        """
        import logging
        import os
        root = logging.getLogger()
        for handler in root.handlers[:]:
            if isinstance(handler, logging.FileHandler):
                handler.close()
                root.removeHandler(handler)
        if os.path.exists(path):
            os.unlink(path)

    def test_setup_logging_info_level(self):
        """Test logging setup with INFO level."""
        import logging
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            log_path = temp_file.name
        try:
            setup_logging('INFO', log_path)
            logger = logging.getLogger('test')
            logger.info("Test message")
            logger.debug("Debug message")  # Should not appear at INFO level
            # Root logger should be configured at INFO.
            assert logging.getLogger().level == logging.INFO
        finally:
            self._remove_log_file(log_path)

    def test_setup_logging_debug_level(self):
        """Test logging setup with DEBUG level."""
        import logging
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            log_path = temp_file.name
        try:
            setup_logging('DEBUG', log_path)
            assert logging.getLogger().level == logging.DEBUG
        finally:
            self._remove_log_file(log_path)

    def test_setup_logging_clears_existing_handlers(self):
        """Test that setup_logging clears existing handlers."""
        import logging
        # Add a dummy handler that setup_logging is expected to remove.
        dummy_handler = logging.StreamHandler()
        logging.getLogger().addHandler(dummy_handler)
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            log_path = temp_file.name
        try:
            setup_logging('INFO', log_path)
            # Should have exactly 2 handlers (console + file); the dummy
            # handler must have been cleared.
            assert len(logging.getLogger().handlers) == 2
        finally:
            self._remove_log_file(log_path)
class TestScrapeAndGenerateRss:
    """Test main scraping function."""

    def test_scrape_and_generate_rss_success(self):
        """Test successful RSS scraping and generation."""
        url = "https://www.warhammer-community.com/en-gb/"
        output_dir = "/test/output"
        with patch('main.validate_url') as mock_validate_url, \
             patch('main.load_page_with_retry') as mock_load_page, \
             patch('main.extract_articles_from_html') as mock_extract_articles, \
             patch('main.generate_rss_feed') as mock_generate_rss, \
             patch('main.save_rss_feed') as mock_save_rss, \
             patch('main.save_debug_html') as mock_save_html:
            # Arrange: every stage of the pipeline succeeds.
            mock_validate_url.return_value = True
            mock_load_page.return_value = "<html>test</html>"
            mock_extract_articles.return_value = [
                {'title': 'Test', 'link': 'http://example.com', 'date': 'date'}
            ]
            mock_generate_rss.return_value = b"<rss>feed</rss>"
            mock_save_rss.return_value = "/path/to/feed.xml"

            # Act: should complete without raising.
            scrape_and_generate_rss(url, output_dir)

            # Assert: each stage was invoked exactly once.
            mock_validate_url.assert_called_once_with(url)
            mock_load_page.assert_called_once_with(url)
            mock_extract_articles.assert_called_once_with("<html>test</html>", url)
            mock_generate_rss.assert_called_once()
            mock_save_rss.assert_called_once()
            mock_save_html.assert_called_once()

    def test_scrape_and_generate_rss_validation_error(self):
        """Test scraping fails with validation error."""
        with patch('main.validate_url') as mock_validate_url:
            mock_validate_url.side_effect = ValidationError("Invalid URL")
            with pytest.raises(ValidationError):
                scrape_and_generate_rss("invalid-url")

    def test_scrape_and_generate_rss_network_error(self):
        """Test scraping fails with network error."""
        with patch('main.validate_url') as mock_validate_url, \
             patch('main.load_page_with_retry') as mock_load_page:
            mock_validate_url.return_value = True
            mock_load_page.side_effect = NetworkError("Network error")
            with pytest.raises(NetworkError):
                scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")

    def test_scrape_and_generate_rss_parse_error(self):
        """Test scraping fails with parse error."""
        with patch('main.validate_url') as mock_validate_url, \
             patch('main.load_page_with_retry') as mock_load_page, \
             patch('main.extract_articles_from_html') as mock_extract_articles:
            mock_validate_url.return_value = True
            mock_load_page.return_value = "<html>test</html>"
            mock_extract_articles.side_effect = ParseError("Parse error")
            with pytest.raises(ParseError):
                scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")

    def test_scrape_and_generate_rss_default_output_dir(self):
        """Test scraping uses default output directory when none provided."""
        url = "https://www.warhammer-community.com/en-gb/"
        with patch('main.validate_url') as mock_validate_url, \
             patch('main.load_page_with_retry') as mock_load_page, \
             patch('main.extract_articles_from_html') as mock_extract_articles, \
             patch('main.generate_rss_feed') as mock_generate_rss, \
             patch('main.save_rss_feed') as mock_save_rss, \
             patch('main.save_debug_html'):
            # Arrange: pipeline succeeds with an empty article list.
            mock_validate_url.return_value = True
            mock_load_page.return_value = "<html>test</html>"
            mock_extract_articles.return_value = []
            mock_generate_rss.return_value = b"<rss>feed</rss>"
            mock_save_rss.return_value = "/path/to/feed.xml"

            # Act: call without output_dir.
            scrape_and_generate_rss(url)

            # Assert: default output dir "." is passed through to save.
            mock_validate_url.assert_called_once_with(url)
            mock_save_rss.assert_called_once_with(b"<rss>feed</rss>", ".")