Phil 25086fc01b Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00

116 lines
5.2 KiB
Python

"""Tests for configuration module."""
import pytest
import os
from unittest.mock import patch
from src.rss_scraper.config import Config
class TestConfig:
    """Tests for the ``Config`` class in ``src.rss_scraper.config``.

    Covers the class-level defaults, overriding those defaults through
    environment variables, the ``get_output_dir`` / ``get_allowed_domains``
    helpers, and each failure branch of ``validate_config``.
    """

    def test_default_values(self):
        """Test that default configuration values are set correctly."""
        assert Config.MAX_SCROLL_ITERATIONS == 5
        assert Config.MAX_CONTENT_SIZE == 10 * 1024 * 1024
        assert Config.MAX_TITLE_LENGTH == 500
        assert Config.SCROLL_DELAY_SECONDS == 2.0
        assert Config.PAGE_TIMEOUT_MS == 120000
        assert Config.DEFAULT_URL == 'https://www.warhammer-community.com/en-gb/'
        assert Config.DEFAULT_OUTPUT_DIR == '.'
        assert Config.RSS_FILENAME == 'warhammer_rss_feed.xml'
        assert Config.DEBUG_HTML_FILENAME == 'page.html'
        assert Config.FEED_TITLE == 'Warhammer Community RSS Feed'
        assert Config.FEED_DESCRIPTION == 'Latest Warhammer Community Articles'

    def test_environment_variable_override(self):
        """Test that environment variables override default values.

        The config module has to be re-executed for the patched
        environment to take effect, so it is reloaded inside the patch and
        the freshly created ``Config`` class is inspected.
        """
        import importlib

        # Reload the same module the rest of this file imports ``Config``
        # from; a bare ``import config`` would target a different module
        # (or fail to import at all).
        import src.rss_scraper.config as config_module

        overrides = {
            'MAX_SCROLL_ITERATIONS': '10',
            'MAX_CONTENT_SIZE': '20971520',  # 20MB
            'SCROLL_DELAY_SECONDS': '1.5',
            'DEFAULT_URL': 'https://example.com',
            'RSS_FILENAME': 'custom_feed.xml',
        }

        try:
            with patch.dict(os.environ, overrides):
                # Need to reload the config to pick up environment changes
                importlib.reload(config_module)
                reloaded = config_module.Config

                assert reloaded.MAX_SCROLL_ITERATIONS == 10
                assert reloaded.MAX_CONTENT_SIZE == 20971520
                assert reloaded.SCROLL_DELAY_SECONDS == 1.5
                assert reloaded.DEFAULT_URL == 'https://example.com'
                assert reloaded.RSS_FILENAME == 'custom_feed.xml'
        finally:
            # patch.dict has restored os.environ by the time we get here;
            # reload once more so the module does not keep the overridden
            # values and leak them into tests that run afterwards.
            importlib.reload(config_module)

    def test_get_output_dir_with_override(self):
        """Test get_output_dir method with override."""
        result = Config.get_output_dir('/custom/path')
        assert result == '/custom/path'

    def test_get_output_dir_without_override(self):
        """Test get_output_dir method without override."""
        result = Config.get_output_dir()
        assert result == Config.DEFAULT_OUTPUT_DIR

    def test_get_allowed_domains_default(self):
        """Test get_allowed_domains returns default domains."""
        # Drop any ambient ALLOWED_DOMAINS so the defaults are what gets
        # exercised; patch.dict puts the original environment back on exit.
        with patch.dict(os.environ):
            os.environ.pop('ALLOWED_DOMAINS', None)
            domains = Config.get_allowed_domains()

        assert 'warhammer-community.com' in domains
        assert 'www.warhammer-community.com' in domains

    def test_get_allowed_domains_from_env(self):
        """Test get_allowed_domains reads from environment variable."""
        with patch.dict(os.environ, {
            'ALLOWED_DOMAINS': 'example.com,test.com,another.com'
        }):
            domains = Config.get_allowed_domains()
            assert domains == ['example.com', 'test.com', 'another.com']

    def test_validate_config_success(self):
        """Test that valid configuration passes validation."""
        # Should not raise any exception
        Config.validate_config()

    def test_validate_config_negative_scroll_iterations(self):
        """Test validation fails for negative scroll iterations."""
        with patch.object(Config, 'MAX_SCROLL_ITERATIONS', -1):
            with pytest.raises(ValueError, match="MAX_SCROLL_ITERATIONS must be non-negative"):
                Config.validate_config()

    def test_validate_config_zero_content_size(self):
        """Test validation fails for zero content size."""
        with patch.object(Config, 'MAX_CONTENT_SIZE', 0):
            with pytest.raises(ValueError, match="MAX_CONTENT_SIZE must be positive"):
                Config.validate_config()

    def test_validate_config_zero_title_length(self):
        """Test validation fails for zero title length."""
        with patch.object(Config, 'MAX_TITLE_LENGTH', 0):
            with pytest.raises(ValueError, match="MAX_TITLE_LENGTH must be positive"):
                Config.validate_config()

    def test_validate_config_negative_scroll_delay(self):
        """Test validation fails for negative scroll delay."""
        with patch.object(Config, 'SCROLL_DELAY_SECONDS', -1.0):
            with pytest.raises(ValueError, match="SCROLL_DELAY_SECONDS must be non-negative"):
                Config.validate_config()

    def test_validate_config_zero_timeout(self):
        """Test validation fails for zero timeout."""
        with patch.object(Config, 'PAGE_TIMEOUT_MS', 0):
            with pytest.raises(ValueError, match="PAGE_TIMEOUT_MS must be positive"):
                Config.validate_config()

    def test_validate_config_invalid_url(self):
        """Test validation fails for invalid default URL."""
        with patch.object(Config, 'DEFAULT_URL', 'not-a-url'):
            with pytest.raises(ValueError, match="DEFAULT_URL must be a valid HTTP/HTTPS URL"):
                Config.validate_config()

    def test_validate_config_empty_domains(self):
        """Test validation fails for empty allowed domains."""
        # Stub the accessor so validation sees an empty domain list.
        with patch.object(Config, 'get_allowed_domains', return_value=[]):
            with pytest.raises(ValueError, match="ALLOWED_DOMAINS cannot be empty"):
                Config.validate_config()