Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic (sketched below)
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
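The diff below adds only the test suite; the `src/rss_scraper` modules under test are not part of this commit view. As orientation for the error-handling bullet above, here is a minimal sketch of the exception hierarchy and retry wrapper the tests assume. The exception names come straight from the test imports; the base class name, backoff values, and `_load_page` helper are illustrative, not taken from the actual code:

```python
"""Sketch of src/rss_scraper/exceptions.py plus retry logic (assumed, not shown in this diff)."""

import logging
import time

logger = logging.getLogger(__name__)


class ScraperError(Exception):
    """Base class for all scraper errors (assumed base name)."""


class ValidationError(ScraperError):
    """Raised when a URL, path, or link fails validation."""


class NetworkError(ScraperError):
    """Raised when page loading fails after all retries."""


class ParseError(ScraperError):
    """Raised when article extraction from HTML fails."""


class FileOperationError(ScraperError):
    """Raised when writing the feed or debug HTML fails."""


def load_page_with_retry(url: str, max_retries: int = 3, base_delay: float = 1.0) -> str:
    """Fetch a page, retrying with exponential backoff before giving up."""
    for attempt in range(1, max_retries + 1):
        try:
            return _load_page(url)  # hypothetical single-attempt loader
        except Exception as exc:
            if attempt == max_retries:
                raise NetworkError(f"Failed to load {url} after {max_retries} attempts") from exc
            delay = base_delay * 2 ** (attempt - 1)
            logger.warning("Attempt %d for %s failed (%s); retrying in %.1fs",
                           attempt, url, exc, delay)
            time.sleep(delay)
```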
tests/__init__.py (new file)
@@ -0,0 +1 @@
# Tests package
tests/test_config.py (new file)
@@ -0,0 +1,116 @@
"""Tests for configuration module."""

import pytest
import os
from unittest.mock import patch

from src.rss_scraper.config import Config


class TestConfig:
    """Test configuration functionality."""

    def test_default_values(self):
        """Test that default configuration values are set correctly."""
        assert Config.MAX_SCROLL_ITERATIONS == 5
        assert Config.MAX_CONTENT_SIZE == 10 * 1024 * 1024
        assert Config.MAX_TITLE_LENGTH == 500
        assert Config.SCROLL_DELAY_SECONDS == 2.0
        assert Config.PAGE_TIMEOUT_MS == 120000
        assert Config.DEFAULT_URL == 'https://www.warhammer-community.com/en-gb/'
        assert Config.DEFAULT_OUTPUT_DIR == '.'
        assert Config.RSS_FILENAME == 'warhammer_rss_feed.xml'
        assert Config.DEBUG_HTML_FILENAME == 'page.html'
        assert Config.FEED_TITLE == 'Warhammer Community RSS Feed'
        assert Config.FEED_DESCRIPTION == 'Latest Warhammer Community Articles'

    def test_environment_variable_override(self):
        """Test that environment variables override default values."""
        with patch.dict(os.environ, {
            'MAX_SCROLL_ITERATIONS': '10',
            'MAX_CONTENT_SIZE': '20971520',  # 20MB
            'SCROLL_DELAY_SECONDS': '1.5',
            'DEFAULT_URL': 'https://example.com',
            'RSS_FILENAME': 'custom_feed.xml'
        }):
            # Reload the package's config module so the class attributes
            # are re-read from the patched environment
            import importlib
            from src.rss_scraper import config
            importlib.reload(config)

            assert config.Config.MAX_SCROLL_ITERATIONS == 10
            assert config.Config.MAX_CONTENT_SIZE == 20971520
            assert config.Config.SCROLL_DELAY_SECONDS == 1.5
            assert config.Config.DEFAULT_URL == 'https://example.com'
            assert config.Config.RSS_FILENAME == 'custom_feed.xml'

    def test_get_output_dir_with_override(self):
        """Test get_output_dir method with override."""
        result = Config.get_output_dir('/custom/path')
        assert result == '/custom/path'

    def test_get_output_dir_without_override(self):
        """Test get_output_dir method without override."""
        result = Config.get_output_dir()
        assert result == Config.DEFAULT_OUTPUT_DIR

    def test_get_allowed_domains_default(self):
        """Test get_allowed_domains returns default domains."""
        domains = Config.get_allowed_domains()
        assert 'warhammer-community.com' in domains
        assert 'www.warhammer-community.com' in domains

    def test_get_allowed_domains_from_env(self):
        """Test get_allowed_domains reads from environment variable."""
        with patch.dict(os.environ, {
            'ALLOWED_DOMAINS': 'example.com,test.com,another.com'
        }):
            domains = Config.get_allowed_domains()
            assert domains == ['example.com', 'test.com', 'another.com']

    def test_validate_config_success(self):
        """Test that valid configuration passes validation."""
        # Should not raise any exception
        Config.validate_config()

    def test_validate_config_negative_scroll_iterations(self):
        """Test validation fails for negative scroll iterations."""
        with patch.object(Config, 'MAX_SCROLL_ITERATIONS', -1):
            with pytest.raises(ValueError, match="MAX_SCROLL_ITERATIONS must be non-negative"):
                Config.validate_config()

    def test_validate_config_zero_content_size(self):
        """Test validation fails for zero content size."""
        with patch.object(Config, 'MAX_CONTENT_SIZE', 0):
            with pytest.raises(ValueError, match="MAX_CONTENT_SIZE must be positive"):
                Config.validate_config()

    def test_validate_config_zero_title_length(self):
        """Test validation fails for zero title length."""
        with patch.object(Config, 'MAX_TITLE_LENGTH', 0):
            with pytest.raises(ValueError, match="MAX_TITLE_LENGTH must be positive"):
                Config.validate_config()

    def test_validate_config_negative_scroll_delay(self):
        """Test validation fails for negative scroll delay."""
        with patch.object(Config, 'SCROLL_DELAY_SECONDS', -1.0):
            with pytest.raises(ValueError, match="SCROLL_DELAY_SECONDS must be non-negative"):
                Config.validate_config()

    def test_validate_config_zero_timeout(self):
        """Test validation fails for zero timeout."""
        with patch.object(Config, 'PAGE_TIMEOUT_MS', 0):
            with pytest.raises(ValueError, match="PAGE_TIMEOUT_MS must be positive"):
                Config.validate_config()

    def test_validate_config_invalid_url(self):
        """Test validation fails for invalid default URL."""
        with patch.object(Config, 'DEFAULT_URL', 'not-a-url'):
            with pytest.raises(ValueError, match="DEFAULT_URL must be a valid HTTP/HTTPS URL"):
                Config.validate_config()

    def test_validate_config_empty_domains(self):
        """Test validation fails for empty allowed domains."""
        with patch.object(Config, 'get_allowed_domains', return_value=[]):
            with pytest.raises(ValueError, match="ALLOWED_DOMAINS cannot be empty"):
                Config.validate_config()
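For context, the `Config` interface these tests exercise would look roughly like the sketch below. The attribute names, defaults, and error messages are taken directly from the assertions above; the read-environment-at-import-time pattern is an assumption, though it is what makes the `importlib.reload` trick in `test_environment_variable_override` work:

```python
"""Sketch of src/rss_scraper/config.py consistent with the tests above (assumed)."""

import os
from urllib.parse import urlparse


class Config:
    # Class attributes are read from the environment once, at import time,
    # which is why the env-override test reloads the module.
    MAX_SCROLL_ITERATIONS = int(os.environ.get('MAX_SCROLL_ITERATIONS', 5))
    MAX_CONTENT_SIZE = int(os.environ.get('MAX_CONTENT_SIZE', 10 * 1024 * 1024))
    MAX_TITLE_LENGTH = int(os.environ.get('MAX_TITLE_LENGTH', 500))
    SCROLL_DELAY_SECONDS = float(os.environ.get('SCROLL_DELAY_SECONDS', 2.0))
    PAGE_TIMEOUT_MS = int(os.environ.get('PAGE_TIMEOUT_MS', 120000))
    DEFAULT_URL = os.environ.get('DEFAULT_URL', 'https://www.warhammer-community.com/en-gb/')
    DEFAULT_OUTPUT_DIR = os.environ.get('DEFAULT_OUTPUT_DIR', '.')
    RSS_FILENAME = os.environ.get('RSS_FILENAME', 'warhammer_rss_feed.xml')
    DEBUG_HTML_FILENAME = os.environ.get('DEBUG_HTML_FILENAME', 'page.html')
    FEED_TITLE = 'Warhammer Community RSS Feed'
    FEED_DESCRIPTION = 'Latest Warhammer Community Articles'

    @classmethod
    def get_output_dir(cls, override=None):
        return override if override is not None else cls.DEFAULT_OUTPUT_DIR

    @classmethod
    def get_allowed_domains(cls):
        env_value = os.environ.get('ALLOWED_DOMAINS')
        if env_value:
            return [d.strip() for d in env_value.split(',')]
        return ['warhammer-community.com', 'www.warhammer-community.com']

    @classmethod
    def validate_config(cls):
        if cls.MAX_SCROLL_ITERATIONS < 0:
            raise ValueError("MAX_SCROLL_ITERATIONS must be non-negative")
        if cls.MAX_CONTENT_SIZE <= 0:
            raise ValueError("MAX_CONTENT_SIZE must be positive")
        if cls.MAX_TITLE_LENGTH <= 0:
            raise ValueError("MAX_TITLE_LENGTH must be positive")
        if cls.SCROLL_DELAY_SECONDS < 0:
            raise ValueError("SCROLL_DELAY_SECONDS must be non-negative")
        if cls.PAGE_TIMEOUT_MS <= 0:
            raise ValueError("PAGE_TIMEOUT_MS must be positive")
        parsed = urlparse(cls.DEFAULT_URL)
        if parsed.scheme not in ('http', 'https') or not parsed.netloc:
            raise ValueError("DEFAULT_URL must be a valid HTTP/HTTPS URL")
        if not cls.get_allowed_domains():
            raise ValueError("ALLOWED_DOMAINS cannot be empty")
```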
tests/test_main.py (new file)
@@ -0,0 +1,202 @@
"""Tests for main module functionality."""

import os
import pytest
import sys
import tempfile
from unittest.mock import patch, MagicMock
from argparse import Namespace

from main import parse_arguments, setup_logging, scrape_and_generate_rss
from src.rss_scraper.exceptions import ValidationError, NetworkError, ParseError


class TestParseArguments:
    """Test command line argument parsing."""

    def test_parse_arguments_defaults(self):
        """Test parsing with default arguments."""
        with patch('sys.argv', ['main.py']):
            args = parse_arguments()

        assert args.url == 'https://www.warhammer-community.com/en-gb/'
        assert args.output_dir is None
        assert args.max_scroll == 5
        assert args.log_level == 'INFO'
        assert args.log_file == 'scraper.log'

    def test_parse_arguments_custom_values(self):
        """Test parsing with custom argument values."""
        test_args = [
            'main.py',
            '--url', 'https://example.com',
            '--output-dir', '/custom/path',
            '--max-scroll', '10',
            '--log-level', 'DEBUG',
            '--log-file', 'custom.log'
        ]

        with patch('sys.argv', test_args):
            args = parse_arguments()

        assert args.url == 'https://example.com'
        assert args.output_dir == '/custom/path'
        assert args.max_scroll == 10
        assert args.log_level == 'DEBUG'
        assert args.log_file == 'custom.log'

    def test_parse_arguments_invalid_max_scroll(self):
        """Test parsing fails with invalid max_scroll value."""
        test_args = ['main.py', '--max-scroll', '-1']

        with patch('sys.argv', test_args):
            with pytest.raises(SystemExit):
                parse_arguments()

    def test_parse_arguments_relative_output_dir(self):
        """Test that relative output directory is converted to absolute."""
        test_args = ['main.py', '--output-dir', 'relative/path']

        with patch('sys.argv', test_args):
            args = parse_arguments()

        # os.path.isabs is portable, unlike checking for a leading '/'
        assert os.path.isabs(args.output_dir)
        assert args.output_dir.endswith('relative/path')


class TestSetupLogging:
    """Test logging setup functionality."""

    def test_setup_logging_info_level(self):
        """Test logging setup with INFO level."""
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            setup_logging('INFO', temp_file.name)

            import logging
            logger = logging.getLogger('test')
            logger.info("Test message")
            logger.debug("Debug message")  # Should not appear

            # Check that the log file was created and has the correct level
            assert logging.getLogger().level == logging.INFO

    def test_setup_logging_debug_level(self):
        """Test logging setup with DEBUG level."""
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            setup_logging('DEBUG', temp_file.name)

            import logging
            assert logging.getLogger().level == logging.DEBUG

    def test_setup_logging_clears_existing_handlers(self):
        """Test that setup_logging clears existing handlers."""
        import logging

        # Add a dummy handler
        dummy_handler = logging.StreamHandler()
        logging.getLogger().addHandler(dummy_handler)

        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            setup_logging('INFO', temp_file.name)

        # Should have exactly 2 handlers (console + file)
        assert len(logging.getLogger().handlers) == 2


class TestScrapeAndGenerateRss:
    """Test main scraping function."""

    @patch('main.save_debug_html')
    @patch('main.save_rss_feed')
    @patch('main.generate_rss_feed')
    @patch('main.extract_articles_from_html')
    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_success(
        self, mock_validate_url, mock_load_page, mock_extract_articles,
        mock_generate_rss, mock_save_rss, mock_save_html
    ):
        """Test successful RSS scraping and generation."""
        # Setup mocks
        mock_validate_url.return_value = True
        mock_load_page.return_value = "<html>test</html>"
        mock_extract_articles.return_value = [
            {'title': 'Test', 'link': 'http://example.com', 'date': 'date'}
        ]
        mock_generate_rss.return_value = b"<rss>feed</rss>"
        mock_save_rss.return_value = "/path/to/feed.xml"

        url = "https://www.warhammer-community.com/en-gb/"
        output_dir = "/test/output"

        # Should not raise any exception
        scrape_and_generate_rss(url, output_dir)

        # Verify all functions were called
        mock_validate_url.assert_called_once_with(url)
        mock_load_page.assert_called_once_with(url)
        mock_extract_articles.assert_called_once_with("<html>test</html>", url)
        mock_generate_rss.assert_called_once()
        mock_save_rss.assert_called_once()
        mock_save_html.assert_called_once()

    @patch('main.validate_url')
    def test_scrape_and_generate_rss_validation_error(self, mock_validate_url):
        """Test scraping fails with validation error."""
        mock_validate_url.side_effect = ValidationError("Invalid URL")

        with pytest.raises(ValidationError):
            scrape_and_generate_rss("invalid-url")

    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_network_error(
        self, mock_validate_url, mock_load_page
    ):
        """Test scraping fails with network error."""
        mock_validate_url.return_value = True
        mock_load_page.side_effect = NetworkError("Network error")

        with pytest.raises(NetworkError):
            scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")

    @patch('main.extract_articles_from_html')
    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_parse_error(
        self, mock_validate_url, mock_load_page, mock_extract_articles
    ):
        """Test scraping fails with parse error."""
        mock_validate_url.return_value = True
        mock_load_page.return_value = "<html>test</html>"
        mock_extract_articles.side_effect = ParseError("Parse error")

        with pytest.raises(ParseError):
            scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")

    @patch('main.save_debug_html')
    @patch('main.save_rss_feed')
    @patch('main.generate_rss_feed')
    @patch('main.extract_articles_from_html')
    @patch('main.load_page_with_retry')
    @patch('main.validate_url')
    def test_scrape_and_generate_rss_default_output_dir(
        self, mock_validate_url, mock_load_page, mock_extract_articles,
        mock_generate_rss, mock_save_rss, mock_save_html
    ):
        """Test scraping uses default output directory when none provided."""
        # Setup mocks
        mock_validate_url.return_value = True
        mock_load_page.return_value = "<html>test</html>"
        mock_extract_articles.return_value = []
        mock_generate_rss.return_value = b"<rss>feed</rss>"
        mock_save_rss.return_value = "/path/to/feed.xml"

        url = "https://www.warhammer-community.com/en-gb/"

        # Call without output_dir
        scrape_and_generate_rss(url)

        # Verify functions were called (output_dir would be set to default)
        mock_validate_url.assert_called_once_with(url)
        mock_save_rss.assert_called_once_with(b"<rss>feed</rss>", ".")  # Default output dir
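The call sequence asserted in `TestScrapeAndGenerateRss` implies an orchestration function shaped roughly like the following. The function names and order are grounded in the mocks above; the module paths (in particular `src.rss_scraper.scraper`) and the exact argument plumbing are assumptions:

```python
"""Sketch of main.py's scrape_and_generate_rss, consistent with the mocks above (assumed)."""

from typing import Optional

from src.rss_scraper.config import Config
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
from src.rss_scraper.scraper import load_page_with_retry  # module name assumed
from src.rss_scraper.validation import validate_url


def scrape_and_generate_rss(url: str, output_dir: Optional[str] = None) -> None:
    """Validate, scrape, and persist: the sequence the success test asserts."""
    if output_dir is None:
        output_dir = Config.get_output_dir()  # '.', per the default-output-dir test

    validate_url(url)                                  # ValidationError on bad input
    html = load_page_with_retry(url)                   # NetworkError after retries
    articles = extract_articles_from_html(html, url)   # ParseError on parser failure
    rss_bytes = generate_rss_feed(articles, url)

    save_rss_feed(rss_bytes, output_dir)               # FileOperationError on I/O failure
    save_debug_html(html, output_dir)                  # best-effort debug artifact
```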
tests/test_parser.py (new file)
@@ -0,0 +1,208 @@
"""Tests for parser module."""

import pytest
from datetime import datetime
import pytz
from unittest.mock import patch

from src.rss_scraper.parser import sanitize_text, extract_articles_from_html
from src.rss_scraper.exceptions import ParseError
from src.rss_scraper.config import Config


class TestSanitizeText:
    """Test text sanitization functionality."""

    def test_sanitize_normal_text(self):
        """Test sanitization of normal text."""
        text = "Normal article title"
        result = sanitize_text(text)
        assert result == "Normal article title"

    def test_sanitize_none_text(self):
        """Test sanitization of None text."""
        result = sanitize_text(None)
        assert result == "No title"

    def test_sanitize_empty_text(self):
        """Test sanitization of empty text."""
        result = sanitize_text("")
        assert result == "No title"

    def test_sanitize_whitespace_text(self):
        """Test sanitization of whitespace-only text."""
        result = sanitize_text(" ")
        assert result == "No title"

    def test_remove_dangerous_patterns(self):
        """Test removal of dangerous patterns."""
        dangerous_text = "Title with <script>alert('xss')</script> content"
        result = sanitize_text(dangerous_text)
        assert "<script" not in result
        assert "</script" not in result
        assert "alert('xss')" in result  # Only script tags should be removed

    def test_length_limit(self):
        """Test that text is limited to max length."""
        long_text = "a" * 1000
        result = sanitize_text(long_text)
        assert len(result) <= Config.MAX_TITLE_LENGTH

    def test_case_insensitive_pattern_removal(self):
        """Test that dangerous patterns are removed case-insensitively."""
        text = "Title with <SCRIPT>alert('xss')</SCRIPT> and javascript: protocol"
        result = sanitize_text(text)
        assert "<SCRIPT" not in result
        assert "</SCRIPT" not in result
        assert "javascript:" not in result


class TestExtractArticlesFromHtml:
    """Test article extraction from HTML."""

    def test_extract_articles_valid_html(self):
        """Test extraction from valid HTML with articles."""
        html = """
        <html>
        <body>
            <article>
                <h3 class="newsCard-title-sm">Test Article 1</h3>
                <a href="/article/test-1">Read more</a>
                <time>01 Jan 24</time>
            </article>
            <article>
                <h3 class="newsCard-title-lg">Test Article 2</h3>
                <a href="https://www.warhammer-community.com/article/test-2">Read more</a>
                <time>02 Jan 24</time>
            </article>
        </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 2
        assert articles[0]['title'] == "Test Article 2"  # Sorted by date, newest first
        assert articles[1]['title'] == "Test Article 1"
        assert "warhammer-community.com" in articles[0]['link']
        assert "warhammer-community.com" in articles[1]['link']

    def test_extract_articles_no_articles(self):
        """Test extraction from HTML with no articles."""
        html = """
        <html>
        <body>
            <div>No articles here</div>
        </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 0

    def test_extract_articles_duplicate_links(self):
        """Test that duplicate links are filtered out."""
        html = """
        <html>
        <body>
            <article>
                <h3 class="newsCard-title-sm">Test Article 1</h3>
                <a href="/article/test-1">Read more</a>
                <time>01 Jan 24</time>
            </article>
            <article>
                <h3 class="newsCard-title-lg">Test Article 1 Duplicate</h3>
                <a href="/article/test-1">Read more</a>
                <time>02 Jan 24</time>
            </article>
        </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Duplicate should be filtered out
        assert articles[0]['title'] == "Test Article 1"

    def test_extract_articles_invalid_links(self):
        """Test handling of articles with invalid links."""
        html = """
        <html>
        <body>
            <article>
                <h3 class="newsCard-title-sm">Valid Article</h3>
                <a href="/article/valid">Read more</a>
                <time>01 Jan 24</time>
            </article>
            <article>
                <h3 class="newsCard-title-lg">Invalid Article</h3>
                <a href="https://malicious-site.com/article">Read more</a>
                <time>02 Jan 24</time>
            </article>
            <article>
                <h3 class="newsCard-title-sm">No Link Article</h3>
                <time>03 Jan 24</time>
            </article>
        </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Only the valid article should be included
        assert articles[0]['title'] == "Valid Article"

    def test_extract_articles_date_parsing(self):
        """Test parsing of various date formats."""
        html = """
        <html>
        <body>
            <article>
                <h3 class="newsCard-title-sm">Article with good date</h3>
                <a href="/article/1">Read more</a>
                <time>15 Mar 24</time>
            </article>
            <article>
                <h3 class="newsCard-title-lg">Article with bad date</h3>
                <a href="/article/2">Read more</a>
                <time>Invalid Date Format</time>
            </article>
            <article>
                <h3 class="newsCard-title-sm">Article with reading time</h3>
                <a href="/article/3">Read more</a>
                <time>5 min read</time>
                <time>20 Mar 24</time>
            </article>
        </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 3

        # Check that dates are parsed correctly
        for article in articles:
            assert isinstance(article['date'], datetime)
            assert article['date'].tzinfo is not None

    def test_extract_articles_malformed_html(self):
        """Test handling of malformed HTML."""
        malformed_html = "<html><body><article><h3>Unclosed tags"

        base_url = "https://www.warhammer-community.com"
        # Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully
        articles = extract_articles_from_html(malformed_html, base_url)
        assert isinstance(articles, list)

    def test_extract_articles_invalid_html(self):
        """Test handling of completely invalid HTML."""
        # Patch the name the parser module actually uses (assuming it does
        # `from bs4 import BeautifulSoup`); patching 'bs4.BeautifulSoup'
        # would leave the parser's own binding untouched.
        with patch('src.rss_scraper.parser.BeautifulSoup',
                   side_effect=Exception("Parser error")):
            with pytest.raises(ParseError):
                extract_articles_from_html("<html></html>", "https://example.com")
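These tests pin down the observable contract of `sanitize_text`: None, empty, or whitespace-only input becomes "No title"; script tags and the `javascript:` scheme are stripped case-insensitively while the inner text survives; and output is capped at `Config.MAX_TITLE_LENGTH`. A minimal sketch matching that contract, using the pre-compiled regex patterns the commit message mentions (the exact pattern list is an assumption):

```python
"""Sketch of src/rss_scraper/parser.sanitize_text, consistent with the tests above (assumed)."""

import re

from src.rss_scraper.config import Config

# Compiled once at import time; the commit message calls out pre-compiled
# regexes as a performance measure. The pattern list itself is illustrative.
_DANGEROUS_PATTERNS = [
    re.compile(r'</?script[^>]*>', re.IGNORECASE),  # tags only; inner text survives
    re.compile(r'javascript:', re.IGNORECASE),
]


def sanitize_text(text):
    """Strip dangerous markup from a title and cap its length."""
    if text is None or not text.strip():
        return "No title"
    for pattern in _DANGEROUS_PATTERNS:
        text = pattern.sub('', text)
    return text.strip()[:Config.MAX_TITLE_LENGTH]
```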
tests/test_rss_generator.py (new file)
@@ -0,0 +1,162 @@
"""Tests for RSS generator module."""

import pytest
import os
import tempfile
from datetime import datetime
import pytz
from unittest.mock import patch, mock_open

from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
from src.rss_scraper.exceptions import FileOperationError


class TestGenerateRssFeed:
    """Test RSS feed generation functionality."""

    def test_generate_rss_feed_with_articles(self):
        """Test RSS generation with valid articles."""
        timezone = pytz.UTC
        articles = [
            {
                'title': 'Test Article 1',
                'link': 'https://example.com/article1',
                'date': datetime(2024, 1, 1, tzinfo=timezone)
            },
            {
                'title': 'Test Article 2',
                'link': 'https://example.com/article2',
                'date': datetime(2024, 1, 2, tzinfo=timezone)
            }
        ]

        feed_url = "https://example.com"
        rss_content = generate_rss_feed(articles, feed_url)

        assert isinstance(rss_content, bytes)
        rss_str = rss_content.decode('utf-8')
        assert 'Test Article 1' in rss_str
        assert 'Test Article 2' in rss_str
        assert 'https://example.com/article1' in rss_str
        assert 'https://example.com/article2' in rss_str
        assert '<?xml version=' in rss_str
        assert '<rss version=' in rss_str

    def test_generate_rss_feed_empty_articles(self):
        """Test RSS generation with empty articles list."""
        articles = []
        feed_url = "https://example.com"

        rss_content = generate_rss_feed(articles, feed_url)

        assert isinstance(rss_content, bytes)
        rss_str = rss_content.decode('utf-8')
        assert '<?xml version=' in rss_str
        assert '<rss version=' in rss_str
        # Should still contain feed metadata
        assert 'Warhammer Community RSS Feed' in rss_str

    def test_generate_rss_feed_unicode_content(self):
        """Test RSS generation with unicode content."""
        timezone = pytz.UTC
        articles = [
            {
                'title': 'Tëst Artìclé with Ūnïcödë',
                'link': 'https://example.com/unicode',
                'date': datetime(2024, 1, 1, tzinfo=timezone)
            }
        ]

        feed_url = "https://example.com"
        rss_content = generate_rss_feed(articles, feed_url)

        assert isinstance(rss_content, bytes)
        rss_str = rss_content.decode('utf-8')
        assert 'Tëst Artìclé with Ūnïcödë' in rss_str


class TestSaveRssFeed:
    """Test RSS feed saving functionality."""

    def test_save_rss_feed_success(self):
        """Test successful RSS feed saving."""
        rss_content = b'<?xml version="1.0"?><rss>test</rss>'

        with tempfile.TemporaryDirectory() as temp_dir:
            result_path = save_rss_feed(rss_content, temp_dir)

            assert os.path.exists(result_path)
            assert result_path.endswith('warhammer_rss_feed.xml')

            with open(result_path, 'rb') as f:
                saved_content = f.read()
            assert saved_content == rss_content

    def test_save_rss_feed_permission_error(self):
        """Test RSS feed saving with permission error."""
        rss_content = b'<?xml version="1.0"?><rss>test</rss>'

        with patch('builtins.open', side_effect=PermissionError("Permission denied")):
            with pytest.raises(FileOperationError):
                save_rss_feed(rss_content, "/some/path")

    def test_save_rss_feed_creates_directory(self):
        """Test that RSS feed saving creates the directory if needed."""
        rss_content = b'<?xml version="1.0"?><rss>test</rss>'

        with tempfile.TemporaryDirectory() as temp_dir:
            new_subdir = os.path.join(temp_dir, "new_subdir")
            result_path = save_rss_feed(rss_content, new_subdir)

            assert os.path.exists(new_subdir)
            assert os.path.exists(result_path)


class TestSaveDebugHtml:
    """Test debug HTML saving functionality."""

    def test_save_debug_html_success(self):
        """Test successful debug HTML saving."""
        html_content = "<html><body>Test content</body></html>"

        with tempfile.TemporaryDirectory() as temp_dir:
            save_debug_html(html_content, temp_dir)

            html_path = os.path.join(temp_dir, "page.html")
            assert os.path.exists(html_path)

            with open(html_path, 'r', encoding='utf-8') as f:
                saved_content = f.read()
            # BeautifulSoup prettifies the content
            assert "Test content" in saved_content

    def test_save_debug_html_permission_error(self):
        """Test debug HTML saving with permission error (should not raise)."""
        html_content = "<html><body>Test content</body></html>"

        with patch('builtins.open', side_effect=PermissionError("Permission denied")):
            # Should not raise an exception, just log a warning
            save_debug_html(html_content, "/some/path")

    def test_save_debug_html_malformed_content(self):
        """Test debug HTML saving with malformed HTML content."""
        malformed_html = "<html><body>Unclosed tags"

        with tempfile.TemporaryDirectory() as temp_dir:
            # Should handle malformed HTML gracefully
            save_debug_html(malformed_html, temp_dir)

            html_path = os.path.join(temp_dir, "page.html")
            assert os.path.exists(html_path)

    def test_save_debug_html_creates_directory(self):
        """Test that debug HTML saving creates the directory if needed."""
        html_content = "<html><body>Test content</body></html>"

        with tempfile.TemporaryDirectory() as temp_dir:
            new_subdir = os.path.join(temp_dir, "new_subdir")
            save_debug_html(html_content, new_subdir)

            assert os.path.exists(new_subdir)
            html_path = os.path.join(new_subdir, "page.html")
            assert os.path.exists(html_path)
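The tests above only require that `generate_rss_feed` return UTF-8 XML bytes containing the feed metadata plus each article's title and link. One plausible implementation uses the `feedgen` library; this is an assumption, since the diff does not show which RSS library the project actually uses:

```python
"""Sketch of src/rss_scraper/rss_generator.generate_rss_feed using feedgen (assumed)."""

from feedgen.feed import FeedGenerator

from src.rss_scraper.config import Config


def generate_rss_feed(articles, feed_url: str) -> bytes:
    """Build an RSS 2.0 document from parsed articles and return it as bytes."""
    fg = FeedGenerator()
    fg.title(Config.FEED_TITLE)
    fg.link(href=feed_url, rel='alternate')
    fg.description(Config.FEED_DESCRIPTION)

    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])  # feedgen requires a timezone-aware datetime,
                                     # which the parser tests guarantee

    return fg.rss_str(pretty=True)  # bytes, matching the isinstance(..., bytes) checks
```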
tests/test_validation.py (new file)
@@ -0,0 +1,170 @@
"""Tests for validation module."""

import pytest
import os
import tempfile
from unittest.mock import patch

from src.rss_scraper.validation import validate_url, validate_output_path, validate_link
from src.rss_scraper.exceptions import ValidationError, FileOperationError
from src.rss_scraper.config import Config


class TestValidateUrl:
    """Test URL validation functionality."""

    def test_valid_url(self):
        """Test validation of valid URLs."""
        valid_urls = [
            "https://www.warhammer-community.com/en-gb/",
            "https://warhammer-community.com/some/path",
        ]

        for url in valid_urls:
            assert validate_url(url) is True

    def test_invalid_url_format(self):
        """Test validation fails for invalid URL formats."""
        invalid_urls = [
            "not-a-url",
            "ftp://example.com",
            "",
            "http://",
            "https://",
        ]

        for url in invalid_urls:
            with pytest.raises(ValidationError):
                validate_url(url)

    def test_disallowed_domain(self):
        """Test validation fails for disallowed domains."""
        disallowed_urls = [
            "https://malicious-site.com",
            "https://example.com",
            "https://google.com",
        ]

        for url in disallowed_urls:
            with pytest.raises(ValidationError):
                validate_url(url)

    def test_case_insensitive_domain(self):
        """Test domain validation is case insensitive."""
        urls = [
            "https://WWW.WARHAMMER-COMMUNITY.COM",
            "https://Warhammer-Community.com",
        ]

        for url in urls:
            assert validate_url(url) is True


class TestValidateOutputPath:
    """Test output path validation functionality."""

    def test_valid_path_within_base(self):
        """Test validation of valid paths within base directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            test_path = os.path.join(temp_dir, "output.xml")
            result = validate_output_path(test_path, temp_dir)
            assert result == os.path.abspath(test_path)

    def test_path_outside_base_directory(self):
        """Test validation fails for paths outside base directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            outside_path = "/tmp/malicious.xml"
            with pytest.raises(ValidationError):
                validate_output_path(outside_path, temp_dir)

    def test_absolute_path_within_base_directory(self):
        """Test that absolute paths within base directory are allowed."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # This should work - absolute path within the base directory
            abs_path = os.path.join(temp_dir, "output.xml")
            result = validate_output_path(abs_path, temp_dir)
            assert result == os.path.abspath(abs_path)

    def test_creates_directory_if_not_exists(self):
        """Test that validation creates the directory if it doesn't exist."""
        with tempfile.TemporaryDirectory() as temp_dir:
            new_subdir = os.path.join(temp_dir, "new_subdir")
            test_path = os.path.join(new_subdir, "output.xml")

            result = validate_output_path(test_path, new_subdir)

            assert os.path.exists(new_subdir)
            assert result == os.path.abspath(test_path)

    def test_directory_traversal_protection(self):
        """Test that directory traversal attacks are blocked."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # These should be blocked, either by the directory traversal
            # check or by the outside-base check
            traversal_paths = [
                "../../../etc/passwd",
                "subdir/../../../etc/passwd",
                "normal/../../../dangerous.xml"
            ]

            for path in traversal_paths:
                # Either error type is acceptable, so accept both
                with pytest.raises((ValidationError, FileOperationError)):
                    validate_output_path(path, temp_dir)

    def test_permission_error(self):
        """Test handling of permission errors."""
        with patch('os.makedirs', side_effect=PermissionError("Permission denied")):
            with pytest.raises(FileOperationError):
                validate_output_path("/some/path/file.xml", "/some/path")


class TestValidateLink:
    """Test link validation functionality."""

    def test_valid_absolute_link(self):
        """Test validation of valid absolute links."""
        base_url = "https://www.warhammer-community.com"
        valid_link = "https://www.warhammer-community.com/article"

        result = validate_link(valid_link, base_url)
        assert result == valid_link

    def test_valid_relative_link(self):
        """Test validation of valid relative links."""
        base_url = "https://www.warhammer-community.com/en-gb/"
        relative_link = "/article/some-article"

        result = validate_link(relative_link, base_url)
        assert result == "https://www.warhammer-community.com/article/some-article"

    def test_none_link(self):
        """Test handling of None link."""
        base_url = "https://www.warhammer-community.com"
        result = validate_link(None, base_url)
        assert result is None

    def test_empty_link(self):
        """Test handling of empty link."""
        base_url = "https://www.warhammer-community.com"
        result = validate_link("", base_url)
        assert result is None

    def test_invalid_domain_link(self):
        """Test rejection of links from invalid domains."""
        base_url = "https://www.warhammer-community.com"
        invalid_link = "https://malicious-site.com/article"

        result = validate_link(invalid_link, base_url)
        assert result is None

    def test_malformed_link(self):
        """Test handling of malformed links."""
        base_url = "https://www.warhammer-community.com"
        malformed_links = [
            "not-a-url",
            "://missing-scheme",
            "https://",
        ]

        for link in malformed_links:
            result = validate_link(link, base_url)
            assert result is None
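Finally, the `validate_link` contract pinned down above (root-relative paths resolved against the base URL, absolute links kept only when their domain is allowed, everything else mapped to None) could be satisfied by a sketch like this; it is assumed, not the actual implementation:

```python
"""Sketch of src/rss_scraper/validation.validate_link, consistent with the tests above (assumed)."""

from urllib.parse import urljoin, urlparse

from src.rss_scraper.config import Config


def validate_link(link, base_url):
    """Resolve a link against base_url and return it only if its domain is allowed."""
    if not link:
        return None

    # Site-relative paths ('/article/...') are resolved against the base URL;
    # anything else must already be an absolute http(s) URL. Joining arbitrary
    # strings would turn junk like 'not-a-url' into a valid same-domain URL.
    if link.startswith('/'):
        link = urljoin(base_url, link)

    parsed = urlparse(link)
    if parsed.scheme not in ('http', 'https') or not parsed.netloc:
        return None  # rejects 'not-a-url', '://missing-scheme', 'https://'
    if parsed.netloc.lower() not in Config.get_allowed_domains():
        return None  # rejects links to other domains, e.g. malicious-site.com
    return link
```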