Add comprehensive RSS scraper implementation with security and testing

- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency (see the sketch after this list)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community
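
For the conditional-request caching mentioned above, here is a minimal sketch of the idea, assuming the requests library; the function and cache names are illustrative, not the committed scraper module:

```python
# Hypothetical illustration of conditional GET caching; names are not from the commit.
import requests

_cache = {}  # url -> {'etag': ..., 'last_modified': ..., 'body': ...}


def fetch_with_http_cache(url):
    """Fetch url, revalidating a cached copy via ETag/Last-Modified."""
    headers = {}
    cached = _cache.get(url)
    if cached:
        if cached.get('etag'):
            headers['If-None-Match'] = cached['etag']
        if cached.get('last_modified'):
            headers['If-Modified-Since'] = cached['last_modified']

    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 304 and cached:
        return cached['body']  # Not modified: reuse the cached body

    response.raise_for_status()
    _cache[url] = {
        'etag': response.headers.get('ETag'),
        'last_modified': response.headers.get('Last-Modified'),
        'body': response.text,
    }
    return response.text
```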

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Commit 25086fc01b (parent e0647325ff), 2025-06-06 09:15:06 -06:00
26 changed files with 15226 additions and 280 deletions

tests/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# Tests package

tests/test_config.py (new file, 116 lines)

@@ -0,0 +1,116 @@
"""Tests for configuration module."""
import pytest
import os
from unittest.mock import patch
from src.rss_scraper.config import Config
class TestConfig:
"""Test configuration functionality."""
def test_default_values(self):
"""Test that default configuration values are set correctly."""
assert Config.MAX_SCROLL_ITERATIONS == 5
assert Config.MAX_CONTENT_SIZE == 10 * 1024 * 1024
assert Config.MAX_TITLE_LENGTH == 500
assert Config.SCROLL_DELAY_SECONDS == 2.0
assert Config.PAGE_TIMEOUT_MS == 120000
assert Config.DEFAULT_URL == 'https://www.warhammer-community.com/en-gb/'
assert Config.DEFAULT_OUTPUT_DIR == '.'
assert Config.RSS_FILENAME == 'warhammer_rss_feed.xml'
assert Config.DEBUG_HTML_FILENAME == 'page.html'
assert Config.FEED_TITLE == 'Warhammer Community RSS Feed'
assert Config.FEED_DESCRIPTION == 'Latest Warhammer Community Articles'
def test_environment_variable_override(self):
"""Test that environment variables override default values."""
with patch.dict(os.environ, {
'MAX_SCROLL_ITERATIONS': '10',
'MAX_CONTENT_SIZE': '20971520', # 20MB
'SCROLL_DELAY_SECONDS': '1.5',
'DEFAULT_URL': 'https://example.com',
'RSS_FILENAME': 'custom_feed.xml'
}):
# Need to reload the config to pick up environment changes
import importlib
import config
importlib.reload(config)
assert config.Config.MAX_SCROLL_ITERATIONS == 10
assert config.Config.MAX_CONTENT_SIZE == 20971520
assert config.Config.SCROLL_DELAY_SECONDS == 1.5
assert config.Config.DEFAULT_URL == 'https://example.com'
assert config.Config.RSS_FILENAME == 'custom_feed.xml'
def test_get_output_dir_with_override(self):
"""Test get_output_dir method with override."""
result = Config.get_output_dir('/custom/path')
assert result == '/custom/path'
def test_get_output_dir_without_override(self):
"""Test get_output_dir method without override."""
result = Config.get_output_dir()
assert result == Config.DEFAULT_OUTPUT_DIR
def test_get_allowed_domains_default(self):
"""Test get_allowed_domains returns default domains."""
domains = Config.get_allowed_domains()
assert 'warhammer-community.com' in domains
assert 'www.warhammer-community.com' in domains
def test_get_allowed_domains_from_env(self):
"""Test get_allowed_domains reads from environment variable."""
with patch.dict(os.environ, {
'ALLOWED_DOMAINS': 'example.com,test.com,another.com'
}):
domains = Config.get_allowed_domains()
assert domains == ['example.com', 'test.com', 'another.com']
def test_validate_config_success(self):
"""Test that valid configuration passes validation."""
# Should not raise any exception
Config.validate_config()
def test_validate_config_negative_scroll_iterations(self):
"""Test validation fails for negative scroll iterations."""
with patch.object(Config, 'MAX_SCROLL_ITERATIONS', -1):
with pytest.raises(ValueError, match="MAX_SCROLL_ITERATIONS must be non-negative"):
Config.validate_config()
def test_validate_config_zero_content_size(self):
"""Test validation fails for zero content size."""
with patch.object(Config, 'MAX_CONTENT_SIZE', 0):
with pytest.raises(ValueError, match="MAX_CONTENT_SIZE must be positive"):
Config.validate_config()
def test_validate_config_zero_title_length(self):
"""Test validation fails for zero title length."""
with patch.object(Config, 'MAX_TITLE_LENGTH', 0):
with pytest.raises(ValueError, match="MAX_TITLE_LENGTH must be positive"):
Config.validate_config()
def test_validate_config_negative_scroll_delay(self):
"""Test validation fails for negative scroll delay."""
with patch.object(Config, 'SCROLL_DELAY_SECONDS', -1.0):
with pytest.raises(ValueError, match="SCROLL_DELAY_SECONDS must be non-negative"):
Config.validate_config()
def test_validate_config_zero_timeout(self):
"""Test validation fails for zero timeout."""
with patch.object(Config, 'PAGE_TIMEOUT_MS', 0):
with pytest.raises(ValueError, match="PAGE_TIMEOUT_MS must be positive"):
Config.validate_config()
def test_validate_config_invalid_url(self):
"""Test validation fails for invalid default URL."""
with patch.object(Config, 'DEFAULT_URL', 'not-a-url'):
with pytest.raises(ValueError, match="DEFAULT_URL must be a valid HTTP/HTTPS URL"):
Config.validate_config()
def test_validate_config_empty_domains(self):
"""Test validation fails for empty allowed domains."""
with patch.object(Config, 'get_allowed_domains', return_value=[]):
with pytest.raises(ValueError, match="ALLOWED_DOMAINS cannot be empty"):
Config.validate_config()
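
For context, a minimal sketch of the Config interface these tests exercise, reconstructed from the assertions above. The real module lives at src/rss_scraper/config.py; the attribute names and error messages come from the tests, but the bodies here are assumptions:

```python
# Hypothetical reconstruction from the tests above, not the committed module.
import os
from urllib.parse import urlparse


class Config:
    # Class attributes are read from the environment at import time, which is
    # why the override test reloads the module after patching os.environ.
    MAX_SCROLL_ITERATIONS = int(os.environ.get('MAX_SCROLL_ITERATIONS', 5))
    MAX_CONTENT_SIZE = int(os.environ.get('MAX_CONTENT_SIZE', 10 * 1024 * 1024))
    MAX_TITLE_LENGTH = int(os.environ.get('MAX_TITLE_LENGTH', 500))
    SCROLL_DELAY_SECONDS = float(os.environ.get('SCROLL_DELAY_SECONDS', 2.0))
    PAGE_TIMEOUT_MS = int(os.environ.get('PAGE_TIMEOUT_MS', 120000))
    DEFAULT_URL = os.environ.get('DEFAULT_URL', 'https://www.warhammer-community.com/en-gb/')
    DEFAULT_OUTPUT_DIR = os.environ.get('DEFAULT_OUTPUT_DIR', '.')
    RSS_FILENAME = os.environ.get('RSS_FILENAME', 'warhammer_rss_feed.xml')
    DEBUG_HTML_FILENAME = os.environ.get('DEBUG_HTML_FILENAME', 'page.html')
    FEED_TITLE = os.environ.get('FEED_TITLE', 'Warhammer Community RSS Feed')
    FEED_DESCRIPTION = os.environ.get('FEED_DESCRIPTION', 'Latest Warhammer Community Articles')

    @classmethod
    def get_output_dir(cls, override=None):
        # An explicit override wins; otherwise fall back to the default
        return override if override is not None else cls.DEFAULT_OUTPUT_DIR

    @classmethod
    def get_allowed_domains(cls):
        # Comma-separated allow-list, defaulting to the Warhammer Community hosts
        env = os.environ.get('ALLOWED_DOMAINS')
        if env:
            return [d.strip() for d in env.split(',')]
        return ['warhammer-community.com', 'www.warhammer-community.com']

    @classmethod
    def validate_config(cls):
        if cls.MAX_SCROLL_ITERATIONS < 0:
            raise ValueError("MAX_SCROLL_ITERATIONS must be non-negative")
        if cls.MAX_CONTENT_SIZE <= 0:
            raise ValueError("MAX_CONTENT_SIZE must be positive")
        if cls.MAX_TITLE_LENGTH <= 0:
            raise ValueError("MAX_TITLE_LENGTH must be positive")
        if cls.SCROLL_DELAY_SECONDS < 0:
            raise ValueError("SCROLL_DELAY_SECONDS must be non-negative")
        if cls.PAGE_TIMEOUT_MS <= 0:
            raise ValueError("PAGE_TIMEOUT_MS must be positive")
        if urlparse(cls.DEFAULT_URL).scheme not in ('http', 'https'):
            raise ValueError("DEFAULT_URL must be a valid HTTP/HTTPS URL")
        if not cls.get_allowed_domains():
            raise ValueError("ALLOWED_DOMAINS cannot be empty")
```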

tests/test_main.py (new file, 202 lines)

@@ -0,0 +1,202 @@
"""Tests for main module functionality."""
import pytest
import sys
import tempfile
from unittest.mock import patch, MagicMock
from argparse import Namespace
from main import parse_arguments, setup_logging, scrape_and_generate_rss
from src.rss_scraper.exceptions import ValidationError, NetworkError, ParseError
class TestParseArguments:
"""Test command line argument parsing."""
def test_parse_arguments_defaults(self):
"""Test parsing with default arguments."""
with patch('sys.argv', ['main.py']):
args = parse_arguments()
assert args.url == 'https://www.warhammer-community.com/en-gb/'
assert args.output_dir is None
assert args.max_scroll == 5
assert args.log_level == 'INFO'
assert args.log_file == 'scraper.log'
def test_parse_arguments_custom_values(self):
"""Test parsing with custom argument values."""
test_args = [
'main.py',
'--url', 'https://example.com',
'--output-dir', '/custom/path',
'--max-scroll', '10',
'--log-level', 'DEBUG',
'--log-file', 'custom.log'
]
with patch('sys.argv', test_args):
args = parse_arguments()
assert args.url == 'https://example.com'
assert args.output_dir == '/custom/path'
assert args.max_scroll == 10
assert args.log_level == 'DEBUG'
assert args.log_file == 'custom.log'
def test_parse_arguments_invalid_max_scroll(self):
"""Test parsing fails with invalid max_scroll value."""
test_args = ['main.py', '--max-scroll', '-1']
with patch('sys.argv', test_args):
with pytest.raises(SystemExit):
parse_arguments()
def test_parse_arguments_relative_output_dir(self):
"""Test that relative output directory is converted to absolute."""
test_args = ['main.py', '--output-dir', 'relative/path']
with patch('sys.argv', test_args):
args = parse_arguments()
assert args.output_dir.startswith('/') # Should be absolute path
assert args.output_dir.endswith('relative/path')
class TestSetupLogging:
"""Test logging setup functionality."""
def test_setup_logging_info_level(self):
"""Test logging setup with INFO level."""
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
setup_logging('INFO', temp_file.name)
import logging
logger = logging.getLogger('test')
logger.info("Test message")
logger.debug("Debug message") # Should not appear
# Check that the log file was created and has correct level
assert logging.getLogger().level == logging.INFO
def test_setup_logging_debug_level(self):
"""Test logging setup with DEBUG level."""
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
setup_logging('DEBUG', temp_file.name)
import logging
assert logging.getLogger().level == logging.DEBUG
def test_setup_logging_clears_existing_handlers(self):
"""Test that setup_logging clears existing handlers."""
import logging
# Add a dummy handler
dummy_handler = logging.StreamHandler()
logging.getLogger().addHandler(dummy_handler)
initial_handler_count = len(logging.getLogger().handlers)
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
setup_logging('INFO', temp_file.name)
# Should have exactly 2 handlers (console + file)
assert len(logging.getLogger().handlers) == 2
class TestScrapeAndGenerateRss:
"""Test main scraping function."""
@patch('main.save_debug_html')
@patch('main.save_rss_feed')
@patch('main.generate_rss_feed')
@patch('main.extract_articles_from_html')
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_success(
self, mock_validate_url, mock_load_page, mock_extract_articles,
mock_generate_rss, mock_save_rss, mock_save_html
):
"""Test successful RSS scraping and generation."""
# Setup mocks
mock_validate_url.return_value = True
mock_load_page.return_value = "<html>test</html>"
mock_extract_articles.return_value = [
{'title': 'Test', 'link': 'http://example.com', 'date': 'date'}
]
mock_generate_rss.return_value = b"<rss>feed</rss>"
mock_save_rss.return_value = "/path/to/feed.xml"
url = "https://www.warhammer-community.com/en-gb/"
output_dir = "/test/output"
# Should not raise any exception
scrape_and_generate_rss(url, output_dir)
# Verify all functions were called
mock_validate_url.assert_called_once_with(url)
mock_load_page.assert_called_once_with(url)
mock_extract_articles.assert_called_once_with("<html>test</html>", url)
mock_generate_rss.assert_called_once()
mock_save_rss.assert_called_once()
mock_save_html.assert_called_once()
@patch('main.validate_url')
def test_scrape_and_generate_rss_validation_error(self, mock_validate_url):
"""Test scraping fails with validation error."""
mock_validate_url.side_effect = ValidationError("Invalid URL")
with pytest.raises(ValidationError):
scrape_and_generate_rss("invalid-url")
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_network_error(
self, mock_validate_url, mock_load_page
):
"""Test scraping fails with network error."""
mock_validate_url.return_value = True
mock_load_page.side_effect = NetworkError("Network error")
with pytest.raises(NetworkError):
scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")
@patch('main.extract_articles_from_html')
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_parse_error(
self, mock_validate_url, mock_load_page, mock_extract_articles
):
"""Test scraping fails with parse error."""
mock_validate_url.return_value = True
mock_load_page.return_value = "<html>test</html>"
mock_extract_articles.side_effect = ParseError("Parse error")
with pytest.raises(ParseError):
scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")
@patch('main.save_debug_html')
@patch('main.save_rss_feed')
@patch('main.generate_rss_feed')
@patch('main.extract_articles_from_html')
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_default_output_dir(
self, mock_validate_url, mock_load_page, mock_extract_articles,
mock_generate_rss, mock_save_rss, mock_save_html
):
"""Test scraping uses default output directory when none provided."""
# Setup mocks
mock_validate_url.return_value = True
mock_load_page.return_value = "<html>test</html>"
mock_extract_articles.return_value = []
mock_generate_rss.return_value = b"<rss>feed</rss>"
mock_save_rss.return_value = "/path/to/feed.xml"
url = "https://www.warhammer-community.com/en-gb/"
# Call without output_dir
scrape_and_generate_rss(url)
# Verify functions were called (output_dir would be set to default)
mock_validate_url.assert_called_once_with(url)
mock_save_rss.assert_called_once_with(b"<rss>feed</rss>", ".") # Default output dir
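
These tests pin down the call sequence of scrape_and_generate_rss. A minimal sketch of the pipeline they imply follows; the mocked names are real module functions, but the body and the scraper module path are assumptions reconstructed from the assertions, not the committed main.py:

```python
# Hypothetical reconstruction of main.scrape_and_generate_rss from the mocks above.
from src.rss_scraper.config import Config
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
from src.rss_scraper.scraper import load_page_with_retry  # module path assumed
from src.rss_scraper.validation import validate_url


def scrape_and_generate_rss(url, output_dir=None):
    validate_url(url)  # raises ValidationError for disallowed URLs
    if output_dir is None:
        output_dir = Config.get_output_dir()  # defaults to "."

    html = load_page_with_retry(url)  # raises NetworkError after retries
    articles = extract_articles_from_html(html, url)  # raises ParseError

    rss_bytes = generate_rss_feed(articles, url)
    save_rss_feed(rss_bytes, output_dir)
    save_debug_html(html, output_dir)
```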

tests/test_parser.py (new file, 208 lines)

@@ -0,0 +1,208 @@
"""Tests for parser module."""
import pytest
from datetime import datetime
import pytz
from unittest.mock import patch
from src.rss_scraper.parser import sanitize_text, extract_articles_from_html
from src.rss_scraper.exceptions import ParseError
from src.rss_scraper.config import Config
class TestSanitizeText:
"""Test text sanitization functionality."""
def test_sanitize_normal_text(self):
"""Test sanitization of normal text."""
text = "Normal article title"
result = sanitize_text(text)
assert result == "Normal article title"
def test_sanitize_none_text(self):
"""Test sanitization of None text."""
result = sanitize_text(None)
assert result == "No title"
def test_sanitize_empty_text(self):
"""Test sanitization of empty text."""
result = sanitize_text("")
assert result == "No title"
def test_sanitize_whitespace_text(self):
"""Test sanitization of whitespace-only text."""
result = sanitize_text(" ")
assert result == "No title"
def test_remove_dangerous_patterns(self):
"""Test removal of dangerous patterns."""
dangerous_text = "Title with <script>alert('xss')</script> content"
result = sanitize_text(dangerous_text)
assert "<script" not in result
assert "</script" not in result
assert "alert('xss')" in result # Only script tags should be removed
def test_length_limit(self):
"""Test that text is limited to max length."""
long_text = "a" * 1000
result = sanitize_text(long_text)
assert len(result) <= Config.MAX_TITLE_LENGTH
def test_case_insensitive_pattern_removal(self):
"""Test that dangerous patterns are removed case-insensitively."""
text = "Title with <SCRIPT>alert('xss')</SCRIPT> and javascript: protocol"
result = sanitize_text(text)
assert "<SCRIPT" not in result
assert "</SCRIPT" not in result
assert "javascript:" not in result
class TestExtractArticlesFromHtml:
"""Test article extraction from HTML."""
def test_extract_articles_valid_html(self):
"""Test extraction from valid HTML with articles."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Test Article 1</h3>
<a href="/article/test-1">Read more</a>
<time>01 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Test Article 2</h3>
<a href="https://www.warhammer-community.com/article/test-2">Read more</a>
<time>02 Jan 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 2
assert articles[0]['title'] == "Test Article 2" # Sorted by date, newest first
assert articles[1]['title'] == "Test Article 1"
assert "warhammer-community.com" in articles[0]['link']
assert "warhammer-community.com" in articles[1]['link']
def test_extract_articles_no_articles(self):
"""Test extraction from HTML with no articles."""
html = """
<html>
<body>
<div>No articles here</div>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 0
def test_extract_articles_duplicate_links(self):
"""Test that duplicate links are filtered out."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Test Article 1</h3>
<a href="/article/test-1">Read more</a>
<time>01 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Test Article 1 Duplicate</h3>
<a href="/article/test-1">Read more</a>
<time>02 Jan 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 1 # Duplicate should be filtered out
assert articles[0]['title'] == "Test Article 1"
def test_extract_articles_invalid_links(self):
"""Test handling of articles with invalid links."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Valid Article</h3>
<a href="/article/valid">Read more</a>
<time>01 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Invalid Article</h3>
<a href="https://malicious-site.com/article">Read more</a>
<time>02 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-sm">No Link Article</h3>
<time>03 Jan 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 1 # Only valid article should be included
assert articles[0]['title'] == "Valid Article"
def test_extract_articles_date_parsing(self):
"""Test parsing of various date formats."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Article with good date</h3>
<a href="/article/1">Read more</a>
<time>15 Mar 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Article with bad date</h3>
<a href="/article/2">Read more</a>
<time>Invalid Date Format</time>
</article>
<article>
<h3 class="newsCard-title-sm">Article with reading time</h3>
<a href="/article/3">Read more</a>
<time>5 min read</time>
<time>20 Mar 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 3
# Check that dates are parsed correctly
for article in articles:
assert isinstance(article['date'], datetime)
assert article['date'].tzinfo is not None
def test_extract_articles_malformed_html(self):
"""Test handling of malformed HTML."""
malformed_html = "<html><body><article><h3>Unclosed tags"
base_url = "https://www.warhammer-community.com"
# Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully
articles = extract_articles_from_html(malformed_html, base_url)
assert isinstance(articles, list)
def test_extract_articles_invalid_html(self):
"""Test handling of completely invalid HTML."""
with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")):
with pytest.raises(ParseError):
extract_articles_from_html("<html></html>", "https://example.com")
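
The sanitization tests above, together with the "pre-compiled regex patterns" bullet in the commit message, pin down a small contract for sanitize_text. A sketch consistent with those tests follows; the pattern list and structure are assumptions, and the committed src/rss_scraper/parser.py may differ:

```python
# Hypothetical sketch of sanitize_text consistent with the tests above.
import re

from src.rss_scraper.config import Config

# Compiled once at import time rather than on every call
_DANGEROUS_PATTERNS = [
    re.compile(r'</?script[^>]*>', re.IGNORECASE),  # strip script tags, keep inner text
    re.compile(r'javascript:', re.IGNORECASE),      # strip javascript: protocol
]


def sanitize_text(text):
    """Return a safe, length-limited title, or "No title" as a fallback."""
    if text is None or not text.strip():
        return "No title"
    for pattern in _DANGEROUS_PATTERNS:
        text = pattern.sub('', text)
    return text.strip()[:Config.MAX_TITLE_LENGTH]
```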

tests/test_rss_generator.py (new file, 162 lines)

@@ -0,0 +1,162 @@
"""Tests for RSS generator module."""
import pytest
import os
import tempfile
from datetime import datetime
import pytz
from unittest.mock import patch, mock_open
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
from src.rss_scraper.exceptions import FileOperationError
class TestGenerateRssFeed:
"""Test RSS feed generation functionality."""
def test_generate_rss_feed_with_articles(self):
"""Test RSS generation with valid articles."""
timezone = pytz.UTC
articles = [
{
'title': 'Test Article 1',
'link': 'https://example.com/article1',
'date': datetime(2024, 1, 1, tzinfo=timezone)
},
{
'title': 'Test Article 2',
'link': 'https://example.com/article2',
'date': datetime(2024, 1, 2, tzinfo=timezone)
}
]
feed_url = "https://example.com"
rss_content = generate_rss_feed(articles, feed_url)
assert isinstance(rss_content, bytes)
rss_str = rss_content.decode('utf-8')
assert 'Test Article 1' in rss_str
assert 'Test Article 2' in rss_str
assert 'https://example.com/article1' in rss_str
assert 'https://example.com/article2' in rss_str
assert '<?xml version=' in rss_str
assert '<rss version=' in rss_str
def test_generate_rss_feed_empty_articles(self):
"""Test RSS generation with empty articles list."""
articles = []
feed_url = "https://example.com"
rss_content = generate_rss_feed(articles, feed_url)
assert isinstance(rss_content, bytes)
rss_str = rss_content.decode('utf-8')
assert '<?xml version=' in rss_str
assert '<rss version=' in rss_str
# Should still contain feed metadata
assert 'Warhammer Community RSS Feed' in rss_str
def test_generate_rss_feed_unicode_content(self):
"""Test RSS generation with unicode content."""
timezone = pytz.UTC
articles = [
{
'title': 'Tëst Artìclé with Ūnïcödë',
'link': 'https://example.com/unicode',
'date': datetime(2024, 1, 1, tzinfo=timezone)
}
]
feed_url = "https://example.com"
rss_content = generate_rss_feed(articles, feed_url)
assert isinstance(rss_content, bytes)
rss_str = rss_content.decode('utf-8')
assert 'Tëst Artìclé with Ūnïcödë' in rss_str
class TestSaveRssFeed:
"""Test RSS feed saving functionality."""
def test_save_rss_feed_success(self):
"""Test successful RSS feed saving."""
rss_content = b'<?xml version="1.0"?><rss>test</rss>'
with tempfile.TemporaryDirectory() as temp_dir:
result_path = save_rss_feed(rss_content, temp_dir)
assert os.path.exists(result_path)
assert result_path.endswith('warhammer_rss_feed.xml')
with open(result_path, 'rb') as f:
saved_content = f.read()
assert saved_content == rss_content
def test_save_rss_feed_permission_error(self):
"""Test RSS feed saving with permission error."""
rss_content = b'<?xml version="1.0"?><rss>test</rss>'
with patch('builtins.open', side_effect=PermissionError("Permission denied")):
with pytest.raises(FileOperationError):
save_rss_feed(rss_content, "/some/path")
def test_save_rss_feed_creates_directory(self):
"""Test that RSS feed saving creates directory if needed."""
rss_content = b'<?xml version="1.0"?><rss>test</rss>'
with tempfile.TemporaryDirectory() as temp_dir:
new_subdir = os.path.join(temp_dir, "new_subdir")
result_path = save_rss_feed(rss_content, new_subdir)
assert os.path.exists(new_subdir)
assert os.path.exists(result_path)
class TestSaveDebugHtml:
"""Test debug HTML saving functionality."""
def test_save_debug_html_success(self):
"""Test successful debug HTML saving."""
html_content = "<html><body>Test content</body></html>"
with tempfile.TemporaryDirectory() as temp_dir:
save_debug_html(html_content, temp_dir)
html_path = os.path.join(temp_dir, "page.html")
assert os.path.exists(html_path)
with open(html_path, 'r', encoding='utf-8') as f:
saved_content = f.read()
# BeautifulSoup prettifies the content
assert "Test content" in saved_content
def test_save_debug_html_permission_error(self):
"""Test debug HTML saving with permission error (should not raise)."""
html_content = "<html><body>Test content</body></html>"
with patch('builtins.open', side_effect=PermissionError("Permission denied")):
# Should not raise exception, just log warning
save_debug_html(html_content, "/some/path")
def test_save_debug_html_malformed_content(self):
"""Test debug HTML saving with malformed HTML content."""
malformed_html = "<html><body>Unclosed tags"
with tempfile.TemporaryDirectory() as temp_dir:
# Should handle malformed HTML gracefully
save_debug_html(malformed_html, temp_dir)
html_path = os.path.join(temp_dir, "page.html")
assert os.path.exists(html_path)
def test_save_debug_html_creates_directory(self):
"""Test that debug HTML saving creates directory if needed."""
html_content = "<html><body>Test content</body></html>"
with tempfile.TemporaryDirectory() as temp_dir:
new_subdir = os.path.join(temp_dir, "new_subdir")
save_debug_html(html_content, new_subdir)
assert os.path.exists(new_subdir)
html_path = os.path.join(new_subdir, "page.html")
assert os.path.exists(html_path)
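
The generation tests only require that generate_rss_feed returns UTF-8 XML bytes containing the feed metadata and one item per article. A sketch of one way to satisfy that contract, assuming the feedgen library (the commit does not confirm which RSS library is used, and the committed module may build the XML differently):

```python
# Hypothetical sketch; assumes the feedgen package, which the commit does not confirm.
from feedgen.feed import FeedGenerator

from src.rss_scraper.config import Config


def generate_rss_feed(articles, feed_url):
    """Render articles as RSS 2.0 and return the document as bytes."""
    fg = FeedGenerator()
    fg.title(Config.FEED_TITLE)             # 'Warhammer Community RSS Feed'
    fg.link(href=feed_url)
    fg.description(Config.FEED_DESCRIPTION)

    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])  # must be timezone-aware

    return fg.rss_str(pretty=True)  # bytes, UTF-8 encoded
```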

tests/test_validation.py (new file, 170 lines)

@@ -0,0 +1,170 @@
"""Tests for validation module."""
import pytest
import os
import tempfile
from unittest.mock import patch
from src.rss_scraper.validation import validate_url, validate_output_path, validate_link
from src.rss_scraper.exceptions import ValidationError, FileOperationError
from src.rss_scraper.config import Config
class TestValidateUrl:
"""Test URL validation functionality."""
def test_valid_url(self):
"""Test validation of valid URLs."""
valid_urls = [
"https://www.warhammer-community.com/en-gb/",
"https://warhammer-community.com/some/path",
]
for url in valid_urls:
assert validate_url(url) is True
def test_invalid_url_format(self):
"""Test validation fails for invalid URL formats."""
invalid_urls = [
"not-a-url",
"ftp://example.com",
"",
"http://",
"https://",
]
for url in invalid_urls:
with pytest.raises(ValidationError):
validate_url(url)
def test_disallowed_domain(self):
"""Test validation fails for disallowed domains."""
disallowed_urls = [
"https://malicious-site.com",
"https://example.com",
"https://google.com",
]
for url in disallowed_urls:
with pytest.raises(ValidationError):
validate_url(url)
def test_case_insensitive_domain(self):
"""Test domain validation is case insensitive."""
urls = [
"https://WWW.WARHAMMER-COMMUNITY.COM",
"https://Warhammer-Community.com",
]
for url in urls:
assert validate_url(url) is True
class TestValidateOutputPath:
"""Test output path validation functionality."""
def test_valid_path_within_base(self):
"""Test validation of valid paths within base directory."""
with tempfile.TemporaryDirectory() as temp_dir:
test_path = os.path.join(temp_dir, "output.xml")
result = validate_output_path(test_path, temp_dir)
assert result == os.path.abspath(test_path)
def test_path_outside_base_directory(self):
"""Test validation fails for paths outside base directory."""
with tempfile.TemporaryDirectory() as temp_dir:
outside_path = "/tmp/malicious.xml"
with pytest.raises(ValidationError):
validate_output_path(outside_path, temp_dir)
def test_absolute_path_within_base_directory(self):
"""Test that absolute paths within base directory are allowed."""
with tempfile.TemporaryDirectory() as temp_dir:
# This should work - absolute path within the base directory
abs_path = os.path.join(temp_dir, "output.xml")
result = validate_output_path(abs_path, temp_dir)
assert result == os.path.abspath(abs_path)
def test_creates_directory_if_not_exists(self):
"""Test that validation creates directory if it doesn't exist."""
with tempfile.TemporaryDirectory() as temp_dir:
new_subdir = os.path.join(temp_dir, "new_subdir")
test_path = os.path.join(new_subdir, "output.xml")
result = validate_output_path(test_path, new_subdir)
assert os.path.exists(new_subdir)
assert result == os.path.abspath(test_path)
def test_directory_traversal_protection(self):
"""Test that directory traversal attacks are blocked."""
with tempfile.TemporaryDirectory() as temp_dir:
# These should be blocked - either by directory traversal check or outside-base check
traversal_paths = [
"../../../etc/passwd",
"subdir/../../../etc/passwd",
"normal/../../../dangerous.xml"
]
for path in traversal_paths:
with pytest.raises(ValidationError): # Either error type is acceptable
validate_output_path(path, temp_dir)
def test_permission_error(self):
"""Test handling of permission errors."""
with patch('os.makedirs', side_effect=PermissionError("Permission denied")):
with pytest.raises(FileOperationError):
validate_output_path("/some/path/file.xml", "/some/path")
class TestValidateLink:
"""Test link validation functionality."""
def test_valid_absolute_link(self):
"""Test validation of valid absolute links."""
base_url = "https://www.warhammer-community.com"
valid_link = "https://www.warhammer-community.com/article"
result = validate_link(valid_link, base_url)
assert result == valid_link
def test_valid_relative_link(self):
"""Test validation of valid relative links."""
base_url = "https://www.warhammer-community.com/en-gb/"
relative_link = "/article/some-article"
result = validate_link(relative_link, base_url)
assert result == "https://www.warhammer-community.com/article/some-article"
def test_none_link(self):
"""Test handling of None link."""
base_url = "https://www.warhammer-community.com"
result = validate_link(None, base_url)
assert result is None
def test_empty_link(self):
"""Test handling of empty link."""
base_url = "https://www.warhammer-community.com"
result = validate_link("", base_url)
assert result is None
def test_invalid_domain_link(self):
"""Test rejection of links from invalid domains."""
base_url = "https://www.warhammer-community.com"
invalid_link = "https://malicious-site.com/article"
result = validate_link(invalid_link, base_url)
assert result is None
def test_malformed_link(self):
"""Test handling of malformed links."""
base_url = "https://www.warhammer-community.com"
malformed_links = [
"not-a-url",
"://missing-scheme",
"https://",
]
for link in malformed_links:
result = validate_link(link, base_url)
assert result is None
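
Taken together, these cases pin down validate_link's contract: resolve root-relative links against the base URL, return None rather than raise for anything unusable, and enforce the domain allow-list. A sketch consistent with that contract (an inferred reconstruction, not the committed implementation):

```python
# Hypothetical sketch of validate_link, inferred from the tests above.
from urllib.parse import urljoin, urlparse

from src.rss_scraper.config import Config


def validate_link(link, base_url):
    """Resolve and allow-list a scraped link; return None if it is unusable."""
    if not link:
        return None
    try:
        # Only root-relative paths are resolved against the base URL;
        # anything else must already be an absolute http(s) URL.
        if link.startswith('/'):
            link = urljoin(base_url, link)
        parsed = urlparse(link)
        if parsed.scheme not in ('http', 'https') or not parsed.netloc:
            return None
        # Enforce the domain allow-list (hostnames are lowercased by urlparse)
        if parsed.hostname and parsed.hostname in Config.get_allowed_domains():
            return link
        return None
    except ValueError:
        return None
```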