- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
170 lines
6.2 KiB
Python
170 lines
6.2 KiB
Python
"""Tests for validation module."""
|
|
|
|
import pytest
|
|
import os
|
|
import tempfile
|
|
from unittest.mock import patch
|
|
|
|
from src.rss_scraper.validation import validate_url, validate_output_path, validate_link
|
|
from src.rss_scraper.exceptions import ValidationError, FileOperationError
|
|
from src.rss_scraper.config import Config
|
|
|
|
|
|
class TestValidateUrl:
    """Checks for ``validate_url``: accepted URLs, bad formats, foreign domains."""

    def test_valid_url(self):
        """Well-formed URLs on the expected domain should validate as True."""
        accepted = (
            "https://www.warhammer-community.com/en-gb/",
            "https://warhammer-community.com/some/path",
        )
        for candidate in accepted:
            assert validate_url(candidate) is True

    def test_invalid_url_format(self):
        """Strings that are not complete http(s) URLs should raise ValidationError."""
        rejected = (
            "not-a-url",
            "ftp://example.com",
            "",
            "http://",
            "https://",
        )
        for candidate in rejected:
            with pytest.raises(ValidationError):
                validate_url(candidate)

    def test_disallowed_domain(self):
        """Syntactically valid URLs on other domains should raise ValidationError."""
        foreign = (
            "https://malicious-site.com",
            "https://example.com",
            "https://google.com",
        )
        for candidate in foreign:
            with pytest.raises(ValidationError):
                validate_url(candidate)

    def test_case_insensitive_domain(self):
        """Upper- and mixed-case spellings of the domain should still validate."""
        mixed_case = (
            "https://WWW.WARHAMMER-COMMUNITY.COM",
            "https://Warhammer-Community.com",
        )
        for candidate in mixed_case:
            assert validate_url(candidate) is True
|
|
|
|
|
|
class TestValidateOutputPath:
    """Checks for ``validate_output_path`` relative to a base directory."""

    def test_valid_path_within_base(self):
        """A file path inside the base directory comes back as its absolute path."""
        with tempfile.TemporaryDirectory() as base_dir:
            target = os.path.join(base_dir, "output.xml")
            assert validate_output_path(target, base_dir) == os.path.abspath(target)

    def test_path_outside_base_directory(self):
        """A path that is not under the base directory should raise ValidationError."""
        with tempfile.TemporaryDirectory() as base_dir, pytest.raises(ValidationError):
            validate_output_path("/tmp/malicious.xml", base_dir)

    def test_absolute_path_within_base_directory(self):
        """An absolute path located under the base directory should be accepted."""
        with tempfile.TemporaryDirectory() as base_dir:
            # The temporary directory is absolute, so the joined path is too.
            inside = os.path.join(base_dir, "output.xml")
            assert validate_output_path(inside, base_dir) == os.path.abspath(inside)

    def test_creates_directory_if_not_exists(self):
        """Validating against a missing base directory should create it."""
        with tempfile.TemporaryDirectory() as root:
            missing_dir = os.path.join(root, "new_subdir")
            target = os.path.join(missing_dir, "output.xml")

            validated = validate_output_path(target, missing_dir)

            assert os.path.exists(missing_dir)
            assert validated == os.path.abspath(target)

    def test_directory_traversal_protection(self):
        """Relative paths that climb out via ``..`` should raise ValidationError."""
        attempts = (
            "../../../etc/passwd",
            "subdir/../../../etc/passwd",
            "normal/../../../dangerous.xml",
        )
        with tempfile.TemporaryDirectory() as base_dir:
            for attempt in attempts:
                # Every attempt must be rejected with ValidationError.
                with pytest.raises(ValidationError):
                    validate_output_path(attempt, base_dir)

    def test_permission_error(self):
        """A PermissionError from ``os.makedirs`` should become FileOperationError."""
        makedirs_denied = patch(
            'os.makedirs', side_effect=PermissionError("Permission denied")
        )
        with makedirs_denied, pytest.raises(FileOperationError):
            validate_output_path("/some/path/file.xml", "/some/path")
|
|
|
|
|
|
class TestValidateLink:
    """Checks for ``validate_link``: resolution against a base URL and rejection."""

    def test_valid_absolute_link(self):
        """An absolute link on the base URL's domain is returned unchanged."""
        site = "https://www.warhammer-community.com"
        article = "https://www.warhammer-community.com/article"

        assert validate_link(article, site) == article

    def test_valid_relative_link(self):
        """A root-relative link is resolved against the base URL's host."""
        site = "https://www.warhammer-community.com/en-gb/"
        expected = "https://www.warhammer-community.com/article/some-article"

        assert validate_link("/article/some-article", site) == expected

    def test_none_link(self):
        """Passing None as the link yields None."""
        site = "https://www.warhammer-community.com"
        assert validate_link(None, site) is None

    def test_empty_link(self):
        """Passing an empty string as the link yields None."""
        site = "https://www.warhammer-community.com"
        assert validate_link("", site) is None

    def test_invalid_domain_link(self):
        """An absolute link pointing at another domain yields None."""
        site = "https://www.warhammer-community.com"
        foreign = "https://malicious-site.com/article"

        assert validate_link(foreign, site) is None

    def test_malformed_link(self):
        """Links that are not usable URLs are expected to yield None."""
        site = "https://www.warhammer-community.com"
        broken = (
            "not-a-url",
            "://missing-scheme",
            "https://",
        )
        for candidate in broken:
            assert validate_link(candidate, site) is None