rss_warhammer-community/tests/test_validation.py
Phil 25086fc01b Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00

170 lines
6.2 KiB
Python

"""Tests for validation module."""
import pytest
import os
import tempfile
from unittest.mock import patch
from src.rss_scraper.validation import validate_url, validate_output_path, validate_link
from src.rss_scraper.exceptions import ValidationError, FileOperationError
from src.rss_scraper.config import Config
class TestValidateUrl:
    """Exercise ``validate_url`` with accepted and rejected URLs."""

    def test_valid_url(self):
        """Warhammer Community URLs, with or without ``www``, return True."""
        accepted = (
            "https://www.warhammer-community.com/en-gb/",
            "https://warhammer-community.com/some/path",
        )
        for candidate in accepted:
            assert validate_url(candidate) is True

    def test_invalid_url_format(self):
        """Badly formed URLs raise ValidationError."""
        # Samples: bare word, ftp:// URL, empty string, and schemes with no host.
        rejected = (
            "not-a-url",
            "ftp://example.com",
            "",
            "http://",
            "https://",
        )
        for candidate in rejected:
            with pytest.raises(ValidationError):
                validate_url(candidate)

    def test_disallowed_domain(self):
        """Well-formed HTTPS URLs on other domains raise ValidationError."""
        foreign = (
            "https://malicious-site.com",
            "https://example.com",
            "https://google.com",
        )
        for candidate in foreign:
            with pytest.raises(ValidationError):
                validate_url(candidate)

    def test_case_insensitive_domain(self):
        """Upper-case and mixed-case hostnames still return True."""
        mixed_case = (
            "https://WWW.WARHAMMER-COMMUNITY.COM",
            "https://Warhammer-Community.com",
        )
        for candidate in mixed_case:
            assert validate_url(candidate) is True
class TestValidateOutputPath:
    """Exercise ``validate_output_path`` with paths inside and outside a base directory."""

    def test_valid_path_within_base(self):
        """A file path under the base directory comes back as its absolute path."""
        with tempfile.TemporaryDirectory() as base_dir:
            target = os.path.join(base_dir, "output.xml")
            validated = validate_output_path(target, base_dir)
            assert validated == os.path.abspath(target)

    def test_path_outside_base_directory(self):
        """A path that is not under the base directory raises ValidationError."""
        escaped = "/tmp/malicious.xml"
        with tempfile.TemporaryDirectory() as base_dir, pytest.raises(ValidationError):
            validate_output_path(escaped, base_dir)

    def test_absolute_path_within_base_directory(self):
        """An absolute path located inside the base directory is accepted."""
        with tempfile.TemporaryDirectory() as base_dir:
            # Joining onto the temporary directory yields an absolute path inside it.
            target = os.path.join(base_dir, "output.xml")
            validated = validate_output_path(target, base_dir)
            assert validated == os.path.abspath(target)

    def test_creates_directory_if_not_exists(self):
        """A base directory that does not exist yet is present after validation."""
        with tempfile.TemporaryDirectory() as scratch:
            missing_dir = os.path.join(scratch, "new_subdir")
            target = os.path.join(missing_dir, "output.xml")
            validated = validate_output_path(target, missing_dir)
            assert os.path.exists(missing_dir)
            assert validated == os.path.abspath(target)

    def test_directory_traversal_protection(self):
        """Relative paths containing ``..`` components raise ValidationError."""
        # Presumably rejected by either a traversal check or the outside-base
        # check; this test only requires that ValidationError is raised.
        hostile_paths = (
            "../../../etc/passwd",
            "subdir/../../../etc/passwd",
            "normal/../../../dangerous.xml",
        )
        with tempfile.TemporaryDirectory() as base_dir:
            for hostile in hostile_paths:
                with pytest.raises(ValidationError):
                    validate_output_path(hostile, base_dir)

    def test_permission_error(self):
        """A PermissionError from ``os.makedirs`` surfaces as FileOperationError."""
        denied = PermissionError("Permission denied")
        with patch('os.makedirs', side_effect=denied), pytest.raises(FileOperationError):
            validate_output_path("/some/path/file.xml", "/some/path")
class TestValidateLink:
    """Exercise ``validate_link`` with absolute, relative, missing and bad links."""

    def test_valid_absolute_link(self):
        """An absolute link on the base URL's domain is returned unchanged."""
        article = "https://www.warhammer-community.com/article"
        assert validate_link(article, "https://www.warhammer-community.com") == article

    def test_valid_relative_link(self):
        """A root-relative link is resolved against the base URL's host."""
        resolved = validate_link(
            "/article/some-article",
            "https://www.warhammer-community.com/en-gb/",
        )
        assert resolved == "https://www.warhammer-community.com/article/some-article"

    def test_none_link(self):
        """A ``None`` link yields ``None``."""
        assert validate_link(None, "https://www.warhammer-community.com") is None

    def test_empty_link(self):
        """An empty-string link yields ``None``."""
        assert validate_link("", "https://www.warhammer-community.com") is None

    def test_invalid_domain_link(self):
        """A link pointing at a different domain yields ``None``."""
        outcome = validate_link(
            "https://malicious-site.com/article",
            "https://www.warhammer-community.com",
        )
        assert outcome is None

    def test_malformed_link(self):
        """Each malformed sample link yields ``None``."""
        site = "https://www.warhammer-community.com"
        broken_links = (
            "not-a-url",
            "://missing-scheme",
            "https://",
        )
        for broken in broken_links:
            assert validate_link(broken, site) is None