Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
162
tests/test_rss_generator.py
Normal file
162
tests/test_rss_generator.py
Normal file
@ -0,0 +1,162 @@
|
||||
"""Tests for RSS generator module."""
|
||||
|
||||
import pytest
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
import pytz
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
|
||||
from src.rss_scraper.exceptions import FileOperationError
|
||||
|
||||
|
||||
class TestGenerateRssFeed:
    """Checks for ``generate_rss_feed``: populated, empty and non-ASCII input."""

    def test_generate_rss_feed_with_articles(self):
        """Every supplied title and link shows up in the serialized feed."""
        utc = pytz.UTC
        entries = [
            {
                'title': 'Test Article 1',
                'link': 'https://example.com/article1',
                'date': datetime(2024, 1, 1, tzinfo=utc),
            },
            {
                'title': 'Test Article 2',
                'link': 'https://example.com/article2',
                'date': datetime(2024, 1, 2, tzinfo=utc),
            },
        ]

        feed_bytes = generate_rss_feed(entries, "https://example.com")

        assert isinstance(feed_bytes, bytes)
        feed_text = feed_bytes.decode('utf-8')

        # Checked in the same order as listed: article data first, then the
        # XML declaration and the RSS root element.
        expected_fragments = (
            'Test Article 1',
            'Test Article 2',
            'https://example.com/article1',
            'https://example.com/article2',
            '<?xml version=',
            '<rss version=',
        )
        for fragment in expected_fragments:
            assert fragment in feed_text

    def test_generate_rss_feed_empty_articles(self):
        """An empty article list still yields a well-formed feed skeleton."""
        feed_bytes = generate_rss_feed([], "https://example.com")

        assert isinstance(feed_bytes, bytes)
        feed_text = feed_bytes.decode('utf-8')

        # The last fragment is the feed-level title, which must be present
        # even when there are no items.
        expected_fragments = (
            '<?xml version=',
            '<rss version=',
            'Warhammer Community RSS Feed',
        )
        for fragment in expected_fragments:
            assert fragment in feed_text

    def test_generate_rss_feed_unicode_content(self):
        """Non-ASCII characters in a title survive the UTF-8 round trip."""
        utc = pytz.UTC
        entry = {
            'title': 'Tëst Artìclé with Ūnïcödë',
            'link': 'https://example.com/unicode',
            'date': datetime(2024, 1, 1, tzinfo=utc),
        }

        feed_bytes = generate_rss_feed([entry], "https://example.com")

        assert isinstance(feed_bytes, bytes)
        feed_text = feed_bytes.decode('utf-8')
        assert 'Tëst Artìclé with Ūnïcödë' in feed_text
|
||||
|
||||
|
||||
class TestSaveRssFeed:
    """Test RSS feed saving functionality.

    Every test works inside a ``tempfile.TemporaryDirectory`` so nothing is
    written outside a throw-away sandbox.
    """

    def test_save_rss_feed_success(self):
        """Test that the feed bytes are written unchanged and the path returned."""
        rss_content = b'<?xml version="1.0"?><rss>test</rss>'

        with tempfile.TemporaryDirectory() as temp_dir:
            result_path = save_rss_feed(rss_content, temp_dir)

            assert os.path.exists(result_path)
            assert result_path.endswith('warhammer_rss_feed.xml')

            with open(result_path, 'rb') as f:
                saved_content = f.read()
            assert saved_content == rss_content

    def test_save_rss_feed_permission_error(self):
        """Test that a PermissionError from ``open`` becomes FileOperationError."""
        rss_content = b'<?xml version="1.0"?><rss>test</rss>'

        # Use a real, writable directory instead of a made-up absolute path:
        # the patched ``open`` is then the only step that can fail, and the
        # test cannot touch the host filesystem (e.g. when run as root).
        with tempfile.TemporaryDirectory() as temp_dir:
            with patch('builtins.open', side_effect=PermissionError("Permission denied")):
                with pytest.raises(FileOperationError):
                    save_rss_feed(rss_content, temp_dir)

    def test_save_rss_feed_creates_directory(self):
        """Test that RSS feed saving creates the output directory if needed."""
        rss_content = b'<?xml version="1.0"?><rss>test</rss>'

        with tempfile.TemporaryDirectory() as temp_dir:
            new_subdir = os.path.join(temp_dir, "new_subdir")
            result_path = save_rss_feed(rss_content, new_subdir)

            # isdir (not just exists) confirms a directory was created.
            assert os.path.isdir(new_subdir)
            assert os.path.exists(result_path)
|
||||
|
||||
|
||||
class TestSaveDebugHtml:
    """Test debug HTML saving functionality.

    Every test works inside a ``tempfile.TemporaryDirectory`` so nothing is
    written outside a throw-away sandbox. The tests expect the output file to
    be named ``page.html`` inside the given directory.
    """

    def test_save_debug_html_success(self):
        """Test successful debug HTML saving."""
        html_content = "<html><body>Test content</body></html>"

        with tempfile.TemporaryDirectory() as temp_dir:
            save_debug_html(html_content, temp_dir)

            html_path = os.path.join(temp_dir, "page.html")
            assert os.path.exists(html_path)

            with open(html_path, 'r', encoding='utf-8') as f:
                saved_content = f.read()
            # The saved markup may be re-formatted (prettified), so only the
            # text payload is checked, not byte equality.
            assert "Test content" in saved_content

    def test_save_debug_html_permission_error(self):
        """Test debug HTML saving with permission error (should not raise)."""
        html_content = "<html><body>Test content</body></html>"

        # Use a real, writable directory instead of a made-up absolute path:
        # the patched ``open`` is then the only step that can fail, and the
        # test cannot touch the host filesystem (e.g. when run as root).
        with tempfile.TemporaryDirectory() as temp_dir:
            with patch('builtins.open', side_effect=PermissionError("Permission denied")):
                # Passing means the call returns without raising.
                save_debug_html(html_content, temp_dir)

    def test_save_debug_html_malformed_content(self):
        """Test debug HTML saving with malformed HTML content."""
        malformed_html = "<html><body>Unclosed tags"

        with tempfile.TemporaryDirectory() as temp_dir:
            # Unclosed tags must not prevent the file from being written.
            save_debug_html(malformed_html, temp_dir)

            html_path = os.path.join(temp_dir, "page.html")
            assert os.path.exists(html_path)

    def test_save_debug_html_creates_directory(self):
        """Test that debug HTML saving creates the output directory if needed."""
        html_content = "<html><body>Test content</body></html>"

        with tempfile.TemporaryDirectory() as temp_dir:
            new_subdir = os.path.join(temp_dir, "new_subdir")
            save_debug_html(html_content, new_subdir)

            # isdir (not just exists) confirms a directory was created.
            assert os.path.isdir(new_subdir)
            html_path = os.path.join(new_subdir, "page.html")
            assert os.path.exists(html_path)
|
Reference in New Issue
Block a user