Phil 25086fc01b Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00

208 lines
7.4 KiB
Python

"""Tests for parser module."""
import pytest
from datetime import datetime
import pytz
from unittest.mock import patch
from src.rss_scraper.parser import sanitize_text, extract_articles_from_html
from src.rss_scraper.exceptions import ParseError
from src.rss_scraper.config import Config
class TestSanitizeText:
    """Tests for the sanitize_text helper."""

    def test_sanitize_normal_text(self):
        """Plain text passes through unchanged."""
        assert sanitize_text("Normal article title") == "Normal article title"

    def test_sanitize_none_text(self):
        """None falls back to the default title."""
        assert sanitize_text(None) == "No title"

    def test_sanitize_empty_text(self):
        """An empty string falls back to the default title."""
        assert sanitize_text("") == "No title"

    def test_sanitize_whitespace_text(self):
        """Whitespace-only input falls back to the default title."""
        assert sanitize_text(" ") == "No title"

    def test_remove_dangerous_patterns(self):
        """Script tags are stripped while the surrounding text survives."""
        cleaned = sanitize_text("Title with <script>alert('xss')</script> content")
        assert "<script" not in cleaned
        assert "</script" not in cleaned
        # The inner payload is kept; only the tag markup is removed.
        assert "alert('xss')" in cleaned

    def test_length_limit(self):
        """Output is truncated to the configured maximum title length."""
        assert len(sanitize_text("a" * 1000)) <= Config.MAX_TITLE_LENGTH

    def test_case_insensitive_pattern_removal(self):
        """Dangerous patterns are removed regardless of letter case."""
        cleaned = sanitize_text(
            "Title with <SCRIPT>alert('xss')</SCRIPT> and javascript: protocol"
        )
        for fragment in ("<SCRIPT", "</SCRIPT", "javascript:"):
            assert fragment not in cleaned
class TestExtractArticlesFromHtml:
    """Tests for article extraction from raw HTML."""

    # Shared origin used by every test; links must resolve to this host.
    BASE_URL = "https://www.warhammer-community.com"

    def test_extract_articles_valid_html(self):
        """Two well-formed articles are extracted and sorted newest-first."""
        html = """
        <html>
        <body>
        <article>
        <h3 class="newsCard-title-sm">Test Article 1</h3>
        <a href="/article/test-1">Read more</a>
        <time>01 Jan 24</time>
        </article>
        <article>
        <h3 class="newsCard-title-lg">Test Article 2</h3>
        <a href="https://www.warhammer-community.com/article/test-2">Read more</a>
        <time>02 Jan 24</time>
        </article>
        </body>
        </html>
        """
        articles = extract_articles_from_html(html, self.BASE_URL)
        assert len(articles) == 2
        # Sorted by date descending, so the 02 Jan article comes first.
        assert articles[0]['title'] == "Test Article 2"
        assert articles[1]['title'] == "Test Article 1"
        for entry in articles:
            assert "warhammer-community.com" in entry['link']

    def test_extract_articles_no_articles(self):
        """HTML without any <article> elements yields an empty list."""
        html = """
        <html>
        <body>
        <div>No articles here</div>
        </body>
        </html>
        """
        assert extract_articles_from_html(html, self.BASE_URL) == []

    def test_extract_articles_duplicate_links(self):
        """A second article pointing at an already-seen link is dropped."""
        html = """
        <html>
        <body>
        <article>
        <h3 class="newsCard-title-sm">Test Article 1</h3>
        <a href="/article/test-1">Read more</a>
        <time>01 Jan 24</time>
        </article>
        <article>
        <h3 class="newsCard-title-lg">Test Article 1 Duplicate</h3>
        <a href="/article/test-1">Read more</a>
        <time>02 Jan 24</time>
        </article>
        </body>
        </html>
        """
        articles = extract_articles_from_html(html, self.BASE_URL)
        # Duplicate link is filtered out; only the first occurrence remains.
        assert len(articles) == 1
        assert articles[0]['title'] == "Test Article 1"

    def test_extract_articles_invalid_links(self):
        """Off-site links and missing links are both rejected."""
        html = """
        <html>
        <body>
        <article>
        <h3 class="newsCard-title-sm">Valid Article</h3>
        <a href="/article/valid">Read more</a>
        <time>01 Jan 24</time>
        </article>
        <article>
        <h3 class="newsCard-title-lg">Invalid Article</h3>
        <a href="https://malicious-site.com/article">Read more</a>
        <time>02 Jan 24</time>
        </article>
        <article>
        <h3 class="newsCard-title-sm">No Link Article</h3>
        <time>03 Jan 24</time>
        </article>
        </body>
        </html>
        """
        articles = extract_articles_from_html(html, self.BASE_URL)
        # Only the on-site article with a usable link is kept.
        assert len(articles) == 1
        assert articles[0]['title'] == "Valid Article"

    def test_extract_articles_date_parsing(self):
        """Good, bad, and reading-time <time> values all produce aware datetimes."""
        html = """
        <html>
        <body>
        <article>
        <h3 class="newsCard-title-sm">Article with good date</h3>
        <a href="/article/1">Read more</a>
        <time>15 Mar 24</time>
        </article>
        <article>
        <h3 class="newsCard-title-lg">Article with bad date</h3>
        <a href="/article/2">Read more</a>
        <time>Invalid Date Format</time>
        </article>
        <article>
        <h3 class="newsCard-title-sm">Article with reading time</h3>
        <a href="/article/3">Read more</a>
        <time>5 min read</time>
        <time>20 Mar 24</time>
        </article>
        </body>
        </html>
        """
        articles = extract_articles_from_html(html, self.BASE_URL)
        assert len(articles) == 3
        # Every entry carries a timezone-aware datetime, even for bad input.
        for entry in articles:
            assert isinstance(entry['date'], datetime)
            assert entry['date'].tzinfo is not None

    def test_extract_articles_malformed_html(self):
        """BeautifulSoup tolerates unclosed tags, so no ParseError is raised."""
        result = extract_articles_from_html(
            "<html><body><article><h3>Unclosed tags", self.BASE_URL
        )
        assert isinstance(result, list)

    def test_extract_articles_invalid_html(self):
        """A parser-level failure is surfaced as ParseError."""
        with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")):
            with pytest.raises(ParseError):
                extract_articles_from_html("<html></html>", "https://example.com")