"""Tests for parser module."""
|
|
|
|
import pytest
|
|
from datetime import datetime
|
|
import pytz
|
|
from unittest.mock import patch
|
|
|
|
from src.rss_scraper.parser import sanitize_text, extract_articles_from_html
|
|
from src.rss_scraper.exceptions import ParseError
|
|
from src.rss_scraper.config import Config
|
|
|
|
|
|
class TestSanitizeText:
    """Test text sanitization functionality."""

    def test_sanitize_normal_text(self):
        """Test sanitization of normal text."""
        text = "Normal article title"
        result = sanitize_text(text)
        assert result == "Normal article title"

    def test_sanitize_none_text(self):
        """Test sanitization of None text."""
        result = sanitize_text(None)
        assert result == "No title"

    def test_sanitize_empty_text(self):
        """Test sanitization of empty text."""
        result = sanitize_text("")
        assert result == "No title"

    def test_sanitize_whitespace_text(self):
        """Test sanitization of whitespace-only text."""
        result = sanitize_text("   ")
        assert result == "No title"

    def test_remove_dangerous_patterns(self):
        """Test removal of dangerous patterns."""
        dangerous_text = "Title with <script>alert('xss')</script> content"
        result = sanitize_text(dangerous_text)
        assert "<script" not in result
        assert "</script" not in result
        assert "alert('xss')" in result  # Only script tags should be removed

    def test_length_limit(self):
        """Test that text is limited to max length."""
        long_text = "a" * 1000
        result = sanitize_text(long_text)
        assert len(result) <= Config.MAX_TITLE_LENGTH

    def test_case_insensitive_pattern_removal(self):
        """Test that dangerous patterns are removed case-insensitively."""
        text = "Title with <SCRIPT>alert('xss')</SCRIPT> and javascript: protocol"
        result = sanitize_text(text)
        assert "<SCRIPT" not in result
        assert "</SCRIPT" not in result
        assert "javascript:" not in result
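

# The None/empty/whitespace cases above could also be collapsed into one
# parametrized test; an equivalent sketch, not part of the original suite:
@pytest.mark.parametrize("value", [None, "", "   "])
def test_sanitize_blank_inputs_parametrized(value):
    """Parametrized equivalent of the blank-input checks in TestSanitizeText."""
    assert sanitize_text(value) == "No title"
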

class TestExtractArticlesFromHtml:
    """Test article extraction from HTML."""

    def test_extract_articles_valid_html(self):
        """Test extraction from valid HTML with articles."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Test Article 1</h3>
                    <a href="/article/test-1">Read more</a>
                    <time>01 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Test Article 2</h3>
                    <a href="https://www.warhammer-community.com/article/test-2">Read more</a>
                    <time>02 Jan 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 2
        assert articles[0]['title'] == "Test Article 2"  # Sorted by date, newest first
        assert articles[1]['title'] == "Test Article 1"
        assert "warhammer-community.com" in articles[0]['link']
        assert "warhammer-community.com" in articles[1]['link']

    def test_extract_articles_no_articles(self):
        """Test extraction from HTML with no articles."""
        html = """
        <html>
            <body>
                <div>No articles here</div>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 0

    def test_extract_articles_duplicate_links(self):
        """Test that duplicate links are filtered out."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Test Article 1</h3>
                    <a href="/article/test-1">Read more</a>
                    <time>01 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Test Article 1 Duplicate</h3>
                    <a href="/article/test-1">Read more</a>
                    <time>02 Jan 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Duplicate should be filtered out
        assert articles[0]['title'] == "Test Article 1"

    def test_extract_articles_invalid_links(self):
        """Test handling of articles with invalid links."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Valid Article</h3>
                    <a href="/article/valid">Read more</a>
                    <time>01 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Invalid Article</h3>
                    <a href="https://malicious-site.com/article">Read more</a>
                    <time>02 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-sm">No Link Article</h3>
                    <time>03 Jan 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Only valid article should be included
        assert articles[0]['title'] == "Valid Article"

    def test_extract_articles_date_parsing(self):
        """Test parsing of various date formats."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Article with good date</h3>
                    <a href="/article/1">Read more</a>
                    <time>15 Mar 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Article with bad date</h3>
                    <a href="/article/2">Read more</a>
                    <time>Invalid Date Format</time>
                </article>
                <article>
                    <h3 class="newsCard-title-sm">Article with reading time</h3>
                    <a href="/article/3">Read more</a>
                    <time>5 min read</time>
                    <time>20 Mar 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 3

        # Check that dates are parsed correctly
        for article in articles:
            assert isinstance(article['date'], datetime)
            assert article['date'].tzinfo is not None

    def test_extract_articles_malformed_html(self):
        """Test handling of malformed HTML."""
        malformed_html = "<html><body><article><h3>Unclosed tags"

        base_url = "https://www.warhammer-community.com"

        # Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully
        articles = extract_articles_from_html(malformed_html, base_url)
        assert isinstance(articles, list)

    def test_extract_articles_invalid_html(self):
        """Test handling of completely invalid HTML."""
        # Force the underlying parser to fail so the ParseError path is exercised
        with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")):
            with pytest.raises(ParseError):
                extract_articles_from_html("<html></html>", "https://example.com")