Add comprehensive RSS scraper implementation with security and testing

- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETag and Last-Modified headers for efficiency (see the conditional-request sketch after this list)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community
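For context on the caching bullet above, the sketch below shows the conditional-request idea in isolation: replay the validators (ETag / Last-Modified) from the previous response and reuse the cached body on a 304. The names here (fetch_with_cache, the _cache dict) are illustrative, not this repository's actual API.

import requests

_cache: dict = {}  # url -> {"etag": ..., "last_modified": ..., "body": ...}

def fetch_with_cache(url: str) -> str:
    # Hypothetical helper: send validators from the previous response, if any
    headers = {}
    cached = _cache.get(url)
    if cached:
        if cached["etag"]:
            headers["If-None-Match"] = cached["etag"]
        if cached["last_modified"]:
            headers["If-Modified-Since"] = cached["last_modified"]

    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 304 and cached:
        return cached["body"]  # feed unchanged since last fetch; reuse cached body

    response.raise_for_status()
    _cache[url] = {
        "etag": response.headers.get("ETag", ""),
        "last_modified": response.headers.get("Last-Modified", ""),
        "body": response.text,
    }
    return response.text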

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00
parent e0647325ff
commit 25086fc01b
26 changed files with 15226 additions and 280 deletions

src/rss_scraper/security.py (new file, 236 lines)

@@ -0,0 +1,236 @@
"""Security utilities for content sanitization and rate limiting."""
import time
import logging
import re
from typing import Optional, Dict, Any
from datetime import datetime, timedelta
import bleach
from .config import Config
logger = logging.getLogger(__name__)
class RateLimiter:
"""Rate limiter to prevent excessive requests."""
def __init__(self, requests_per_minute: int = 30):
self.requests_per_minute = requests_per_minute
self.request_times: list = []
self.min_delay_seconds = 60.0 / requests_per_minute
self.last_request_time: Optional[float] = None
    def wait_if_needed(self) -> None:
        """Wait if necessary to respect rate limits."""
        current_time = time.time()

        # Drop request timestamps older than the one-minute window
        cutoff_time = current_time - 60
        self.request_times = [t for t in self.request_times if t > cutoff_time]

        # If the window is full, sleep until the oldest request ages out
        if len(self.request_times) >= self.requests_per_minute:
            sleep_time = 60 - (current_time - self.request_times[0])
            if sleep_time > 0:
                logger.info(f"Rate limit reached, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)
                # Refresh the clock so the minimum-delay check below uses the post-sleep time
                current_time = time.time()

        # Enforce the minimum delay between consecutive requests
        if self.last_request_time:
            time_since_last = current_time - self.last_request_time
            if time_since_last < self.min_delay_seconds:
                sleep_time = self.min_delay_seconds - time_since_last
                logger.debug(f"Enforcing minimum delay, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

        # Record this request
        now = time.time()
        self.request_times.append(now)
        self.last_request_time = now
class ContentSanitizer:
    """Enhanced content sanitization for security."""

    def __init__(self):
        # Allowed HTML tags for RSS content (including structural elements for parsing)
        self.allowed_tags = [
            'p', 'br', 'strong', 'em', 'b', 'i', 'u', 'span',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'blockquote',
            'div', 'article', 'section', 'header', 'footer', 'main', 'nav',
            'a', 'img', 'figure', 'figcaption', 'time'
        ]

        # Allowed attributes per tag
        self.allowed_attributes = {
            '*': ['class', 'id'],
            'a': ['href', 'title', 'class'],
            'img': ['src', 'alt', 'title', 'width', 'height', 'class'],
            'time': ['datetime', 'class'],
            'div': ['class', 'id'],
            'article': ['class', 'id'],
            'section': ['class', 'id']
        }

        # Protocols allowed in URLs
        self.allowed_protocols = ['http', 'https']

        # Dangerous patterns to remove (pre-compiled for performance)
        self.dangerous_patterns = [
            re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<iframe[^>]*>.*?</iframe>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<object[^>]*>.*?</object>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<embed[^>]*>.*?</embed>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<applet[^>]*>.*?</applet>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<form[^>]*>.*?</form>', re.IGNORECASE | re.DOTALL),
            re.compile(r'javascript:', re.IGNORECASE),
            re.compile(r'vbscript:', re.IGNORECASE),
            re.compile(r'data:', re.IGNORECASE),
            re.compile(r'on\w+\s*=', re.IGNORECASE),  # event handlers such as onclick, onload
        ]

    def sanitize_html(self, html_content: str) -> str:
        """Sanitize HTML content using the bleach library."""
        if not html_content:
            return ""

        try:
            # First pass: remove obviously dangerous patterns
            cleaned = html_content
            for pattern in self.dangerous_patterns:
                cleaned = pattern.sub('', cleaned)

            # Second pass: use bleach for comprehensive sanitization
            sanitized = bleach.clean(
                cleaned,
                tags=self.allowed_tags,
                attributes=self.allowed_attributes,
                protocols=self.allowed_protocols,
                strip=True,
                strip_comments=True
            )
            return sanitized
        except Exception as e:
            logger.error(f"Error sanitizing HTML: {e}")
            # If sanitization fails, return an empty string for safety
            return ""

    def sanitize_text(self, text: Optional[str]) -> str:
        """Enhanced text sanitization with better security."""
        if not text:
            return "No title"

        # Basic cleaning
        sanitized = text.strip()

        # Remove null bytes and other control characters
        sanitized = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', sanitized)

        # Remove dangerous patterns (case-insensitive)
        for pattern in Config.DANGEROUS_PATTERNS:
            sanitized = re.sub(re.escape(pattern), '', sanitized, flags=re.IGNORECASE)

        # Limit length
        sanitized = sanitized[:Config.MAX_TITLE_LENGTH]

        # Collapse excessive whitespace
        sanitized = re.sub(r'\s+', ' ', sanitized).strip()

        return sanitized if sanitized else "No title"

    def validate_url_security(self, url: str) -> bool:
        """Enhanced URL validation for security."""
        if not url:
            return False

        # Check for dangerous protocols
        dangerous_protocols = ['javascript:', 'vbscript:', 'data:', 'file:', 'ftp:']
        url_lower = url.lower()
        for protocol in dangerous_protocols:
            if url_lower.startswith(protocol):
                logger.warning(f"Blocked dangerous protocol in URL: {url}")
                return False

        # Check for suspicious patterns
        suspicious_patterns = [
            r'\.\./',        # path traversal
            r'%2e%2e%2f',    # encoded path traversal
            r'<script',      # script injection
            r'javascript:',  # JavaScript protocol
            r'vbscript:',    # VBScript protocol
        ]
        for pattern in suspicious_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                logger.warning(f"Blocked suspicious pattern in URL: {url}")
                return False

        # Check URL length (prevent buffer overflow attacks)
        if len(url) > 2048:
            logger.warning(f"Blocked excessively long URL (length: {len(url)})")
            return False

        return True

    def sanitize_filename(self, filename: str) -> str:
        """Sanitize filenames to prevent directory traversal and injection."""
        if not filename:
            return "default"

        # Replace path separators and dangerous characters
        sanitized = re.sub(r'[<>:"|?*\\/]', '_', filename)

        # Remove null bytes and control characters
        sanitized = re.sub(r'[\x00-\x1F\x7F]', '', sanitized)

        # Remove leading/trailing dots and spaces
        sanitized = sanitized.strip('. ')

        # Prevent reserved Windows filenames
        reserved_names = [
            'CON', 'PRN', 'AUX', 'NUL',
            'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
            'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
        ]
        if sanitized.upper() in reserved_names:
            sanitized = f"file_{sanitized}"

        # Limit length
        sanitized = sanitized[:255]

        return sanitized if sanitized else "default"


# Global instances shared by the module-level helper functions
_rate_limiter = RateLimiter(requests_per_minute=30)
_sanitizer = ContentSanitizer()


def wait_for_rate_limit() -> None:
    """Apply rate limiting."""
    _rate_limiter.wait_if_needed()


def sanitize_html_content(html: str) -> str:
    """Sanitize HTML content."""
    return _sanitizer.sanitize_html(html)


def sanitize_text_content(text: Optional[str]) -> str:
    """Sanitize text content."""
    return _sanitizer.sanitize_text(text)


def validate_url_security(url: str) -> bool:
    """Validate URL for security."""
    return _sanitizer.validate_url_security(url)


def sanitize_filename(filename: str) -> str:
    """Sanitize filename."""
    return _sanitizer.sanitize_filename(filename)