Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency (see the sketch below)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
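For reference, the ETag / Last-Modified caching listed above is the standard HTTP conditional-GET pattern. A minimal sketch of the idea, assuming the requests library; the function name and in-memory cache here are illustrative, not the actual caching module's API:

import requests

# Illustrative in-memory cache: url -> (etag, last_modified, body).
_cache: dict = {}

def fetch_if_changed(url: str) -> bytes:
    """Fetch url, sending validators so an unchanged feed returns 304."""
    headers = {}
    if url in _cache:
        etag, last_modified, _ = _cache[url]
        if etag:
            headers['If-None-Match'] = etag           # revalidate by ETag
        if last_modified:
            headers['If-Modified-Since'] = last_modified
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 304:                   # Not Modified: reuse cached body
        return _cache[url][2]
    _cache[url] = (response.headers.get('ETag'),
                   response.headers.get('Last-Modified'),
                   response.content)
    return response.content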
src/rss_scraper/security.py (new file, 236 lines)
@@ -0,0 +1,236 @@
"""Security utilities for content sanitization and rate limiting."""

import time
import logging
import re
from typing import Optional, Dict, Any
from datetime import datetime, timedelta

import bleach

from .config import Config

logger = logging.getLogger(__name__)


class RateLimiter:
    """Rate limiter to prevent excessive requests."""

    def __init__(self, requests_per_minute: int = 30):
        self.requests_per_minute = requests_per_minute
        self.request_times: list = []
        self.min_delay_seconds = 60.0 / requests_per_minute
        self.last_request_time: Optional[float] = None

    def wait_if_needed(self) -> None:
        """Wait if necessary to respect rate limits."""
        current_time = time.time()

        # Clean old request times (older than 1 minute)
        cutoff_time = current_time - 60
        self.request_times = [t for t in self.request_times if t > cutoff_time]

        # Check if we've hit the rate limit
        if len(self.request_times) >= self.requests_per_minute:
            sleep_time = 60 - (current_time - self.request_times[0])
            if sleep_time > 0:
                logger.info(f"Rate limit reached, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

        # Ensure minimum delay between requests
        if self.last_request_time:
            time_since_last = current_time - self.last_request_time
            if time_since_last < self.min_delay_seconds:
                sleep_time = self.min_delay_seconds - time_since_last
                logger.debug(f"Enforcing minimum delay, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

        # Record this request
        self.request_times.append(time.time())
        self.last_request_time = time.time()
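
# --- Editor's note: illustrative usage, not part of the original commit. ---
# A single shared RateLimiter both caps requests per rolling minute and
# enforces the per-request minimum delay. `fetch` is a hypothetical stand-in
# for the scraper's real request function.
def _demo_rate_limited_fetch(urls: list, fetch) -> None:
    limiter = RateLimiter(requests_per_minute=30)
    for url in urls:
        limiter.wait_if_needed()  # sleeps if the 30/min cap or 2 s spacing requires it
        fetch(url)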


class ContentSanitizer:
    """Enhanced content sanitization for security."""

    def __init__(self):
        # Allowed HTML tags for RSS content (including structural elements for parsing)
        self.allowed_tags = [
            'p', 'br', 'strong', 'em', 'b', 'i', 'u', 'span',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'blockquote',
            'div', 'article', 'section', 'header', 'footer', 'main', 'nav',
            'a', 'img', 'figure', 'figcaption', 'time'
        ]

        # Allowed attributes
        self.allowed_attributes = {
            '*': ['class', 'id'],
            'a': ['href', 'title', 'class'],
            'img': ['src', 'alt', 'title', 'width', 'height', 'class'],
            'time': ['datetime', 'class'],
            'div': ['class', 'id'],
            'article': ['class', 'id'],
            'section': ['class', 'id']
        }

        # Protocols allowed in URLs
        self.allowed_protocols = ['http', 'https']

        # Dangerous patterns to remove (pre-compiled for performance)
        self.dangerous_patterns = [
            re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<iframe[^>]*>.*?</iframe>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<object[^>]*>.*?</object>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<embed[^>]*>.*?</embed>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<applet[^>]*>.*?</applet>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<form[^>]*>.*?</form>', re.IGNORECASE | re.DOTALL),
            re.compile(r'javascript:', re.IGNORECASE),
            re.compile(r'vbscript:', re.IGNORECASE),
            re.compile(r'data:', re.IGNORECASE),
            re.compile(r'on\w+\s*=', re.IGNORECASE),  # event handlers like onclick, onload, etc.
        ]

    def sanitize_html(self, html_content: str) -> str:
        """Sanitize HTML content using bleach library."""
        if not html_content:
            return ""

        try:
            # First pass: remove obviously dangerous patterns
            cleaned = html_content
            for pattern in self.dangerous_patterns:
                cleaned = pattern.sub('', cleaned)

            # Second pass: use bleach for comprehensive sanitization
            sanitized = bleach.clean(
                cleaned,
                tags=self.allowed_tags,
                attributes=self.allowed_attributes,
                protocols=self.allowed_protocols,
                strip=True,
                strip_comments=True
            )

            return sanitized

        except Exception as e:
            logger.error(f"Error sanitizing HTML: {e}")
            # If sanitization fails, return empty string for safety
            return ""

    def sanitize_text(self, text: Optional[str]) -> str:
        """Enhanced text sanitization with better security."""
        if not text:
            return "No title"

        # Basic cleaning
        sanitized = text.strip()

        # Remove null bytes and other control characters
        sanitized = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', sanitized)

        # Remove dangerous patterns (case insensitive)
        for pattern in Config.DANGEROUS_PATTERNS:
            sanitized = re.sub(re.escape(pattern), '', sanitized, flags=re.IGNORECASE)

        # Limit length
        sanitized = sanitized[:Config.MAX_TITLE_LENGTH]

        # Remove excessive whitespace
        sanitized = re.sub(r'\s+', ' ', sanitized).strip()

        return sanitized if sanitized else "No title"
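
    # Illustrative behaviour (editor's sketch, not part of the original commit),
    # assuming Config.DANGEROUS_PATTERNS matches nothing in the input:
    #
    #     >>> ContentSanitizer().sanitize_text('  Broken\x00 \n Title  ')
    #     'Broken Title'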

    def validate_url_security(self, url: str) -> bool:
        """Enhanced URL validation for security."""
        if not url:
            return False

        # Check for dangerous protocols
        dangerous_protocols = ['javascript:', 'vbscript:', 'data:', 'file:', 'ftp:']
        url_lower = url.lower()

        for protocol in dangerous_protocols:
            if url_lower.startswith(protocol):
                logger.warning(f"Blocked dangerous protocol in URL: {url}")
                return False

        # Check for suspicious patterns
        suspicious_patterns = [
            r'\.\./',        # Path traversal
            r'%2e%2e%2f',    # Encoded path traversal
            r'<script',      # Script injection
            r'javascript:',  # JavaScript protocol
            r'vbscript:',    # VBScript protocol
        ]

        for pattern in suspicious_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                logger.warning(f"Blocked suspicious pattern in URL: {url}")
                return False

        # Check URL length (prevent buffer overflow attacks)
        if len(url) > 2048:
            logger.warning(f"Blocked excessively long URL (length: {len(url)})")
            return False

        return True
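
    # Illustrative behaviour (editor's sketch, not part of the original commit):
    #
    #     >>> s = ContentSanitizer()
    #     >>> s.validate_url_security('https://example.com/feed.xml')
    #     True
    #     >>> s.validate_url_security('javascript:alert(1)')              # dangerous protocol
    #     False
    #     >>> s.validate_url_security('https://example.com/../../secret') # path traversal
    #     False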

    def sanitize_filename(self, filename: str) -> str:
        """Sanitize filenames to prevent directory traversal and injection."""
        if not filename:
            return "default"

        # Remove path separators and dangerous characters
        sanitized = re.sub(r'[<>:"|?*\\/]', '_', filename)

        # Remove null bytes and control characters
        sanitized = re.sub(r'[\x00-\x1F\x7F]', '', sanitized)

        # Remove leading/trailing dots and spaces
        sanitized = sanitized.strip('. ')

        # Prevent reserved Windows filenames
        reserved_names = [
            'CON', 'PRN', 'AUX', 'NUL',
            'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
            'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
        ]

        if sanitized.upper() in reserved_names:
            sanitized = f"file_{sanitized}"

        # Limit length
        sanitized = sanitized[:255]

        return sanitized if sanitized else "default"
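
# --- Editor's note: illustrative check, not part of the original commit. ---
# Hypothetical smoke test for filename sanitization; the expected values
# follow from the substitutions above.
def _demo_sanitize_filename() -> None:
    s = ContentSanitizer()
    assert s.sanitize_filename('a/b\\c.txt') == 'a_b_c.txt'  # separators replaced
    assert s.sanitize_filename('CON') == 'file_CON'          # reserved Windows name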

# Global instances
_rate_limiter = RateLimiter(requests_per_minute=30)
_sanitizer = ContentSanitizer()


def wait_for_rate_limit() -> None:
    """Apply rate limiting."""
    _rate_limiter.wait_if_needed()


def sanitize_html_content(html: str) -> str:
    """Sanitize HTML content."""
    return _sanitizer.sanitize_html(html)


def sanitize_text_content(text: Optional[str]) -> str:
    """Sanitize text content."""
    return _sanitizer.sanitize_text(text)


def validate_url_security(url: str) -> bool:
    """Validate URL for security."""
    return _sanitizer.validate_url_security(url)


def sanitize_filename(filename: str) -> str:
    """Sanitize filename."""
    return _sanitizer.sanitize_filename(filename)
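
# --- Editor's note: illustrative usage, not part of the original commit. ---
# A hypothetical caller exercising the convenience wrappers; the expected
# values assume Config.DANGEROUS_PATTERNS matches nothing in the inputs.
def _demo_module_helpers() -> None:
    assert sanitize_filename('CON') == 'file_CON'
    assert validate_url_security('javascript:void(0)') is False
    assert sanitize_text_content('  A\x00 Title ') == 'A Title'
    wait_for_rate_limit()  # throttles via the shared _rate_limiter singleton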