# Project notes (from the originating commit message):
# - Modular architecture with separate modules for scraping, parsing,
#   security, validation, and caching
# - Comprehensive security measures: HTML sanitization, rate limiting,
#   and input validation
# - Robust error handling with custom exceptions and retry logic
# - HTTP caching with ETags and Last-Modified headers for efficiency
# - Pre-compiled regex patterns for improved performance
# - Comprehensive test suite (66 tests); Docker support; env-var config
# - Working parser that extracts 32 articles from Warhammer Community
#
# File metadata: security.py — 236 lines, 8.4 KiB, Python.
"""Security utilities for content sanitization and rate limiting."""
|
|
|
|
import time
|
|
import logging
|
|
import re
|
|
from typing import Optional, Dict, Any
|
|
from datetime import datetime, timedelta
|
|
import bleach
|
|
|
|
from .config import Config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class RateLimiter:
    """Rate limiter to prevent excessive requests.

    Enforces two constraints by sleeping inside :meth:`wait_if_needed`:
    a rolling cap of ``requests_per_minute`` requests in any 60-second
    window, and a minimum spacing of ``60 / requests_per_minute`` seconds
    between consecutive requests.
    """

    def __init__(self, requests_per_minute: int = 30):
        """Create a limiter allowing *requests_per_minute* requests.

        Args:
            requests_per_minute: Maximum number of requests permitted in
                any rolling 60-second window.
        """
        self.requests_per_minute = requests_per_minute
        # Timestamps (time.time()) of requests issued in the last minute.
        self.request_times: list = []
        # Minimum spacing between consecutive requests, in seconds.
        self.min_delay_seconds = 60.0 / requests_per_minute
        # Timestamp of the most recent request, or None before the first one.
        self.last_request_time: Optional[float] = None

    def wait_if_needed(self) -> None:
        """Block (sleep) as long as needed to respect the rate limits."""
        current_time = time.time()

        # Drop request timestamps that have aged out of the 1-minute window.
        cutoff_time = current_time - 60
        self.request_times = [t for t in self.request_times if t > cutoff_time]

        # If the window is full, sleep until the oldest request ages out.
        if len(self.request_times) >= self.requests_per_minute:
            sleep_time = 60 - (current_time - self.request_times[0])
            if sleep_time > 0:
                logger.info(f"Rate limit reached, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)
                # Bug fix: refresh the clock after sleeping. The previous
                # code reused the pre-sleep timestamp, which made the
                # minimum-delay check below see a stale "time since last
                # request" and sleep a second, unnecessary time.
                current_time = time.time()

        # Enforce the minimum spacing since the previous request.
        # ("is not None" rather than truthiness: a timestamp of 0.0 is
        # falsy but would still be a real previous request.)
        if self.last_request_time is not None:
            time_since_last = current_time - self.last_request_time
            if time_since_last < self.min_delay_seconds:
                sleep_time = self.min_delay_seconds - time_since_last
                logger.debug(f"Enforcing minimum delay, sleeping for {sleep_time:.2f} seconds")
                time.sleep(sleep_time)

        # Record this request once, so the window entry and
        # last_request_time are exactly the same timestamp.
        now = time.time()
        self.request_times.append(now)
        self.last_request_time = now
|
|
|
|
|
|
class ContentSanitizer:
    """Enhanced content sanitization for security.

    Combines a pre-compiled regex pre-pass (stripping script/iframe/etc.
    containers and script-protocol fragments) with a bleach allowlist
    clean, plus helpers for plain text, URL, and filename sanitization.
    Methods fail closed: on error, HTML sanitization returns "".
    """

    def __init__(self):
        # Allowed HTML tags for RSS content (including structural elements for parsing)
        self.allowed_tags = [
            'p', 'br', 'strong', 'em', 'b', 'i', 'u', 'span',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'blockquote',
            'div', 'article', 'section', 'header', 'footer', 'main', 'nav',
            'a', 'img', 'figure', 'figcaption', 'time'
        ]

        # Allowed attributes, keyed by tag ('*' applies to every tag).
        # Anything not listed here is stripped by bleach.clean below.
        self.allowed_attributes = {
            '*': ['class', 'id'],
            'a': ['href', 'title', 'class'],
            'img': ['src', 'alt', 'title', 'width', 'height', 'class'],
            'time': ['datetime', 'class'],
            'div': ['class', 'id'],
            'article': ['class', 'id'],
            'section': ['class', 'id']
        }

        # Protocols allowed in URLs (bleach drops href/src with any other
        # scheme, e.g. javascript: or data:).
        self.allowed_protocols = ['http', 'https']

        # Dangerous patterns to remove (pre-compiled for performance).
        # DOTALL lets the container patterns match multi-line bodies.
        # NOTE: the bare 'data:' pattern also strips benign data: URIs
        # (e.g. inline images) — this appears intentional (fail closed).
        self.dangerous_patterns = [
            re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<iframe[^>]*>.*?</iframe>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<object[^>]*>.*?</object>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<embed[^>]*>.*?</embed>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<applet[^>]*>.*?</applet>', re.IGNORECASE | re.DOTALL),
            re.compile(r'<form[^>]*>.*?</form>', re.IGNORECASE | re.DOTALL),
            re.compile(r'javascript:', re.IGNORECASE),
            re.compile(r'vbscript:', re.IGNORECASE),
            re.compile(r'data:', re.IGNORECASE),
            re.compile(r'on\w+\s*=', re.IGNORECASE),  # event handlers like onclick, onload, etc.
        ]

    def sanitize_html(self, html_content: str) -> str:
        """Sanitize HTML content using bleach library.

        Args:
            html_content: Raw (untrusted) HTML; may be empty/None-ish.

        Returns:
            Sanitized HTML, or "" for empty input or on any error
            (fail-closed: never returns unsanitized content).
        """
        if not html_content:
            return ""

        try:
            # First pass: remove obviously dangerous patterns
            cleaned = html_content
            for pattern in self.dangerous_patterns:
                cleaned = pattern.sub('', cleaned)

            # Second pass: use bleach for comprehensive sanitization
            # (strip=True removes disallowed tags instead of escaping them).
            sanitized = bleach.clean(
                cleaned,
                tags=self.allowed_tags,
                attributes=self.allowed_attributes,
                protocols=self.allowed_protocols,
                strip=True,
                strip_comments=True
            )

            return sanitized

        except Exception as e:
            logger.error(f"Error sanitizing HTML: {e}")
            # If sanitization fails, return empty string for safety
            return ""

    def sanitize_text(self, text: Optional[str]) -> str:
        """Enhanced text sanitization with better security.

        Intended for article titles: empty/blank input yields the
        placeholder "No title", and output is capped at
        Config.MAX_TITLE_LENGTH characters.

        Args:
            text: Raw text, possibly None.

        Returns:
            Cleaned, length-limited text, or "No title" if nothing remains.
        """
        if not text:
            return "No title"

        # Basic cleaning
        sanitized = text.strip()

        # Remove null bytes and other control characters
        # (keeps \t \n \r, which are later collapsed by the \s+ pass).
        sanitized = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', sanitized)

        # Remove dangerous patterns (case insensitive).
        # re.escape means Config.DANGEROUS_PATTERNS entries are treated as
        # literal substrings, not regexes — TODO confirm that is intended.
        for pattern in Config.DANGEROUS_PATTERNS:
            sanitized = re.sub(re.escape(pattern), '', sanitized, flags=re.IGNORECASE)

        # Limit length
        sanitized = sanitized[:Config.MAX_TITLE_LENGTH]

        # Remove excessive whitespace
        sanitized = re.sub(r'\s+', ' ', sanitized).strip()

        return sanitized if sanitized else "No title"

    def validate_url_security(self, url: str) -> bool:
        """Enhanced URL validation for security.

        Rejects dangerous protocols, path-traversal/injection patterns,
        and URLs longer than 2048 characters. Rejections are logged at
        WARNING level.

        Args:
            url: URL string to validate.

        Returns:
            True if the URL passes all checks, False otherwise.
        """
        if not url:
            return False

        # Check for dangerous protocols
        dangerous_protocols = ['javascript:', 'vbscript:', 'data:', 'file:', 'ftp:']
        url_lower = url.lower()

        for protocol in dangerous_protocols:
            if url_lower.startswith(protocol):
                logger.warning(f"Blocked dangerous protocol in URL: {url}")
                return False

        # Check for suspicious patterns
        suspicious_patterns = [
            r'\.\./',  # Path traversal
            r'%2e%2e%2f',  # Encoded path traversal
            r'<script',  # Script injection
            r'javascript:',  # JavaScript protocol
            r'vbscript:',  # VBScript protocol
        ]

        for pattern in suspicious_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                logger.warning(f"Blocked suspicious pattern in URL: {url}")
                return False

        # Check URL length (prevent buffer overflow attacks)
        if len(url) > 2048:
            logger.warning(f"Blocked excessively long URL (length: {len(url)})")
            return False

        return True

    def sanitize_filename(self, filename: str) -> str:
        """Sanitize filenames to prevent directory traversal and injection.

        Args:
            filename: Proposed filename (no directory components expected).

        Returns:
            A safe filename; "default" if input is empty or nothing
            survives sanitization. Result is at most 255 characters.
        """
        if not filename:
            return "default"

        # Remove path separators and dangerous characters
        sanitized = re.sub(r'[<>:"|?*\\/]', '_', filename)

        # Remove null bytes and control characters
        sanitized = re.sub(r'[\x00-\x1F\x7F]', '', sanitized)

        # Remove leading/trailing dots and spaces
        sanitized = sanitized.strip('. ')

        # Prevent reserved Windows filenames
        # (exact-name match only; e.g. "CON.txt" is not renamed —
        # NOTE(review): Windows also reserves those stems with extensions.)
        reserved_names = [
            'CON', 'PRN', 'AUX', 'NUL',
            'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
            'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
        ]

        if sanitized.upper() in reserved_names:
            sanitized = f"file_{sanitized}"

        # Limit length (common filesystem filename limit)
        sanitized = sanitized[:255]

        return sanitized if sanitized else "default"
|
|
|
|
|
|
# Global instances shared by the module-level convenience wrappers below.
# Single rate limiter (30 requests/minute) so all callers share one budget.
_rate_limiter = RateLimiter(requests_per_minute=30)
# Single sanitizer instance; its compiled regexes are built once here.
_sanitizer = ContentSanitizer()
|
|
|
|
|
|
def wait_for_rate_limit() -> None:
    """Apply the module-wide request rate limit, sleeping if required."""
    limiter = _rate_limiter
    limiter.wait_if_needed()
|
|
|
|
|
|
def sanitize_html_content(html: str) -> str:
    """Clean untrusted HTML via the shared ContentSanitizer instance."""
    cleaned = _sanitizer.sanitize_html(html)
    return cleaned
|
|
|
|
|
|
def sanitize_text_content(text: Optional[str]) -> str:
    """Clean plain text via the shared ContentSanitizer instance."""
    result = _sanitizer.sanitize_text(text)
    return result
|
|
|
|
|
|
def validate_url_security(url: str) -> bool:
    """Check a URL against the shared sanitizer's security rules."""
    is_safe = _sanitizer.validate_url_security(url)
    return is_safe
|
|
|
|
|
|
def sanitize_filename(filename: str) -> str:
    """Produce a filesystem-safe filename via the shared sanitizer."""
    safe_name = _sanitizer.sanitize_filename(filename)
    return safe_name