Phil 25086fc01b Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency (see the sketch after this list)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community
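
As a rough illustration of the conditional-request pattern behind the ETag/Last-Modified caching item above: this is a hedged sketch only, not the repository's actual src/rss_scraper caching code (whose page loading goes through Playwright, per the comment in the script below). The requests dependency, the ETAG_FILE path, and the fetch_with_conditional_get name are all made up for the example.

# Illustrative only; names and file locations here are hypothetical.
import json
import os

import requests


ETAG_FILE = "etag_cache.json"  # hypothetical location for stored validators


def fetch_with_conditional_get(url: str) -> requests.Response:
    """GET a URL, sending If-None-Match / If-Modified-Since from the last run."""
    cache = {}
    if os.path.exists(ETAG_FILE):
        with open(ETAG_FILE) as f:
            cache = json.load(f)

    headers = {}
    entry = cache.get(url, {})
    if entry.get("etag"):
        headers["If-None-Match"] = entry["etag"]
    if entry.get("last_modified"):
        headers["If-Modified-Since"] = entry["last_modified"]

    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 304:
        # Fresh content: remember the new validators for the next run
        cache[url] = {
            "etag": response.headers.get("ETag"),
            "last_modified": response.headers.get("Last-Modified"),
        }
        with open(ETAG_FILE, "w") as f:
            json.dump(cache, f)
    # On 304 the caller reuses the body it cached previously
    return response

A 304 response lets the scraper skip re-downloading and re-parsing an unchanged page, which is the efficiency the bullet refers to.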

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00

221 lines
6.8 KiB
Python

import os
import sys
import logging
import argparse
from typing import Optional

from src.rss_scraper.config import Config
from src.rss_scraper.exceptions import (
    ValidationError, NetworkError, PageLoadError,
    ContentSizeError, ParseError, FileOperationError
)
from src.rss_scraper.validation import validate_url
from src.rss_scraper.scraper import load_page_with_retry, clear_cache, get_cache_info
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('scraper.log')
    ]
)

logger = logging.getLogger(__name__)


def parse_arguments() -> argparse.Namespace:
    """Parse and validate command line arguments."""
    parser = argparse.ArgumentParser(
        description='RSS scraper for Warhammer Community website',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--url',
        type=str,
        default=Config.DEFAULT_URL,
        help='URL to scrape for articles'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default=None,
        help='Output directory for RSS feed and HTML files'
    )
    parser.add_argument(
        '--max-scroll',
        type=int,
        default=Config.MAX_SCROLL_ITERATIONS,
        help='Maximum number of scroll iterations'
    )
    parser.add_argument(
        '--log-level',
        type=str,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        default='INFO',
        help='Logging level'
    )
    parser.add_argument(
        '--log-file',
        type=str,
        default='scraper.log',
        help='Log file path'
    )
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable content caching'
    )
    parser.add_argument(
        '--clear-cache',
        action='store_true',
        help='Clear cache before running'
    )
    parser.add_argument(
        '--cache-info',
        action='store_true',
        help='Show cache information and exit'
    )

    args = parser.parse_args()

    # Validate arguments
    if args.max_scroll < 0:
        parser.error("--max-scroll must be non-negative")

    if args.output_dir and not os.path.isabs(args.output_dir):
        # Convert relative path to absolute
        args.output_dir = os.path.abspath(args.output_dir)

    return args


def setup_logging(log_level: str, log_file: str) -> None:
    """Setup logging configuration."""
    # Clear any existing handlers
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    # Set up new configuration
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(log_file)
        ]
    )


# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url: str, output_dir: Optional[str] = None, use_cache: bool = True) -> None:
    """Main function to scrape articles and generate RSS feed."""
    logger.info(f"Starting scrape of {url}")

    # Validate URL first
    validate_url(url)

    # Set default output directory if not provided
    if output_dir is None:
        output_dir = Config.DEFAULT_OUTPUT_DIR

    logger.info(f"Using output directory: {output_dir}")
    logger.info(f"Caching {'enabled' if use_cache else 'disabled'}")

    # Load page content with retry logic
    html = load_page_with_retry(url, use_cache=use_cache)

    # Extract articles from HTML
    articles = extract_articles_from_html(html, url)

    # Generate RSS feed
    rss_content = generate_rss_feed(articles, url)

    # Save RSS feed and debug HTML
    save_rss_feed(rss_content, output_dir)
    save_debug_html(html, output_dir)

    logger.info(f'RSS feed generated successfully with {len(articles)} articles')


if __name__ == "__main__":
    try:
        # Parse command line arguments
        args = parse_arguments()

        # Setup logging with parsed arguments
        setup_logging(args.log_level, args.log_file)

        # Re-get logger after setup
        logger = logging.getLogger(__name__)

        # Handle cache operations first
        if args.cache_info:
            cache_info = get_cache_info()
            print(f"Cache file: {cache_info['cache_file']}")
            print(f"ETag file: {cache_info['etag_file']}")
            print(f"Cache entries: {cache_info['cache_entries']}")
            print(f"ETag entries: {cache_info['etag_entries']}")
            print(f"Cache size: {cache_info['cache_size_bytes']} bytes")
            sys.exit(0)

        if args.clear_cache:
            logger.info("Clearing cache...")
            clear_cache()
            logger.info("Cache cleared successfully")

        # Validate configuration
        Config.validate_config()
        logger.info("Configuration validation passed")

        # Determine output directory
        output_dir = args.output_dir or os.getenv('OUTPUT_DIR') or Config.DEFAULT_OUTPUT_DIR

        logger.info(f"Starting RSS scraper with URL: {args.url}")
        logger.info(f"Output directory: {output_dir}")
        logger.info(f"Max scroll iterations: {args.max_scroll}")

        # Temporarily override config if max_scroll was provided
        if args.max_scroll != Config.MAX_SCROLL_ITERATIONS:
            Config.MAX_SCROLL_ITERATIONS = args.max_scroll
            logger.info(f"Overriding max scroll iterations to: {args.max_scroll}")

        # Run the function
        use_cache = not args.no_cache
        scrape_and_generate_rss(args.url, output_dir, use_cache)

        logger.info("RSS scraping completed successfully")

    except argparse.ArgumentError as e:
        print(f"Argument error: {e}", file=sys.stderr)
        sys.exit(1)
    except (ValueError, ValidationError) as e:
        print(f"Configuration/Validation error: {e}", file=sys.stderr)
        sys.exit(1)
    except PageLoadError as e:
        logger.error(f"Page loading error: {e}")
        sys.exit(3)
    except NetworkError as e:
        logger.error(f"Network error: {e}")
        sys.exit(2)
    except ParseError as e:
        logger.error(f"Content parsing error: {e}")
        sys.exit(4)
    except FileOperationError as e:
        logger.error(f"File operation error: {e}")
        sys.exit(5)
    except ContentSizeError as e:
        logger.error(f"Content size error: {e}")
        sys.exit(6)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        sys.exit(99)
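
For reference, the entry point above can also be driven programmatically instead of through the CLI. A minimal sketch, assuming the script is saved as main.py at the repository root (the file name is not shown on this page) and that the src.rss_scraper package is importable from there; the ./feeds path is a hypothetical example:

# Hypothetical driver script; "main" refers to the module shown above,
# assuming it is saved as main.py next to the src/ package.
from main import scrape_and_generate_rss
from src.rss_scraper.config import Config

# Roughly equivalent to: python main.py --output-dir ./feeds (without --no-cache)
scrape_and_generate_rss(
    Config.DEFAULT_URL,    # the same default the --url flag falls back to
    output_dir="./feeds",  # hypothetical path; any writable directory works
    use_cache=True,
)

When run from the command line instead, the except blocks above map failures to distinct exit codes: 1 for argument, configuration, or validation problems, 2 for network errors, 3 for page-load errors, 4 for parse errors, 5 for file-operation errors, 6 for content-size errors, and 99 for anything unexpected.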