- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures, including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETag and Last-Modified headers for efficiency (see the sketch below)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
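The HTTP-caching bullet refers to standard conditional requests (ETag / If-None-Match, Last-Modified / If-Modified-Since). The project's own implementation lives in `src.rss_scraper.scraper` and is not shown on this page; the sketch below is only a minimal illustration of that pattern, using the `requests` library and a hypothetical in-memory cache.

```python
import requests

# Hypothetical in-memory cache: url -> (etag, last_modified, body)
_cache = {}

def fetch_with_conditional_get(url: str) -> str:
    """Fetch a page, revalidating any cached copy via ETag/Last-Modified."""
    headers = {}
    cached = _cache.get(url)
    if cached:
        etag, last_modified, _ = cached
        if etag:
            headers["If-None-Match"] = etag               # revalidate by ETag
        if last_modified:
            headers["If-Modified-Since"] = last_modified  # revalidate by date
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 304 and cached:
        return cached[2]  # unchanged: reuse cached body, skip the download
    response.raise_for_status()
    _cache[url] = (
        response.headers.get("ETag"),
        response.headers.get("Last-Modified"),
        response.text,
    )
    return response.text
```

A server that honours these validators answers an unchanged page with an empty `304` response, so repeated polls cost a round trip but no re-download.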
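"""Command-line entry point for the Warhammer Community RSS scraper.

Parses CLI arguments, configures logging, and runs the scrape -> parse ->
RSS-generation pipeline provided by the ``src.rss_scraper`` package.
"""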
import os
import sys
import logging
import argparse
from typing import Optional

from src.rss_scraper.config import Config
from src.rss_scraper.exceptions import (
    ValidationError, NetworkError, PageLoadError,
    ContentSizeError, ParseError, FileOperationError
)
from src.rss_scraper.validation import validate_url
from src.rss_scraper.scraper import load_page_with_retry, clear_cache, get_cache_info
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('scraper.log')
    ]
)
logger = logging.getLogger(__name__)
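
# NOTE: this module-level logging setup is replaced by setup_logging() once
# command-line arguments are parsed in the __main__ block below.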


def parse_arguments() -> argparse.Namespace:
    """Parse and validate command line arguments."""
    parser = argparse.ArgumentParser(
        description='RSS scraper for Warhammer Community website',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        '--url',
        type=str,
        default=Config.DEFAULT_URL,
        help='URL to scrape for articles'
    )

    parser.add_argument(
        '--output-dir',
        type=str,
        default=None,
        help='Output directory for RSS feed and HTML files'
    )

    parser.add_argument(
        '--max-scroll',
        type=int,
        default=Config.MAX_SCROLL_ITERATIONS,
        help='Maximum number of scroll iterations'
    )

    parser.add_argument(
        '--log-level',
        type=str,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        default='INFO',
        help='Logging level'
    )

    parser.add_argument(
        '--log-file',
        type=str,
        default='scraper.log',
        help='Log file path'
    )

    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable content caching'
    )

    parser.add_argument(
        '--clear-cache',
        action='store_true',
        help='Clear cache before running'
    )

    parser.add_argument(
        '--cache-info',
        action='store_true',
        help='Show cache information and exit'
    )

    args = parser.parse_args()

    # Validate arguments
    if args.max_scroll < 0:
        parser.error("--max-scroll must be non-negative")

    if args.output_dir and not os.path.isabs(args.output_dir):
        # Convert relative path to absolute
        args.output_dir = os.path.abspath(args.output_dir)

    return args
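
# Example invocations (script name and values are placeholders, for reference only):
#   python <this_script>.py --max-scroll 5 --log-level DEBUG
#   python <this_script>.py --no-cache --output-dir ./feeds
#   python <this_script>.py --cache-info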


def setup_logging(log_level: str, log_file: str) -> None:
    """Setup logging configuration."""
    # Clear any existing handlers
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    # Set up new configuration
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(log_file)
        ]
    )


# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url: str, output_dir: Optional[str] = None, use_cache: bool = True) -> None:
    """Scrape articles from the given URL and generate an RSS feed.

    Args:
        url: Page to scrape; validated before any network access.
        output_dir: Directory for the RSS feed and debug HTML
            (defaults to Config.DEFAULT_OUTPUT_DIR).
        use_cache: Whether to reuse cached page content.
    """
    logger.info(f"Starting scrape of {url}")

    # Validate URL first
    validate_url(url)

    # Set default output directory if not provided
    if output_dir is None:
        output_dir = Config.DEFAULT_OUTPUT_DIR

    logger.info(f"Using output directory: {output_dir}")
    logger.info(f"Caching {'enabled' if use_cache else 'disabled'}")

    # Load page content with retry logic
    html = load_page_with_retry(url, use_cache=use_cache)

    # Extract articles from HTML
    articles = extract_articles_from_html(html, url)

    # Generate RSS feed
    rss_content = generate_rss_feed(articles, url)

    # Save RSS feed and debug HTML
    save_rss_feed(rss_content, output_dir)
    save_debug_html(html, output_dir)

    logger.info(f'RSS feed generated successfully with {len(articles)} articles')
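
# Script entry point: parse arguments, configure logging, run the scrape
# pipeline, and map known failure types to distinct exit codes.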
if __name__ == "__main__":
    try:
        # Parse command line arguments
        args = parse_arguments()

        # Setup logging with parsed arguments
        setup_logging(args.log_level, args.log_file)

        # Re-get logger after setup
        logger = logging.getLogger(__name__)

        # Handle cache operations first
        if args.cache_info:
            cache_info = get_cache_info()
            print(f"Cache file: {cache_info['cache_file']}")
            print(f"ETag file: {cache_info['etag_file']}")
            print(f"Cache entries: {cache_info['cache_entries']}")
            print(f"ETag entries: {cache_info['etag_entries']}")
            print(f"Cache size: {cache_info['cache_size_bytes']} bytes")
            sys.exit(0)

        if args.clear_cache:
            logger.info("Clearing cache...")
            clear_cache()
            logger.info("Cache cleared successfully")

        # Validate configuration
        Config.validate_config()
        logger.info("Configuration validation passed")

        # Determine output directory: the --output-dir flag takes precedence,
        # then the OUTPUT_DIR environment variable, then Config.DEFAULT_OUTPUT_DIR
        output_dir = args.output_dir or os.getenv('OUTPUT_DIR') or Config.DEFAULT_OUTPUT_DIR

        logger.info(f"Starting RSS scraper with URL: {args.url}")
        logger.info(f"Output directory: {output_dir}")
        logger.info(f"Max scroll iterations: {args.max_scroll}")

        # Temporarily override config if max_scroll was provided
        if args.max_scroll != Config.MAX_SCROLL_ITERATIONS:
            Config.MAX_SCROLL_ITERATIONS = args.max_scroll
            logger.info(f"Overriding max scroll iterations to: {args.max_scroll}")

        # Run the scrape-and-generate pipeline
        use_cache = not args.no_cache
        scrape_and_generate_rss(args.url, output_dir, use_cache)
        logger.info("RSS scraping completed successfully")
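
    # Exit codes: 1 = argument/configuration/validation error, 2 = network error,
    # 3 = page load error, 4 = parse error, 5 = file operation error,
    # 6 = content size error, 99 = unexpected error.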
    except argparse.ArgumentError as e:
        print(f"Argument error: {e}", file=sys.stderr)
        sys.exit(1)
    except (ValueError, ValidationError) as e:
        print(f"Configuration/Validation error: {e}", file=sys.stderr)
        sys.exit(1)
    except PageLoadError as e:
        logger.error(f"Page loading error: {e}")
        sys.exit(3)
    except NetworkError as e:
        logger.error(f"Network error: {e}")
        sys.exit(2)
    except ParseError as e:
        logger.error(f"Content parsing error: {e}")
        sys.exit(4)
    except FileOperationError as e:
        logger.error(f"File operation error: {e}")
        sys.exit(5)
    except ContentSizeError as e:
        logger.error(f"Content size error: {e}")
        sys.exit(6)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        sys.exit(99)