import os
import sys
import logging
import argparse
from typing import Optional

from src.rss_scraper.config import Config
from src.rss_scraper.exceptions import (
    ValidationError, NetworkError, PageLoadError,
    ContentSizeError, ParseError, FileOperationError
)
from src.rss_scraper.validation import validate_url
from src.rss_scraper.scraper import load_page_with_retry, clear_cache, get_cache_info
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('scraper.log')
    ]
)
logger = logging.getLogger(__name__)


def parse_arguments() -> argparse.Namespace:
    """Parse and validate command line arguments."""
    parser = argparse.ArgumentParser(
        description='RSS scraper for Warhammer Community website',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        '--url',
        type=str,
        default=Config.DEFAULT_URL,
        help='URL to scrape for articles'
    )

    parser.add_argument(
        '--output-dir',
        type=str,
        default=None,
        help='Output directory for RSS feed and HTML files'
    )

    parser.add_argument(
        '--max-scroll',
        type=int,
        default=Config.MAX_SCROLL_ITERATIONS,
        help='Maximum number of scroll iterations'
    )

    parser.add_argument(
        '--log-level',
        type=str,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        default='INFO',
        help='Logging level'
    )

    parser.add_argument(
        '--log-file',
        type=str,
        default='scraper.log',
        help='Log file path'
    )

    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable content caching'
    )

    parser.add_argument(
        '--clear-cache',
        action='store_true',
        help='Clear cache before running'
    )

    parser.add_argument(
        '--cache-info',
        action='store_true',
        help='Show cache information and exit'
    )

    args = parser.parse_args()

    # Validate arguments
    if args.max_scroll < 0:
        parser.error("--max-scroll must be non-negative")

    if args.output_dir and not os.path.isabs(args.output_dir):
        # Convert relative path to absolute
        args.output_dir = os.path.abspath(args.output_dir)

    return args


def setup_logging(log_level: str, log_file: str) -> None:
    """Setup logging configuration."""
    # Clear any existing handlers
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    # Set up new configuration
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(log_file)
        ]
    )


# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url: str, output_dir: Optional[str] = None, use_cache: bool = True) -> None:
    """Main function to scrape articles and generate RSS feed."""
    logger.info(f"Starting scrape of {url}")

    # Validate URL first
    validate_url(url)

    # Set default output directory if not provided
    if output_dir is None:
        output_dir = Config.DEFAULT_OUTPUT_DIR

    logger.info(f"Using output directory: {output_dir}")
    logger.info(f"Caching {'enabled' if use_cache else 'disabled'}")

    # Load page content with retry logic
    html = load_page_with_retry(url, use_cache=use_cache)

    # Extract articles from HTML
    articles = extract_articles_from_html(html, url)

    # Generate RSS feed
    rss_content = generate_rss_feed(articles, url)

    # Save RSS feed and debug HTML
    save_rss_feed(rss_content, output_dir)
    save_debug_html(html, output_dir)

    logger.info(f'RSS feed generated successfully with {len(articles)} articles')


if __name__ == "__main__":
    try:
        # Parse command line arguments
        args = parse_arguments()

        # Setup logging with parsed arguments
        setup_logging(args.log_level, args.log_file)

        # Re-get logger after setup
        logger = logging.getLogger(__name__)

        # Handle cache operations first
        if args.cache_info:
            cache_info = get_cache_info()
            print(f"Cache file: {cache_info['cache_file']}")
            print(f"ETag file: {cache_info['etag_file']}")
            print(f"Cache entries: {cache_info['cache_entries']}")
            print(f"ETag entries: {cache_info['etag_entries']}")
            print(f"Cache size: {cache_info['cache_size_bytes']} bytes")
            sys.exit(0)

        if args.clear_cache:
            logger.info("Clearing cache...")
            clear_cache()
            logger.info("Cache cleared successfully")

        # Validate configuration
        Config.validate_config()
        logger.info("Configuration validation passed")

        # Determine output directory
        output_dir = args.output_dir or os.getenv('OUTPUT_DIR') or Config.DEFAULT_OUTPUT_DIR

        logger.info(f"Starting RSS scraper with URL: {args.url}")
        logger.info(f"Output directory: {output_dir}")
        logger.info(f"Max scroll iterations: {args.max_scroll}")

        # Temporarily override config if max_scroll was provided
        if args.max_scroll != Config.MAX_SCROLL_ITERATIONS:
            Config.MAX_SCROLL_ITERATIONS = args.max_scroll
            logger.info(f"Overriding max scroll iterations to: {args.max_scroll}")

        # Run the function
        use_cache = not args.no_cache
        scrape_and_generate_rss(args.url, output_dir, use_cache)

        logger.info("RSS scraping completed successfully")

    except argparse.ArgumentError as e:
        print(f"Argument error: {e}", file=sys.stderr)
        sys.exit(1)
    except (ValueError, ValidationError) as e:
        print(f"Configuration/Validation error: {e}", file=sys.stderr)
        sys.exit(1)
    except PageLoadError as e:
        logger.error(f"Page loading error: {e}")
        sys.exit(3)
    except NetworkError as e:
        logger.error(f"Network error: {e}")
        sys.exit(2)
    except ParseError as e:
        logger.error(f"Content parsing error: {e}")
        sys.exit(4)
    except FileOperationError as e:
        logger.error(f"File operation error: {e}")
        sys.exit(5)
    except ContentSizeError as e:
        logger.error(f"Content size error: {e}")
        sys.exit(6)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        sys.exit(99)
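
# Usage examples (illustrative; the entry-point filename "main.py" is an
# assumption, but the flags below match the argparse definitions above):
#
#   python main.py                              # scrape Config.DEFAULT_URL with caching enabled
#   python main.py --no-cache --log-level DEBUG
#   python main.py --output-dir ./feeds --max-scroll 5
#   python main.py --cache-info                 # print cache statistics and exit
#   python main.py --clear-cache                # drop cached content before scraping
#
# Exit codes, per the except clauses above: 0 success, 1 configuration/validation
# or argument error, 2 network error, 3 page load error, 4 parse error,
# 5 file operation error, 6 content size error, 99 unexpected error.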