Phil 25086fc01b Add comprehensive RSS scraper implementation with security and testing
- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency (see the sketch after this list)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community
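
As a rough illustration of the conditional-request pattern behind the ETag/Last-Modified caching item above: this is a hedged sketch only, not the repository's actual src/rss_scraper caching code (whose page loading goes through Playwright, per the comment in the script below). The requests dependency, the ETAG_FILE path, and the fetch_with_conditional_get name are all made up for the example.

# Illustrative only; names and file locations here are hypothetical.
import json
import os

import requests


ETAG_FILE = "etag_cache.json"  # hypothetical location for stored validators


def fetch_with_conditional_get(url: str) -> requests.Response:
    """GET a URL, sending If-None-Match / If-Modified-Since from the last run."""
    cache = {}
    if os.path.exists(ETAG_FILE):
        with open(ETAG_FILE) as f:
            cache = json.load(f)

    headers = {}
    entry = cache.get(url, {})
    if entry.get("etag"):
        headers["If-None-Match"] = entry["etag"]
    if entry.get("last_modified"):
        headers["If-Modified-Since"] = entry["last_modified"]

    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 304:
        # Fresh content: remember the new validators for the next run
        cache[url] = {
            "etag": response.headers.get("ETag"),
            "last_modified": response.headers.get("Last-Modified"),
        }
        with open(ETAG_FILE, "w") as f:
            json.dump(cache, f)
    # On 304 the caller reuses the body it cached previously
    return response

A 304 response lets the scraper skip re-downloading and re-parsing an unchanged page, which is the efficiency the bullet refers to.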

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-06 09:15:06 -06:00

221 lines
6.8 KiB
Python

import os
import sys
import logging
import argparse
from typing import Optional

from src.rss_scraper.config import Config
from src.rss_scraper.exceptions import (
    ValidationError, NetworkError, PageLoadError,
    ContentSizeError, ParseError, FileOperationError
)
from src.rss_scraper.validation import validate_url
from src.rss_scraper.scraper import load_page_with_retry, clear_cache, get_cache_info
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('scraper.log')
    ]
)

logger = logging.getLogger(__name__)


def parse_arguments() -> argparse.Namespace:
    """Parse and validate command line arguments."""
    parser = argparse.ArgumentParser(
        description='RSS scraper for Warhammer Community website',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--url',
        type=str,
        default=Config.DEFAULT_URL,
        help='URL to scrape for articles'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default=None,
        help='Output directory for RSS feed and HTML files'
    )
    parser.add_argument(
        '--max-scroll',
        type=int,
        default=Config.MAX_SCROLL_ITERATIONS,
        help='Maximum number of scroll iterations'
    )
    parser.add_argument(
        '--log-level',
        type=str,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        default='INFO',
        help='Logging level'
    )
    parser.add_argument(
        '--log-file',
        type=str,
        default='scraper.log',
        help='Log file path'
    )
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable content caching'
    )
    parser.add_argument(
        '--clear-cache',
        action='store_true',
        help='Clear cache before running'
    )
    parser.add_argument(
        '--cache-info',
        action='store_true',
        help='Show cache information and exit'
    )

    args = parser.parse_args()

    # Validate arguments
    if args.max_scroll < 0:
        parser.error("--max-scroll must be non-negative")

    if args.output_dir and not os.path.isabs(args.output_dir):
        # Convert relative path to absolute
        args.output_dir = os.path.abspath(args.output_dir)

    return args


def setup_logging(log_level: str, log_file: str) -> None:
    """Setup logging configuration."""
    # Clear any existing handlers
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    # Set up new configuration
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(log_file)
        ]
    )


# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url: str, output_dir: Optional[str] = None, use_cache: bool = True) -> None:
    """Main function to scrape articles and generate RSS feed."""
    logger.info(f"Starting scrape of {url}")

    # Validate URL first
    validate_url(url)

    # Set default output directory if not provided
    if output_dir is None:
        output_dir = Config.DEFAULT_OUTPUT_DIR

    logger.info(f"Using output directory: {output_dir}")
    logger.info(f"Caching {'enabled' if use_cache else 'disabled'}")

    # Load page content with retry logic
    html = load_page_with_retry(url, use_cache=use_cache)

    # Extract articles from HTML
    articles = extract_articles_from_html(html, url)

    # Generate RSS feed
    rss_content = generate_rss_feed(articles, url)

    # Save RSS feed and debug HTML
    save_rss_feed(rss_content, output_dir)
    save_debug_html(html, output_dir)

    logger.info(f'RSS feed generated successfully with {len(articles)} articles')


if __name__ == "__main__":
    try:
        # Parse command line arguments
        args = parse_arguments()

        # Setup logging with parsed arguments
        setup_logging(args.log_level, args.log_file)

        # Re-get logger after setup
        logger = logging.getLogger(__name__)

        # Handle cache operations first
        if args.cache_info:
            cache_info = get_cache_info()
            print(f"Cache file: {cache_info['cache_file']}")
            print(f"ETag file: {cache_info['etag_file']}")
            print(f"Cache entries: {cache_info['cache_entries']}")
            print(f"ETag entries: {cache_info['etag_entries']}")
            print(f"Cache size: {cache_info['cache_size_bytes']} bytes")
            sys.exit(0)

        if args.clear_cache:
            logger.info("Clearing cache...")
            clear_cache()
            logger.info("Cache cleared successfully")

        # Validate configuration
        Config.validate_config()
        logger.info("Configuration validation passed")

        # Determine output directory
        output_dir = args.output_dir or os.getenv('OUTPUT_DIR') or Config.DEFAULT_OUTPUT_DIR

        logger.info(f"Starting RSS scraper with URL: {args.url}")
        logger.info(f"Output directory: {output_dir}")
        logger.info(f"Max scroll iterations: {args.max_scroll}")

        # Temporarily override config if max_scroll was provided
        if args.max_scroll != Config.MAX_SCROLL_ITERATIONS:
            Config.MAX_SCROLL_ITERATIONS = args.max_scroll
            logger.info(f"Overriding max scroll iterations to: {args.max_scroll}")

        # Run the function
        use_cache = not args.no_cache
        scrape_and_generate_rss(args.url, output_dir, use_cache)

        logger.info("RSS scraping completed successfully")

    except argparse.ArgumentError as e:
        print(f"Argument error: {e}", file=sys.stderr)
        sys.exit(1)
    except (ValueError, ValidationError) as e:
        print(f"Configuration/Validation error: {e}", file=sys.stderr)
        sys.exit(1)
    except PageLoadError as e:
        logger.error(f"Page loading error: {e}")
        sys.exit(3)
    except NetworkError as e:
        logger.error(f"Network error: {e}")
        sys.exit(2)
    except ParseError as e:
        logger.error(f"Content parsing error: {e}")
        sys.exit(4)
    except FileOperationError as e:
        logger.error(f"File operation error: {e}")
        sys.exit(5)
    except ContentSizeError as e:
        logger.error(f"Content size error: {e}")
        sys.exit(6)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        sys.exit(99)
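
For reference, the entry point above can also be driven programmatically instead of through the CLI. A minimal sketch, assuming the script is saved as main.py at the repository root (the file name is not shown on this page) and that the src.rss_scraper package is importable from there; the ./feeds path is a hypothetical example:

# Hypothetical driver script; "main" refers to the module shown above,
# assuming it is saved as main.py next to the src/ package.
from main import scrape_and_generate_rss
from src.rss_scraper.config import Config

# Roughly equivalent to: python main.py --output-dir ./feeds (without --no-cache)
scrape_and_generate_rss(
    Config.DEFAULT_URL,    # the same default the --url flag falls back to
    output_dir="./feeds",  # hypothetical path; any writable directory works
    use_cache=True,
)

When run from the command line instead, the except blocks above map failures to distinct exit codes: 1 for argument, configuration, or validation problems, 2 for network errors, 3 for page-load errors, 4 for parse errors, 5 for file-operation errors, 6 for content-size errors, and 99 for anything unexpected.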