From b9b3ece3cbb454e97ac4628745daed62ff2f777b Mon Sep 17 00:00:00 2001 From: Phil Date: Thu, 5 Jun 2025 18:19:23 -0600 Subject: [PATCH] Add comprehensive security improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - URL validation with domain whitelist - Path validation to prevent directory traversal - Resource limits (content size, scroll iterations) - Content filtering and sanitization - Non-root Docker execution with gosu - Configurable output directory via CLI/env vars - Fixed Docker volume permission issues 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- Dockerfile | 35 +- README.md | 117 + main.py | 139 +- output/page.html | 6361 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 6636 insertions(+), 16 deletions(-) create mode 100644 README.md create mode 100644 output/page.html diff --git a/Dockerfile b/Dockerfile index b884553..e618225 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,14 +44,41 @@ RUN pip install --upgrade pip && \ feedgen \ pytz -# Install Playwright browser binaries -RUN playwright install +# Install only Chromium (faster than all browsers) +RUN playwright install chromium + +# Create an entrypoint script to handle permissions (as root) +RUN echo '#!/bin/bash\n\ +# Fix permissions for mounted volumes\n\ +if [ -d "/app/output" ]; then\n\ + chmod 777 /app/output 2>/dev/null || true\n\ +fi\n\ +# Run as scraper user\n\ +exec gosu scraper "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh + +# Install gosu for user switching +RUN apt-get update && apt-get install -y gosu && rm -rf /var/lib/apt/lists/* + +# Create non-root user for security +RUN useradd -m -u 1001 scraper && \ + mkdir -p /app/output && \ + chown -R scraper:scraper /app && \ + chmod 755 /app/output # Copy the Python script to the container COPY main.py . +RUN chown scraper:scraper main.py # Set the environment variable to ensure Playwright works in the container -ENV PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright +ENV PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright -# Command to run the Python script +# Don't switch user here - entrypoint will handle it +# USER scraper + +# Install Chromium for the scraper user (only what we need) +USER scraper +RUN playwright install chromium +USER root + +ENTRYPOINT ["/entrypoint.sh"] CMD ["python", "main.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..ab9af9d --- /dev/null +++ b/README.md @@ -0,0 +1,117 @@ +# Warhammer Community RSS Scraper + +A Python application that scrapes the Warhammer Community website and generates an RSS feed from the latest articles. + +## Overview + +This project uses web scraping to extract articles from the Warhammer Community website and converts them into an RSS feed format. It uses Playwright for JavaScript-heavy content rendering and BeautifulSoup for HTML parsing. + +## Features + +- Scrapes articles from Warhammer Community website +- Generates RSS feed with proper formatting +- Handles duplicate article detection +- Sorts articles by publication date (newest first) +- Dockerized for easy deployment +- Saves both RSS feed and raw HTML for debugging +- **Security-focused**: URL validation, content filtering, and resource limits +- **Safe execution**: Runs as non-root user in container + +## Requirements + +- Python 3.12+ +- Dependencies listed in `requirements.txt`: + - playwright + - beautifulsoup4 + - feedgen + - pytz + - requests + +## Installation + +### Local Setup + +1. Install dependencies: +```bash +pip install -r requirements.txt +``` + +2. Install Playwright browsers: +```bash +playwright install +``` + +3. Run the script: +```bash +# Default: saves to current directory +python main.py + +# Or specify output directory +python main.py /path/to/output + +# Or use environment variable +OUTPUT_DIR=/path/to/output python main.py +``` + +### Docker Setup + +1. Build the Docker image: +```bash +docker build -t warhammer-rss . +``` + +2. Run the container (multiple options to avoid permission issues): + +**Option A: Save to current directory (simplest)** +```bash +docker run -v $(pwd):/app/output warhammer-rss +``` + +**Option B: Use environment variable for output directory** +```bash +docker run -e OUTPUT_DIR=/app/output -v $(pwd)/output:/app/output warhammer-rss +``` + +**Option C: With resource limits for additional security** +```bash +docker run --memory=512m --cpu-quota=50000 -v $(pwd):/app/output warhammer-rss +``` + +## Output + +The application generates: +- `warhammer_rss_feed.xml` - RSS feed file +- `page.html` - Raw HTML content for debugging + +Both files are saved to the specified output directory (current directory by default). + +## Security Features + +This application implements several security measures: + +- **URL Validation**: Only allows scraping from trusted Warhammer Community domains +- **Path Validation**: Prevents directory traversal attacks by validating output paths +- **Resource Limits**: Caps content size (10MB) and scroll iterations (5) to prevent DoS +- **Content Filtering**: Sanitizes extracted text to prevent XSS and injection attacks +- **Non-root Execution**: Docker container runs as user `scraper` (UID 1001) for reduced privilege +- **Input Sanitization**: All URLs and file paths are validated before use + +## How It Works + +1. **Validates** the target URL against whitelist of allowed domains +2. Uses Playwright to load the Warhammer Community homepage with full JavaScript rendering +3. Scrolls through the page to load additional content (limited to 5 iterations) +4. **Validates content size** and parses the rendered HTML with BeautifulSoup +5. **Sanitizes** and extracts article titles, links, and publication dates +6. **Validates all links** against allowed domains +7. Removes duplicates and sorts by date +8. Generates RSS feed using feedgen library +9. **Validates output paths** before saving files + +## Configuration + +The scraper targets `https://www.warhammer-community.com/en-gb/` by default and only allows URLs from: +- `warhammer-community.com` +- `www.warhammer-community.com` + +To modify allowed domains, update the `ALLOWED_DOMAINS` list in `main.py:11-14`. \ No newline at end of file diff --git a/main.py b/main.py index c77696e..bd29dbe 100644 --- a/main.py +++ b/main.py @@ -4,9 +4,100 @@ from feedgen.feed import FeedGenerator from datetime import datetime import pytz import time +import urllib.parse +import os +import sys + +# Allowed domains for scraping - security whitelist +ALLOWED_DOMAINS = [ + 'warhammer-community.com', + 'www.warhammer-community.com' +] + +# Resource limits +MAX_SCROLL_ITERATIONS = 5 +MAX_CONTENT_SIZE = 10 * 1024 * 1024 # 10MB + +def validate_url(url): + """Validate URL against whitelist of allowed domains""" + try: + parsed = urllib.parse.urlparse(url) + if not parsed.scheme or not parsed.netloc: + raise ValueError("Invalid URL format") + + # Check if domain is in allowed list + domain = parsed.netloc.lower() + if domain not in ALLOWED_DOMAINS: + raise ValueError(f"Domain {domain} not in allowed list: {ALLOWED_DOMAINS}") + + return True + except Exception as e: + raise ValueError(f"URL validation failed: {e}") + +def validate_output_path(path, base_dir): + """Validate and sanitize output file path""" + # Resolve to absolute path and check if it's safe + abs_path = os.path.abspath(path) + abs_base = os.path.abspath(base_dir) + + # Ensure path is within allowed directory + if not abs_path.startswith(abs_base): + raise ValueError(f"Output path {abs_path} is outside allowed directory {abs_base}") + + # Ensure output directory exists + os.makedirs(abs_base, exist_ok=True) + + return abs_path + +def sanitize_text(text): + """Sanitize text content to prevent injection attacks""" + if not text: + return "No title" + + # Remove potential harmful characters and limit length + sanitized = text.strip()[:500] # Limit title length + + # Remove any script tags or potentially harmful content + dangerous_patterns = [' MAX_CONTENT_SIZE: + browser.close() + raise ValueError(f"Content size {len(html)} exceeds maximum {MAX_CONTENT_SIZE}") + browser.close() # Parse the HTML content with BeautifulSoup @@ -38,13 +135,15 @@ def scrape_and_generate_rss(url): # Find all articles in the page for article in soup.find_all('article'): - # Extract the title + # Extract and sanitize the title title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg') - title = title_tag.text.strip() if title_tag else 'No title' + raw_title = title_tag.text.strip() if title_tag else 'No title' + title = sanitize_text(raw_title) - # Extract the link + # Extract and validate the link link_tag = article.find('a', href=True) - link = link_tag['href'] if link_tag else None + raw_link = link_tag['href'] if link_tag else None + link = validate_link(raw_link, url) # Skip this entry if the link is None or the URL has already been seen if not link or link in seen_urls: @@ -97,13 +196,29 @@ def scrape_and_generate_rss(url): # Generate the RSS feed rss_feed = fg.rss_str(pretty=True) - # Save the RSS feed to a file - with open('/app/output/warhammer_rss_feed.xml', 'wb') as f: + # Validate and save the RSS feed to a file + rss_path = validate_output_path(os.path.join(output_dir, 'warhammer_rss_feed.xml'), output_dir) + with open(rss_path, 'wb') as f: f.write(rss_feed) - with open('/app/output/page.html','w', encoding='utf-8') as f: + # Validate and save HTML for debugging + html_path = validate_output_path(os.path.join(output_dir, 'page.html'), output_dir) + with open(html_path, 'w', encoding='utf-8') as f: f.write(soup.prettify()) print('RSS feed generated and saved as warhammer_rss_feed.xml') -# Run the function -scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/') +if __name__ == "__main__": + # Get output directory from environment variable or command line argument + output_dir = os.getenv('OUTPUT_DIR') + + if len(sys.argv) > 1: + output_dir = sys.argv[1] + + # Default to current directory if no output specified (avoids permission issues) + if not output_dir: + output_dir = '.' + + print(f"Using output directory: {output_dir}") + + # Run the function + scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/', output_dir) diff --git a/output/page.html b/output/page.html new file mode 100644 index 0000000..202a42d --- /dev/null +++ b/output/page.html @@ -0,0 +1,6361 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Home - Warhammer Community + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + +
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+
+
+

+ FEATURED NEWS +

+
+
+ +
+
+ + + +
+
+
+
+
+
+
+
+
+
+
+ +

+ New rules and variant warscrolls for Warhammer Age of Sigmar are unleashed to coincide with the new General’s Handbook +

+
+ + + Find out more + + + + + + +
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+

+ New Edition of Warhammer: The Horus Heresy +

+ +
+ +
+
+
+
+
+

+ LATEST NEWS +

+ +
+ +
+
+
+
+
+
+
+
+
+ +

+ All the latest previews, features, reveals, and rules FAQs straight to your inbox. +

+
+ + + Find Out More + + + + + + +
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+

+ WARHAMMER VIDEOS +

+ +
+
+ +
+ +
+
+
+
+
+
+

+ EXPLORE LATEST NEWS FROM YOUR FAVOURITE SETTING +

+
+
+
+ + +
+
+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+ +
+
+
+ + +
+
+
+ + +
+
+
+
+
+
+
+
+
+
+

+ Army showcases +

+ +
+
+
+
+
+ +
+
+
+

+ RELATED TOPICS +

+
+
+ +
+
+ +
+ +
+
+ + +
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+ +

+ Explore the Warhammer universes through animations, apps, shows, and more – all exclusive to subscribers. +

+
+ + + Find out more + + + + + + +
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+

+ Latest from Warhammer+ +

+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ +

+ Find all the latest releases on the Warhammer.com store. +

+
+ + + Visit Warhammer + + + + + + +
+
+
+
+
+
+ +
+
+
+
+
+ +

+ Novels, riveting multi-book epics, and audio dramas are a click away. +

+
+ + + EXPLORE + + + + + + +
+
+
+
+
+
+ +
+
+
+
+
+ +

+ Find your nearest Warhammer stockist today; including official Warhammer stores. +

+
+ + + Store finder + + + + + + +
+
+
+
+
+
+
+
+
+
+

+ WARHAMMER SETTINGS +

+
+
+
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + +
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +