Add comprehensive security improvements

- URL validation with domain whitelist
- Path validation to prevent directory traversal (sketched below)
- Resource limits (content size, scroll iterations)
- Content filtering and sanitization
- Non-root Docker execution with gosu
- Configurable output directory via CLI/env vars
- Fixed Docker volume permission issues

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
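
A minimal standalone sketch of the path-containment idea behind the path-validation bullet, using only the Python standard library. The helper name and the `os.path.commonpath` comparison are illustrative choices for this sketch, not code from the commit itself (main.py below uses an `abspath`/`startswith` check):

```python
import os

def is_within_directory(path, base_dir):
    """Illustrative helper: True only if `path` resolves inside `base_dir`.

    os.path.commonpath avoids the prefix pitfall where a sibling such as
    'output-other' would pass a naive startswith('output') comparison.
    """
    abs_path = os.path.abspath(path)
    abs_base = os.path.abspath(base_dir)
    return os.path.commonpath([abs_path, abs_base]) == abs_base

# A traversal attempt resolves outside the base directory and is rejected.
print(is_within_directory("output/feed.xml", "output"))       # True
print(is_within_directory("output/../etc/passwd", "output"))  # False
```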
2025-06-05 18:19:23 -06:00
parent eecee074e2
commit b9b3ece3cb
4 changed files with 6636 additions and 16 deletions

main.py

@@ -4,9 +4,100 @@ from feedgen.feed import FeedGenerator
 from datetime import datetime
 import pytz
 import time
+import urllib.parse
+import os
+import sys
+
+# Allowed domains for scraping - security whitelist
+ALLOWED_DOMAINS = [
+    'warhammer-community.com',
+    'www.warhammer-community.com'
+]
+
+# Resource limits
+MAX_SCROLL_ITERATIONS = 5
+MAX_CONTENT_SIZE = 10 * 1024 * 1024  # 10MB
+
+def validate_url(url):
+    """Validate URL against whitelist of allowed domains"""
+    try:
+        parsed = urllib.parse.urlparse(url)
+        if not parsed.scheme or not parsed.netloc:
+            raise ValueError("Invalid URL format")
+
+        # Check if domain is in allowed list
+        domain = parsed.netloc.lower()
+        if domain not in ALLOWED_DOMAINS:
+            raise ValueError(f"Domain {domain} not in allowed list: {ALLOWED_DOMAINS}")
+
+        return True
+    except Exception as e:
+        raise ValueError(f"URL validation failed: {e}")
+
+def validate_output_path(path, base_dir):
+    """Validate and sanitize output file path"""
+    # Resolve to absolute path and check if it's safe
+    abs_path = os.path.abspath(path)
+    abs_base = os.path.abspath(base_dir)
+
+    # Ensure path is within allowed directory
+    if not abs_path.startswith(abs_base):
+        raise ValueError(f"Output path {abs_path} is outside allowed directory {abs_base}")
+
+    # Ensure output directory exists
+    os.makedirs(abs_base, exist_ok=True)
+
+    return abs_path
+
+def sanitize_text(text):
+    """Sanitize text content to prevent injection attacks"""
+    if not text:
+        return "No title"
+
+    # Remove potential harmful characters and limit length
+    sanitized = text.strip()[:500]  # Limit title length
+
+    # Remove any script tags or potentially harmful content
+    dangerous_patterns = ['<script', '</script', 'javascript:', 'data:', 'vbscript:']
+    for pattern in dangerous_patterns:
+        sanitized = sanitized.replace(pattern.lower(), '').replace(pattern.upper(), '')
+
+    return sanitized if sanitized else "No title"
+
+def validate_link(link, base_url):
+    """Validate and sanitize article links"""
+    if not link:
+        return None
+
+    try:
+        # Handle relative URLs
+        if link.startswith('/'):
+            parsed_base = urllib.parse.urlparse(base_url)
+            link = f"{parsed_base.scheme}://{parsed_base.netloc}{link}"
+
+        # Validate the resulting URL
+        parsed = urllib.parse.urlparse(link)
+        if not parsed.scheme or not parsed.netloc:
+            return None
+
+        # Ensure it's from allowed domain
+        domain = parsed.netloc.lower()
+        if domain not in ALLOWED_DOMAINS:
+            return None
+
+        return link
+    except Exception:
+        return None
+
 # Function to scrape articles using Playwright and generate an RSS feed
-def scrape_and_generate_rss(url):
+def scrape_and_generate_rss(url, output_dir=None):
+    # Validate URL first
+    validate_url(url)
+
+    # Set default output directory if not provided
+    if output_dir is None:
+        output_dir = '.'  # Default to current directory
+
     articles = []
     seen_urls = set()  # Set to track seen URLs and avoid duplicates
@@ -21,13 +112,19 @@ def scrape_and_generate_rss(url):
         # Load the Warhammer Community page
         page.goto(url, wait_until="networkidle")

-        # Simulate scrolling to load more content if needed
-        for _ in range(10):
+        # Simulate scrolling to load more content if needed (limited for security)
+        for _ in range(MAX_SCROLL_ITERATIONS):
             page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
             time.sleep(2)

         # Get the fully rendered HTML content
         html = page.content()
+
+        # Check content size for security
+        if len(html) > MAX_CONTENT_SIZE:
+            browser.close()
+            raise ValueError(f"Content size {len(html)} exceeds maximum {MAX_CONTENT_SIZE}")
+
         browser.close()

     # Parse the HTML content with BeautifulSoup
@@ -38,13 +135,15 @@ def scrape_and_generate_rss(url):
     # Find all articles in the page
     for article in soup.find_all('article'):
-        # Extract the title
+        # Extract and sanitize the title
         title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')
-        title = title_tag.text.strip() if title_tag else 'No title'
+        raw_title = title_tag.text.strip() if title_tag else 'No title'
+        title = sanitize_text(raw_title)

-        # Extract the link
+        # Extract and validate the link
         link_tag = article.find('a', href=True)
-        link = link_tag['href'] if link_tag else None
+        raw_link = link_tag['href'] if link_tag else None
+        link = validate_link(raw_link, url)

         # Skip this entry if the link is None or the URL has already been seen
         if not link or link in seen_urls:
@@ -97,13 +196,29 @@ def scrape_and_generate_rss(url):
     # Generate the RSS feed
     rss_feed = fg.rss_str(pretty=True)

-    # Save the RSS feed to a file
-    with open('/app/output/warhammer_rss_feed.xml', 'wb') as f:
+    # Validate and save the RSS feed to a file
+    rss_path = validate_output_path(os.path.join(output_dir, 'warhammer_rss_feed.xml'), output_dir)
+    with open(rss_path, 'wb') as f:
         f.write(rss_feed)

-    with open('/app/output/page.html','w', encoding='utf-8') as f:
+    # Validate and save HTML for debugging
+    html_path = validate_output_path(os.path.join(output_dir, 'page.html'), output_dir)
+    with open(html_path, 'w', encoding='utf-8') as f:
         f.write(soup.prettify())

     print('RSS feed generated and saved as warhammer_rss_feed.xml')

-# Run the function
-scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
+if __name__ == "__main__":
+    # Get output directory from environment variable or command line argument
+    output_dir = os.getenv('OUTPUT_DIR')
+    if len(sys.argv) > 1:
+        output_dir = sys.argv[1]
+
+    # Default to current directory if no output specified (avoids permission issues)
+    if not output_dir:
+        output_dir = '.'
+
+    print(f"Using output directory: {output_dir}")
+
+    # Run the function
+    scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/', output_dir)
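
A small usage sketch for the validators and the new entry point, assuming the module can be imported as `main` with its dependencies (playwright, beautifulsoup4, feedgen, pytz) installed; the example URLs are illustrative:

```python
# Quick sanity checks for the helpers added in this commit,
# assuming this file sits next to main.py.
import main

# Whitelisted domains pass; anything else raises ValueError.
main.validate_url("https://www.warhammer-community.com/en-gb/")
try:
    main.validate_url("https://example.com/")
except ValueError as e:
    print(f"rejected: {e}")

# Relative links are resolved against the base URL and re-checked against
# the domain whitelist; off-site links come back as None.
print(main.validate_link("/en-gb/articles/some-post/",
                         "https://www.warhammer-community.com/en-gb/"))
print(main.validate_link("https://example.org/elsewhere",
                         "https://www.warhammer-community.com/en-gb/"))  # None

# Titles are trimmed, capped at 500 characters, and have
# '<script' / '</script' style fragments stripped out.
print(main.sanitize_text("<script>alert(1)</script> New Kill Team reveals"))
```

At runtime the output location comes from the `OUTPUT_DIR` environment variable or the first CLI argument, falling back to the current directory, so the script no longer depends on a writable `/app/output` inside the container.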