Phil b9b3ece3cb Add comprehensive security improvements
- URL validation with domain whitelist
- Path validation to prevent directory traversal
- Resource limits (content size, scroll iterations)
- Content filtering and sanitization
- Non-root Docker execution with gosu
- Configurable output directory via CLI/env vars
- Fixed Docker volume permission issues

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-05 18:19:23 -06:00

85 lines
2.0 KiB
Docker

# Base image: official Python 3.12.7 "slim" variant on Debian bullseye
FROM python:3.12.7-slim-bullseye
# All later relative paths (COPY destination, CMD script) resolve against /app
WORKDIR /app
# Install system dependencies needed for Playwright and its browsers.
# --no-install-recommends keeps optional packages out of the image; if one of
# them turns out to be required, list it explicitly below.
# The apt lists are removed in the same layer so they never reach the image.
# NOTE(review): libappindicator3-1 may not be available on Debian bullseye
# (superseded by libayatana-appindicator3-1) - confirm this step resolves it.
RUN apt-get update && apt-get install -y --no-install-recommends \
        bash \
        build-essential \
        ca-certificates \
        curl \
        git \
        gnupg \
        libappindicator3-1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libdrm2 \
        libffi-dev \
        libgbm1 \
        libgtk-3-0 \
        libnss3 \
        libpango-1.0-0 \
        libpangocairo-1.0-0 \
        libx11-6 \
        libx11-xcb1 \
        libxcomposite1 \
        libxcursor1 \
        libxdamage1 \
        libxi6 \
        libxrandr2 \
        libxshmfence1 \
        libxtst6 \
        wget \
        xdg-utils \
    && rm -rf /var/lib/apt/lists/*
# Install Playwright and required Python dependencies.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): package versions are unpinned, so a rebuild may pull newer
# releases - consider pinning them for reproducible builds.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
        playwright \
        beautifulsoup4 \
        feedgen \
        pytz
# Chromium is intentionally NOT installed here as root: at this point
# PLAYWRIGHT_BROWSERS_PATH is not yet set, so the download would land in root's
# own cache directory, which the scraper user never reads. The browser is
# installed further down as the scraper user, into PLAYWRIGHT_BROWSERS_PATH.
# Create the entrypoint script (runs as root at container start).
# printf '%s\n' writes one script line per argument, so the result does not
# depend on how the build shell's echo treats "\n". Each script line is quoted:
# a continuation line that starts with '#' would be dropped by the Dockerfile
# parser as a comment and never reach the script.
# The script makes a present /app/output writable (best effort, errors are
# ignored), then replaces itself with the requested command run as the
# scraper user via gosu.
# NOTE(review): chmod 777 makes the output directory world-writable; kept
# as-is because mounted-volume setups may rely on it - confirm whether a
# narrower permission would do.
RUN printf '%s\n' \
        '#!/bin/bash' \
        '# Fix permissions for mounted volumes' \
        'if [ -d "/app/output" ]; then' \
        '    chmod 777 /app/output 2>/dev/null || true' \
        'fi' \
        '# Run as scraper user' \
        'exec gosu scraper "$@"' \
        > /entrypoint.sh \
    && chmod +x /entrypoint.sh
# Install gosu, which /entrypoint.sh uses at runtime to drop from root to the
# scraper user. --no-install-recommends avoids pulling optional packages, and
# the apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends gosu \
    && rm -rf /var/lib/apt/lists/*
# Add the unprivileged "scraper" account (fixed UID 1001, with a home
# directory), create the output directory, hand /app over to that account,
# and set the output directory to mode 755.
RUN useradd --create-home --uid 1001 scraper \
    && mkdir --parents /app/output \
    && chown --recursive scraper:scraper /app \
    && chmod 0755 /app/output
# Copy the Python script into the working directory, owned by the scraper user.
# --chown sets ownership during the copy, avoiding a separate RUN chown layer
# that would store the file a second time.
COPY --chown=scraper:scraper main.py .
# Point Playwright at a browser cache inside the scraper user's home directory;
# this applies both to the install below and to the running container.
ENV PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
# Install Chromium (the only browser needed) as the scraper user so the
# downloaded files belong to the account that runs the scraper.
USER scraper
RUN playwright install chromium
# Switch back to root: the entrypoint starts as root to adjust /app/output
# permissions and then drops to the scraper user via gosu.
USER root
# Start through the wrapper script, which execs the command as the scraper user
ENTRYPOINT ["/entrypoint.sh"]
# Default command; can be overridden at `docker run`
CMD ["python", "main.py"]