- URL validation with domain whitelist
- Path validation to prevent directory traversal
- Resource limits (content size, scroll iterations)
- Content filtering and sanitization
- Non-root Docker execution with gosu
- Configurable output directory via CLI/env vars
- Fixed Docker volume permission issues

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
85 lines
2.0 KiB
Docker
85 lines
2.0 KiB
Docker
# Base image: official slim Python 3.12.7 build on Debian 11 (bullseye).
FROM python:3.12.7-slim-bullseye AS runtime

# All later relative paths (COPY destinations, the default command) resolve
# against /app.
WORKDIR /app
|
|
|
|
# Install the OS packages used by the image: general build/download tooling
# plus the shared libraries that Playwright's Chromium build links against.
#
# - --no-install-recommends keeps optional "recommended" packages out of the
#   image.
# - libayatana-appindicator3-1 replaces libappindicator3-1, which is not
#   believed to be available in Debian 11 (bullseye) and would make
#   `apt-get install` fail.
#   NOTE(review): confirm with a clean build; headless Chromium may not need
#   an appindicator library at all.
# - The apt package index is removed in the same layer so it never lands in
#   the image.
# - Packages are listed alphabetically to keep diffs small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        bash \
        build-essential \
        ca-certificates \
        curl \
        git \
        gnupg \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libayatana-appindicator3-1 \
        libdrm2 \
        libffi-dev \
        libgbm1 \
        libgtk-3-0 \
        libnss3 \
        libpango-1.0-0 \
        libpangocairo-1.0-0 \
        libx11-6 \
        libx11-xcb1 \
        libxcomposite1 \
        libxcursor1 \
        libxdamage1 \
        libxi6 \
        libxrandr2 \
        libxshmfence1 \
        libxtst6 \
        wget \
        xdg-utils \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Install Playwright and the other Python packages the scraper imports.
# --no-cache-dir stops pip from storing its download cache in the image layer.
# NOTE(review): the packages are unpinned, so rebuilds may pick up newer
# releases; consider pinning versions for reproducible builds.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
        beautifulsoup4 \
        feedgen \
        playwright \
        pytz

# Chromium is intentionally NOT downloaded at this point. A root-level
# `playwright install chromium` here would store the browser under root's
# cache directory, which the runtime never reads: PLAYWRIGHT_BROWSERS_PATH
# (set further down) points at the scraper user's cache, and the browser is
# installed there in the later `USER scraper` step. Installing it twice only
# adds an unused browser download to the image.
|
|
|
|
# Generate /entrypoint.sh (while still root). At container start the script:
#   1. makes a mounted /app/output directory writable for the scraper user,
#      preferring a targeted chown and only falling back to a world-writable
#      mode when ownership cannot be changed; failures are ignored so a
#      read-only or foreign-owned mount does not stop the container;
#   2. replaces itself, via `exec gosu`, with the requested command running
#      as the scraper user.
# printf '%s\n' writes one script line per argument, so the result does not
# depend on whether the build shell's `echo` expands "\n" escapes.
RUN printf '%s\n' \
        '#!/bin/bash' \
        '# Give the scraper user write access to a mounted output volume.' \
        'if [ -d "/app/output" ]; then' \
        '    chown scraper:scraper /app/output 2>/dev/null || chmod 777 /app/output 2>/dev/null || true' \
        'fi' \
        '# Drop root privileges and hand over to the requested command.' \
        'exec gosu scraper "$@"' \
        > /entrypoint.sh \
    && chmod +x /entrypoint.sh
|
|
|
|
# Install gosu, which /entrypoint.sh uses to step down from root to the
# scraper user. The package index is refreshed and removed within this one
# layer, and recommended extras are skipped to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends gosu \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Unprivileged account that the application runs as.
#   - user "scraper" with fixed UID 1001 and a home directory
#   - /app/output created ahead of time as the output location
#   - the whole /app tree handed over to that user
#   - /app/output left writable by its owner only (mode 755)
RUN useradd --create-home --uid 1001 scraper \
    && mkdir --parents /app/output \
    && chown --recursive scraper:scraper /app \
    && chmod 755 /app/output
|
|
|
|
# Copy the scraper script into the working directory, owned by the scraper
# user. Setting ownership during the copy avoids a separate `RUN chown`
# layer that would store a second copy of the file.
COPY --chown=scraper:scraper main.py .
|
|
|
|
# Point Playwright at a fixed browser directory inside the scraper user's
# home, so browser installation and browser lookup use the same path
# regardless of which user the process starts as.
ENV PLAYWRIGHT_BROWSERS_PATH="/home/scraper/.cache/ms-playwright"
|
|
|
|
# Download Chromium (the only browser installed) while running as the scraper
# user, so the files under PLAYWRIGHT_BROWSERS_PATH belong to that user.
USER scraper
RUN python -m playwright install chromium

# Switch back to root for the final image on purpose: /entrypoint.sh has to
# start as root to adjust permissions on a mounted /app/output, and then
# drops to the scraper user itself via gosu before running the command.
USER root

# The entrypoint wrapper receives CMD (or any command passed to `docker run`)
# as its arguments and executes it as the scraper user.
ENTRYPOINT ["/entrypoint.sh"]
CMD ["python", "main.py"]
|