Improve Docker configuration and gitignore

- Enhanced Dockerfile with security improvements and cleaner dependency management
- Fixed requirements.txt to use correct package names
- Updated gitignore to properly exclude output directory and contents

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Phil 2025-06-05 18:31:42 -06:00
parent 70540bacf0
commit e0647325ff
3 changed files with 13 additions and 16 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*.xml
.python-version
output/
output/*

View File

@@ -4,7 +4,7 @@ FROM python:3.12.7-slim-bullseye
# Set the working directory
WORKDIR /app
# Install system dependencies needed for Playwright and its browsers
# Install system dependencies needed for Playwright and gosu
RUN apt-get update && apt-get install -y \
bash \
build-essential \
@@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y \
ca-certificates \
wget \
gnupg \
gosu \
libnss3 \
libatk-bridge2.0-0 \
libx11-xcb1 \
@@ -36,16 +37,11 @@ RUN apt-get update && apt-get install -y \
libdrm2 \
&& rm -rf /var/lib/apt/lists/*
# Install Playwright and required Python dependencies
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --upgrade pip && \
pip install \
playwright \
beautifulsoup4 \
feedgen \
pytz
pip install -r requirements.txt
# Install only Chromium (faster than all browsers)
RUN playwright install chromium
# Create an entrypoint script to handle permissions (as root)
RUN echo '#!/bin/bash\n\
@@ -56,8 +52,6 @@ fi\n\
# Run as scraper user\n\
exec gosu scraper "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh
# Install gosu for user switching
RUN apt-get update && apt-get install -y gosu && rm -rf /var/lib/apt/lists/*
# Create non-root user for security
RUN useradd -m -u 1001 scraper && \
@@ -69,13 +63,15 @@ RUN useradd -m -u 1001 scraper && \
COPY main.py .
RUN chown scraper:scraper main.py
# Set the environment variable to ensure Playwright works in the container
ENV PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
# Set environment variables
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
# Don't switch user here - entrypoint will handle it
# USER scraper
# Install Chromium for the scraper user (only what we need)
# Install Chromium for the scraper user
USER scraper
RUN playwright install chromium
USER root

View File

@@ -1,5 +1,5 @@
requests
bs4
beautifulsoup4
feedgen
playwright
pytz