From e0647325ff89eab8d827ef2013b57202b76dbd19 Mon Sep 17 00:00:00 2001 From: Phil Date: Thu, 5 Jun 2025 18:31:42 -0600 Subject: [PATCH] Improve Docker configuration and gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Enhanced Dockerfile with security improvements and cleaner dependency management - Fixed requirements.txt to use correct package names - Updated gitignore to properly exclude output directory and contents 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .gitignore | 3 ++- Dockerfile | 24 ++++++++++-------------- requirements.txt | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 2707d8b..015413a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.xml .python-version -output/ \ No newline at end of file +output/ +output/* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index e618225..d49ae2c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM python:3.12.7-slim-bullseye # Set the working directory WORKDIR /app -# Install system dependencies needed for Playwright and its browsers +# Install system dependencies needed for Playwright and gosu RUN apt-get update && apt-get install -y \ bash \ build-essential \ @@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y \ ca-certificates \ wget \ gnupg \ + gosu \ libnss3 \ libatk-bridge2.0-0 \ libx11-xcb1 \ @@ -36,16 +37,11 @@ RUN apt-get update && apt-get install -y \ libdrm2 \ && rm -rf /var/lib/apt/lists/* -# Install Playwright and required Python dependencies +# Copy requirements and install Python dependencies +COPY requirements.txt . RUN pip install --upgrade pip && \ - pip install \ - playwright \ - beautifulsoup4 \ - feedgen \ - pytz + pip install -r requirements.txt -# Install only Chromium (faster than all browsers) -RUN playwright install chromium # Create an entrypoint script to handle permissions (as root) RUN echo '#!/bin/bash\n\ @@ -56,8 +52,6 @@ fi\n\ # Run as scraper user\n\ exec gosu scraper "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh -# Install gosu for user switching -RUN apt-get update && apt-get install -y gosu && rm -rf /var/lib/apt/lists/* # Create non-root user for security RUN useradd -m -u 1001 scraper && \ @@ -69,13 +63,15 @@ RUN useradd -m -u 1001 scraper && \ COPY main.py . RUN chown scraper:scraper main.py -# Set the environment variable to ensure Playwright works in the container -ENV PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright +# Set environment variables +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright # Don't switch user here - entrypoint will handle it # USER scraper -# Install Chromium for the scraper user (only what we need) +# Install Chromium for the scraper user USER scraper RUN playwright install chromium USER root diff --git a/requirements.txt b/requirements.txt index 5febfa4..995be38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ requests -bs4 +beautifulsoup4 feedgen playwright pytz \ No newline at end of file