Improve Docker configuration and gitignore
- Enhanced Dockerfile with security improvements and cleaner dependency management
- Fixed requirements.txt to use correct package names
- Updated gitignore to properly exclude output directory and contents

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
70540bacf0
commit
e0647325ff
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
*.xml
|
||||
.python-version
|
||||
output/
|
||||
output/
|
||||
output/*
|
24
Dockerfile
24
Dockerfile
@@ -4,7 +4,7 @@ FROM python:3.12.7-slim-bullseye
|
||||
# Set the working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies needed for Playwright and its browsers
|
||||
# Install system dependencies needed for Playwright and gosu
|
||||
RUN apt-get update && apt-get install -y \
|
||||
bash \
|
||||
build-essential \
|
||||
@@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y \
|
||||
ca-certificates \
|
||||
wget \
|
||||
gnupg \
|
||||
gosu \
|
||||
libnss3 \
|
||||
libatk-bridge2.0-0 \
|
||||
libx11-xcb1 \
|
||||
@@ -36,16 +37,11 @@ RUN apt-get update && apt-get install -y \
|
||||
libdrm2 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Playwright and required Python dependencies
|
||||
# Copy requirements and install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install \
|
||||
playwright \
|
||||
beautifulsoup4 \
|
||||
feedgen \
|
||||
pytz
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Install only Chromium (faster than all browsers)
|
||||
RUN playwright install chromium
|
||||
|
||||
# Create an entrypoint script to handle permissions (as root)
|
||||
RUN echo '#!/bin/bash\n\
|
||||
@@ -56,8 +52,6 @@ fi\n\
|
||||
# Run as scraper user\n\
|
||||
exec gosu scraper "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh
|
||||
|
||||
# Install gosu for user switching
|
||||
RUN apt-get update && apt-get install -y gosu && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create non-root user for security
|
||||
RUN useradd -m -u 1001 scraper && \
|
||||
@@ -69,13 +63,15 @@ RUN useradd -m -u 1001 scraper && \
|
||||
COPY main.py .
|
||||
RUN chown scraper:scraper main.py
|
||||
|
||||
# Set the environment variable to ensure Playwright works in the container
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
|
||||
# Set environment variables
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
|
||||
|
||||
# Don't switch user here - entrypoint will handle it
|
||||
# USER scraper
|
||||
|
||||
# Install Chromium for the scraper user (only what we need)
|
||||
# Install Chromium for the scraper user
|
||||
USER scraper
|
||||
RUN playwright install chromium
|
||||
USER root
|
||||
|
@@ -1,5 +1,5 @@
|
||||
requests
|
||||
bs4
|
||||
beautifulsoup4
|
||||
feedgen
|
||||
playwright
|
||||
pytz
|
Loading…
x
Reference in New Issue
Block a user