Improve Docker configuration and gitignore
- Enhanced Dockerfile with security improvements and cleaner dependency management
- Fixed requirements.txt to use correct package names
- Updated gitignore to properly exclude output directory and contents

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -1,3 +1,4 @@
 | 
				
			|||||||
*.xml
 | 
					*.xml
 | 
				
			||||||
.python-version
 | 
					.python-version
 | 
				
			||||||
output/
 | 
					output/
 | 
				
			||||||
 | 
					output/*
 | 
				
			||||||
							
								
								
									
										24
									
								
								Dockerfile
									
									
									
									
									
								
							
							
						
						
									
										24
									
								
								Dockerfile
									
									
									
									
									
								
							@@ -4,7 +4,7 @@ FROM python:3.12.7-slim-bullseye
 | 
				
			|||||||
# Set the working directory
 | 
					# Set the working directory
 | 
				
			||||||
WORKDIR /app
 | 
					WORKDIR /app
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Install system dependencies needed for Playwright and its browsers
 | 
					# Install system dependencies needed for Playwright and gosu
 | 
				
			||||||
RUN apt-get update && apt-get install -y \
 | 
					RUN apt-get update && apt-get install -y \
 | 
				
			||||||
    bash \
 | 
					    bash \
 | 
				
			||||||
    build-essential \
 | 
					    build-essential \
 | 
				
			||||||
@@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y \
 | 
				
			|||||||
    ca-certificates \
 | 
					    ca-certificates \
 | 
				
			||||||
    wget \
 | 
					    wget \
 | 
				
			||||||
    gnupg \
 | 
					    gnupg \
 | 
				
			||||||
 | 
					    gosu \
 | 
				
			||||||
    libnss3 \
 | 
					    libnss3 \
 | 
				
			||||||
    libatk-bridge2.0-0 \
 | 
					    libatk-bridge2.0-0 \
 | 
				
			||||||
    libx11-xcb1 \
 | 
					    libx11-xcb1 \
 | 
				
			||||||
@@ -36,16 +37,11 @@ RUN apt-get update && apt-get install -y \
 | 
				
			|||||||
    libdrm2 \
 | 
					    libdrm2 \
 | 
				
			||||||
    && rm -rf /var/lib/apt/lists/*
 | 
					    && rm -rf /var/lib/apt/lists/*
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Install Playwright and required Python dependencies
 | 
					# Copy requirements and install Python dependencies
 | 
				
			||||||
 | 
					COPY requirements.txt .
 | 
				
			||||||
RUN pip install --upgrade pip && \
 | 
					RUN pip install --upgrade pip && \
 | 
				
			||||||
    pip install \
 | 
					    pip install -r requirements.txt
 | 
				
			||||||
    playwright \
 | 
					 | 
				
			||||||
    beautifulsoup4 \
 | 
					 | 
				
			||||||
    feedgen \
 | 
					 | 
				
			||||||
    pytz
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Install only Chromium (faster than all browsers)
 | 
					 | 
				
			||||||
RUN playwright install chromium
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Create an entrypoint script to handle permissions (as root)
 | 
					# Create an entrypoint script to handle permissions (as root)
 | 
				
			||||||
RUN echo '#!/bin/bash\n\
 | 
					RUN echo '#!/bin/bash\n\
 | 
				
			||||||
@@ -56,8 +52,6 @@ fi\n\
 | 
				
			|||||||
# Run as scraper user\n\
 | 
					# Run as scraper user\n\
 | 
				
			||||||
exec gosu scraper "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh
 | 
					exec gosu scraper "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Install gosu for user switching
 | 
					 | 
				
			||||||
RUN apt-get update && apt-get install -y gosu && rm -rf /var/lib/apt/lists/*
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Create non-root user for security
 | 
					# Create non-root user for security
 | 
				
			||||||
RUN useradd -m -u 1001 scraper && \
 | 
					RUN useradd -m -u 1001 scraper && \
 | 
				
			||||||
@@ -69,13 +63,15 @@ RUN useradd -m -u 1001 scraper && \
 | 
				
			|||||||
COPY main.py .
 | 
					COPY main.py .
 | 
				
			||||||
RUN chown scraper:scraper main.py
 | 
					RUN chown scraper:scraper main.py
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Set the environment variable to ensure Playwright works in the container
 | 
					# Set environment variables
 | 
				
			||||||
ENV PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
 | 
					ENV PYTHONUNBUFFERED=1 \
 | 
				
			||||||
 | 
					    PYTHONDONTWRITEBYTECODE=1 \
 | 
				
			||||||
 | 
					    PLAYWRIGHT_BROWSERS_PATH=/home/scraper/.cache/ms-playwright
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Don't switch user here - entrypoint will handle it
 | 
					# Don't switch user here - entrypoint will handle it
 | 
				
			||||||
# USER scraper
 | 
					# USER scraper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Install Chromium for the scraper user (only what we need)
 | 
					# Install Chromium for the scraper user
 | 
				
			||||||
USER scraper
 | 
					USER scraper
 | 
				
			||||||
RUN playwright install chromium
 | 
					RUN playwright install chromium
 | 
				
			||||||
USER root
 | 
					USER root
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,5 +1,5 @@
 | 
				
			|||||||
requests
 | 
					requests
 | 
				
			||||||
bs4
 | 
					beautifulsoup4
 | 
				
			||||||
feedgen
 | 
					feedgen
 | 
				
			||||||
playwright
 | 
					playwright
 | 
				
			||||||
pytz
 | 
					pytz
 | 
				
			||||||
		Reference in New Issue
	
	Block a user