rss_warhammer-community/main.py

import os
import time
from datetime import datetime
from urllib.parse import urljoin

import pytz
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from playwright.sync_api import sync_playwright

# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url):
    articles = []
    seen_urls = set()  # Set to track seen URLs and avoid duplicates

    # Use Playwright to load the page
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        
        # Set a longer timeout for loading the page
        page.set_default_navigation_timeout(120000)
        
        # Load the Warhammer Community page
        page.goto(url, wait_until="networkidle")
        
        # Simulate scrolling to load more content if needed
        for _ in range(10):
            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            time.sleep(2)
        
        # Get the fully rendered HTML content
        html = page.content()
        browser.close()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Define a timezone (UTC in this case)
    timezone = pytz.UTC

    # Find all articles in the page
    for article in soup.find_all('article'):
        # Extract the title
        title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')
        title = title_tag.text.strip() if title_tag else 'No title'

        # Extract the link
        link_tag = article.find('a', href=True)
        link = link_tag['href'] if link_tag else None

        # Skip this entry if the link is None or the URL has already been seen
        if not link or link in seen_urls:
            continue  # Skip duplicates or invalid entries

        seen_urls.add(link)  # Add the URL to the set of seen URLs

        # Extract the publication date and ignore reading time
        date = None
        for time_tag in article.find_all('time'):
            raw_date = time_tag.text.strip()

            # Ignore "min" time blocks (reading time)
            if "min" not in raw_date.lower():
                try:
                    # Parse the actual date (e.g., "02 Oct 24")
                    date = datetime.strptime(raw_date, '%d %b %y')
                    date = timezone.localize(date)  # Localize with UTC
                    break  # Stop after finding the correct date
                except ValueError:
                    continue

        # If no valid date is found, use the current date as a fallback
        if not date:
            date = datetime.now(timezone)

        # Add the article to the list with its publication date
        articles.append({
            'title': title,
            'link': link,
            'date': date
        })

    # Sort the articles by publication date (newest first)
    articles.sort(key=lambda x: x['date'], reverse=True)

    # Initialize the RSS feed generator
    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    # Add the sorted articles to the RSS feed
    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    # Generate the RSS feed
    rss_feed = fg.rss_str(pretty=True)

    # Save the RSS feed to a file
    with open('/app/output/warhammer_rss_feed.xml', 'wb') as f:
        f.write(rss_feed)

    with open('/app/output/page.html','w', encoding='utf-8') as f:
        f.write(soup.prettify())
    print('RSS feed generated and saved as warhammer_rss_feed.xml')

# Run the scraper only when executed as a script, so importing this module
# does not launch a browser or write files as a side effect.
if __name__ == "__main__":
    scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
initial commit 2024-10-03 20:34:27 +02:00			`from playwright.sync_api import sync_playwright`
			`from bs4 import BeautifulSoup`
			`from feedgen.feed import FeedGenerator`
			`from datetime import datetime`
			`import pytz`
			`import time`

			`# Function to scrape articles using Playwright and generate an RSS feed`
			`def scrape_and_generate_rss(url):`
			`articles = []`
			`seen_urls = set() # Set to track seen URLs and avoid duplicates`

			`# Use Playwright to load the page`
			`with sync_playwright() as p:`
			`browser = p.chromium.launch(headless=True)`
			`page = browser.new_page()`

			`# Set a longer timeout for loading the page`
added Dockerfile for container build 2024-10-08 21:55:13 +02:00			`page.set_default_navigation_timeout(120000)`
initial commit 2024-10-03 20:34:27 +02:00
			`# Load the Warhammer Community page`
			`page.goto(url, wait_until="networkidle")`

			`# Simulate scrolling to load more content if needed`
			`for _ in range(10):`
			`page.evaluate("window.scrollBy(0, document.body.scrollHeight)")`
			`time.sleep(2)`

			`# Get the fully rendered HTML content`
			`html = page.content()`
			`browser.close()`

			`# Parse the HTML content with BeautifulSoup`
			`soup = BeautifulSoup(html, 'html.parser')`

			`# Define a timezone (UTC in this case)`
			`timezone = pytz.UTC`

			`# Find all articles in the page`
			`for article in soup.find_all('article'):`
			`# Extract the title`
			`title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')`
			`title = title_tag.text.strip() if title_tag else 'No title'`

			`# Extract the link`
			`link_tag = article.find('a', href=True)`
			`link = link_tag['href'] if link_tag else None`

			`# Skip this entry if the link is None or the URL has already been seen`
			`if not link or link in seen_urls:`
			`continue # Skip duplicates or invalid entries`

			`seen_urls.add(link) # Add the URL to the set of seen URLs`

			`# Extract the publication date and ignore reading time`
			`date = None`
			`for time_tag in article.find_all('time'):`
			`raw_date = time_tag.text.strip()`

			`# Ignore "min" time blocks (reading time)`
			`if "min" not in raw_date.lower():`
			`try:`
			`# Parse the actual date (e.g., "02 Oct 24")`
			`date = datetime.strptime(raw_date, '%d %b %y')`
			`date = timezone.localize(date) # Localize with UTC`
			`break # Stop after finding the correct date`
			`except ValueError:`
			`continue`

			`# If no valid date is found, use the current date as a fallback`
			`if not date:`
			`date = datetime.now(timezone)`

			`# Add the article to the list with its publication date`
			`articles.append({`
			`'title': title,`
			`'link': link,`
			`'date': date`
			`})`

			`# Sort the articles by publication date (newest first)`
			`articles.sort(key=lambda x: x['date'], reverse=True)`

			`# Initialize the RSS feed generator`
			`fg = FeedGenerator()`
			`fg.title('Warhammer Community RSS Feed')`
			`fg.link(href=url)`
			`fg.description('Latest Warhammer Community Articles')`

			`# Add the sorted articles to the RSS feed`
			`for article in articles:`
			`fe = fg.add_entry()`
			`fe.title(article['title'])`
			`fe.link(href=article['link'])`
			`fe.pubDate(article['date'])`

			`# Generate the RSS feed`
			`rss_feed = fg.rss_str(pretty=True)`

			`# Save the RSS feed to a file`
added Dockerfile for container build 2024-10-08 21:55:13 +02:00			`with open('/app/output/warhammer_rss_feed.xml', 'wb') as f:`
initial commit 2024-10-03 20:34:27 +02:00			`f.write(rss_feed)`

added Dockerfile for container build 2024-10-08 21:55:13 +02:00			`with open('/app/output/page.html','w', encoding='utf-8') as f:`
			`f.write(soup.prettify())`
initial commit 2024-10-03 20:34:27 +02:00			`print('RSS feed generated and saved as warhammer_rss_feed.xml')`

			`# Run the function`
			`scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')`