# rss_warhammer-community/main.py

import os
import time
from datetime import datetime
from urllib.parse import urljoin

import pytz
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from playwright.sync_api import sync_playwright

def _fetch_rendered_html(url, scroll_rounds=10, scroll_pause_ms=2000):
    """Return the rendered HTML of *url* after scrolling the page in a headless browser."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            # Allow up to two minutes for navigation to finish
            page.set_default_navigation_timeout(120000)
            page.goto(url, wait_until="networkidle")

            # Scroll repeatedly so that content loaded on scroll gets rendered
            for _ in range(scroll_rounds):
                page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
                # Pause through Playwright rather than time.sleep(): the sync
                # API relies on its own event loop continuing to run.
                page.wait_for_timeout(scroll_pause_ms)

            return page.content()
        finally:
            # Close the browser even when navigation or scrolling raised
            browser.close()


def _parse_publication_date(article):
    """Return the article's date as a UTC-aware datetime, or the current time if none parses."""
    for time_tag in article.find_all('time'):
        raw_date = time_tag.text.strip()

        # <time> blocks containing "min" hold the reading time, not a date
        if "min" in raw_date.lower():
            continue

        try:
            # Expected format is e.g. "02 Oct 24"
            parsed = datetime.strptime(raw_date, '%d %b %y')
        except ValueError:
            continue
        return pytz.UTC.localize(parsed)

    # No usable date on the card: fall back to the current time
    return datetime.now(pytz.UTC)


def _extract_articles(soup, base_url):
    """Return a list of unique {'title', 'link', 'date'} dicts, one per <article> that has a link."""
    articles = []
    seen_urls = set()

    for article in soup.find_all('article'):
        link_tag = article.find('a', href=True)
        href = link_tag['href'].strip() if link_tag else ''
        if not href:
            continue

        # Resolve relative hrefs against the page URL; absolute hrefs are
        # returned unchanged by urljoin.
        link = urljoin(base_url, href)
        if link in seen_urls:
            continue
        seen_urls.add(link)

        title_tag = (
            article.find('h3', class_='newsCard-title-sm')
            or article.find('h3', class_='newsCard-title-lg')
        )
        title = title_tag.text.strip() if title_tag else 'No title'

        articles.append({
            'title': title,
            'link': link,
            'date': _parse_publication_date(article),
        })

    return articles


def scrape_and_generate_rss(url, output_dir='/app/output'):
    """Scrape the articles on *url* and write them out as an RSS feed.

    The page is rendered with Playwright, parsed with BeautifulSoup, and every
    <article> element that has a link becomes one feed entry. Entries are
    written newest first.

    Two files are written into *output_dir* (created if it does not exist):
    ``warhammer_rss_feed.xml`` with the feed, and ``page.html`` with the
    prettified HTML that was parsed.

    Args:
        url: Page to scrape; also used as the feed link and as the base for
            resolving relative article links.
        output_dir: Directory that receives the output files.
    """
    html = _fetch_rendered_html(url)
    soup = BeautifulSoup(html, 'html.parser')

    # Newest first
    articles = _extract_articles(soup, url)
    articles.sort(key=lambda item: item['date'], reverse=True)

    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    for article in articles:
        # add_entry() prepends by default, which would reverse the sorted
        # order; append keeps the newest article at the top of the feed.
        fe = fg.add_entry(order='append')
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    rss_feed = fg.rss_str(pretty=True)

    os.makedirs(output_dir, exist_ok=True)

    # rss_str() returns bytes, hence binary mode
    with open(os.path.join(output_dir, 'warhammer_rss_feed.xml'), 'wb') as f:
        f.write(rss_feed)

    # Keep a copy of the parsed page next to the feed
    with open(os.path.join(output_dir, 'page.html'), 'w', encoding='utf-8') as f:
        f.write(soup.prettify())
    print('RSS feed generated and saved as warhammer_rss_feed.xml')

# Script entry: generate the feed for the en-gb Warhammer Community URL
scrape_and_generate_rss(url='https://www.warhammer-community.com/en-gb/')