initial commit
commit 3d44106f02
.gitignore vendored Normal file (2 lines)
@@ -0,0 +1,2 @@
*.xml
.python-version
main.py Normal file (107 lines)
@@ -0,0 +1,107 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from datetime import datetime
import pytz
import time

# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url):
    articles = []
    seen_urls = set()  # Set to track seen URLs and avoid duplicates

    # Use Playwright to load the page
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Set a longer timeout for loading the page
        page.set_default_navigation_timeout(60000)

        # Load the Warhammer Community page
        page.goto(url, wait_until="networkidle")

        # Simulate scrolling to load more content if needed
        for _ in range(10):
            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            time.sleep(2)

        # Get the fully rendered HTML content
        html = page.content()
        browser.close()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Define a timezone (UTC in this case)
    timezone = pytz.UTC

    # Find all articles in the page
    for article in soup.find_all('article'):
        # Extract the title
        title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')
        title = title_tag.text.strip() if title_tag else 'No title'

        # Extract the link
        link_tag = article.find('a', href=True)
        link = link_tag['href'] if link_tag else None

        # Skip this entry if the link is None or the URL has already been seen
        if not link or link in seen_urls:
            continue  # Skip duplicates or invalid entries

        seen_urls.add(link)  # Add the URL to the set of seen URLs

        # Extract the publication date and ignore reading time
        date = None
        for time_tag in article.find_all('time'):
            raw_date = time_tag.text.strip()

            # Ignore "min" time blocks (reading time)
            if "min" not in raw_date.lower():
                try:
                    # Parse the actual date (e.g., "02 Oct 24")
                    date = datetime.strptime(raw_date, '%d %b %y')
                    date = timezone.localize(date)  # Localize with UTC
                    break  # Stop after finding the correct date
                except ValueError:
                    continue

        # If no valid date is found, use the current date as a fallback
        if not date:
            date = datetime.now(timezone)

        # Add the article to the list with its publication date
        articles.append({
            'title': title,
            'link': link,
            'date': date
        })

    # Sort the articles by publication date (newest first)
    articles.sort(key=lambda x: x['date'], reverse=True)

    # Initialize the RSS feed generator
    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    # Add the sorted articles to the RSS feed
    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    # Generate the RSS feed
    rss_feed = fg.rss_str(pretty=True)

    # Save the RSS feed to a file
    with open('warhammer_rss_feed.xml', 'wb') as f:
        f.write(rss_feed)

    print('RSS feed generated and saved as warhammer_rss_feed.xml')

# Run the function
scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
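For a quick sanity check of the output, the snippet below (an illustrative sketch, not part of the commit) parses the generated file with Python's standard library and prints each item, which should come out newest-first given the sort above:

import xml.etree.ElementTree as ET

# Walk the RSS structure (rss > channel > item) written by main.py
tree = ET.parse('warhammer_rss_feed.xml')
for item in tree.getroot().iter('item'):
    print(item.findtext('pubDate'), '-', item.findtext('title'))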
requirements.txt Normal file (5 lines)
@@ -0,0 +1,5 @@
requests
bs4
feedgen
playwright
pytz
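Note that Playwright needs browser binaries in addition to the pip package, so a fresh setup is typically pip install -r requirements.txt followed by playwright install chromium. Also, requests is pinned here but is never imported by main.py.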