initial commit

2024-10-03 12:34:27 -06:00 · 2024-10-03 12:34:27 -06:00 · 3d44106f02
commit 3d44106f02
3 changed files with 114 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+*.xml
+.python-version
--- a/main.py
+++ b/main.py
@ -0,0 +1,107 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+from feedgen.feed import FeedGenerator
+from datetime import datetime
+import pytz
+import time
+
+# Function to scrape articles using Playwright and generate an RSS feed
+def scrape_and_generate_rss(url):
+    """Scrape article cards from ``url`` and write them out as an RSS feed.
+
+    The page is rendered in headless Chromium, scrolled ten times so that
+    lazily loaded content has a chance to appear, and then parsed with
+    BeautifulSoup. Every ``<article>`` element that contains a link becomes
+    one feed entry. The feed is written to ``warhammer_rss_feed.xml`` in the
+    current working directory.
+
+    Args:
+        url: Address of the page to scrape. It is also used as the feed's
+            own link and as the base for resolving relative article links.
+
+    Returns:
+        None. The result is the XML file written to disk.
+
+    Errors raised by Playwright while loading or reading the page are not
+    caught here; the browser is closed before they propagate.
+    """
+    # Imported locally so this function does not depend on the module's
+    # import block being extended.
+    from urllib.parse import urljoin
+
+    articles = []
+    seen_urls = set()  # Resolved URLs already emitted, to avoid duplicates
+
+    # Use Playwright to load the page
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        try:
+            page = browser.new_page()
+
+            # Set a longer timeout for loading the page
+            page.set_default_navigation_timeout(60000)
+
+            # Load the page and wait until network activity settles
+            page.goto(url, wait_until="networkidle")
+
+            # Simulate scrolling to load more content if needed
+            for _ in range(10):
+                page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
+                # Use Playwright's own wait rather than time.sleep so the
+                # sync API can keep processing browser events while paused.
+                page.wait_for_timeout(2000)
+
+            # Get the fully rendered HTML content
+            html = page.content()
+        finally:
+            # Close the browser even if navigation or scrolling failed
+            browser.close()
+
+    # Parse the HTML content with BeautifulSoup
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Define a timezone (UTC in this case)
+    timezone = pytz.UTC
+
+    # Find all articles in the page
+    for article in soup.find_all('article'):
+        # Extract the link first; entries without one are skipped entirely
+        link_tag = article.find('a', href=True)
+        href = link_tag['href'].strip() if link_tag else ''
+        if not href:
+            continue
+
+        # Feed readers consume links outside the context of the site, so
+        # relative hrefs must be resolved against the page URL. Absolute
+        # hrefs pass through urljoin unchanged.
+        link = urljoin(url, href)
+        if link in seen_urls:
+            continue  # Skip duplicates
+        seen_urls.add(link)
+
+        # Extract the title
+        title_tag = (
+            article.find('h3', class_='newsCard-title-sm')
+            or article.find('h3', class_='newsCard-title-lg')
+        )
+        title = title_tag.text.strip() if title_tag else 'No title'
+
+        # Extract the publication date and ignore reading time
+        date = None
+        for time_tag in article.find_all('time'):
+            raw_date = time_tag.text.strip()
+
+            # Ignore "min" time blocks (reading time)
+            if "min" in raw_date.lower():
+                continue
+
+            try:
+                # Parse the actual date (e.g., "02 Oct 24")
+                parsed = datetime.strptime(raw_date, '%d %b %y')
+            except ValueError:
+                continue
+
+            date = timezone.localize(parsed)  # Localize with UTC
+            break  # Stop after finding the correct date
+
+        # If no valid date is found, use the current date as a fallback
+        if date is None:
+            date = datetime.now(timezone)
+
+        # Add the article to the list with its publication date
+        articles.append({
+            'title': title,
+            'link': link,
+            'date': date
+        })
+
+    # Sort the articles by publication date (newest first)
+    articles.sort(key=lambda x: x['date'], reverse=True)
+
+    # Initialize the RSS feed generator
+    fg = FeedGenerator()
+    fg.title('Warhammer Community RSS Feed')
+    fg.link(href=url)
+    fg.description('Latest Warhammer Community Articles')
+
+    # Add the sorted articles to the RSS feed
+    for article in articles:
+        fe = fg.add_entry()
+        fe.title(article['title'])
+        fe.link(href=article['link'])
+        fe.pubDate(article['date'])
+
+    # Generate the RSS feed (rss_str returns bytes, hence the 'wb' mode)
+    rss_feed = fg.rss_str(pretty=True)
+
+    # Save the RSS feed to a file
+    with open('warhammer_rss_feed.xml', 'wb') as f:
+        f.write(rss_feed)
+
+    print('RSS feed generated and saved as warhammer_rss_feed.xml')
+
+# Run the scraper only when this file is executed as a script, so that
+# importing the module does not launch a browser or write the feed file.
+if __name__ == "__main__":
+    scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
+requests
+bs4
+feedgen
+playwright
+pytz