"""Scrape the Warhammer Community landing page and publish it as an RSS feed."""

import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import pytz
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from playwright.sync_api import sync_playwright

# Directory that receives both the RSS file and the HTML snapshot.
OUTPUT_DIR = Path('/app/output')

# Value handed to Playwright's default navigation timeout (milliseconds).
NAVIGATION_TIMEOUT_MS = 120000

# Number of scroll steps, and the pause after each one, used to give
# lazily-loaded content a chance to appear.
SCROLL_PASSES = 10
SCROLL_PAUSE_SECONDS = 2


def _fetch_rendered_html(url):
    """Load *url* in headless Chromium, scroll the page, return its HTML."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            # Set a longer timeout for loading the page.
            page.set_default_navigation_timeout(NAVIGATION_TIMEOUT_MS)
            page.goto(url, wait_until="networkidle")

            # Simulate scrolling to load more content if needed.
            for _ in range(SCROLL_PASSES):
                page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
                time.sleep(SCROLL_PAUSE_SECONDS)

            # Get the fully rendered HTML content.
            return page.content()
        finally:
            # Close the browser even when navigation or scrolling raised.
            browser.close()


def _parse_publication_date(article, timezone):
    """Return the first parseable ``<time>`` date in *article*, or None."""
    for time_tag in article.find_all('time'):
        raw_date = time_tag.text.strip()

        # Ignore "min" time blocks (reading time).
        if "min" in raw_date.lower():
            continue

        try:
            # Parse the actual date (e.g., "02 Oct 24").
            parsed = datetime.strptime(raw_date, '%d %b %y')
        except ValueError:
            continue

        # Localize with the supplied timezone and stop at the first match.
        return timezone.localize(parsed)

    return None


def _extract_articles(soup, base_url, timezone):
    """Collect title/link/date dicts for each unique ``<article>`` in *soup*."""
    articles = []
    seen_urls = set()  # Tracks links already emitted, to avoid duplicates.

    for article in soup.find_all('article'):
        # The small-card heading is preferred; fall back to the large one.
        title_tag = (
            article.find('h3', class_='newsCard-title-sm')
            or article.find('h3', class_='newsCard-title-lg')
        )
        title = title_tag.text.strip() if title_tag else 'No title'

        link_tag = article.find('a', href=True)
        href = link_tag['href'] if link_tag else None
        if not href:
            continue  # No usable link: skip this entry.

        # Resolve against the page URL so relative hrefs become absolute
        # links in the feed; already-absolute hrefs pass through unchanged.
        link = urljoin(base_url, href)
        if link in seen_urls:
            continue  # Skip duplicates.
        seen_urls.add(link)

        date = _parse_publication_date(article, timezone)
        if date is None:
            # If no valid date is found, use the current date as a fallback.
            date = datetime.now(timezone)

        articles.append({
            'title': title,
            'link': link,
            'date': date,
        })

    return articles


def _build_feed(articles, url):
    """Render *articles*, in the order given, as RSS bytes."""
    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    for article in articles:
        # feedgen prepends new entries by default, which would reverse the
        # caller's ordering; append so the feed keeps the order given.
        fe = fg.add_entry(order='append')
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    return fg.rss_str(pretty=True)


# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url):
    """Scrape the page at *url* and write an RSS feed plus an HTML snapshot.

    The page is rendered in headless Chromium, every ``<article>`` element
    is turned into a feed entry (de-duplicated by absolute link, newest
    first), and two files are written into ``/app/output``:
    ``warhammer_rss_feed.xml`` (the feed) and ``page.html`` (the prettified
    HTML that was parsed).

    Args:
        url: Address of the page to scrape; also used as the feed's link
            and as the base for resolving relative article links.

    Returns:
        None. Results are written to disk and a confirmation is printed.
    """
    html = _fetch_rendered_html(url)

    # Parse the HTML content with BeautifulSoup.
    soup = BeautifulSoup(html, 'html.parser')

    # All dates are expressed in UTC.
    articles = _extract_articles(soup, url, pytz.UTC)

    # Sort the articles by publication date (newest first).
    articles.sort(key=lambda x: x['date'], reverse=True)

    rss_feed = _build_feed(articles, url)

    # Make sure the output directory exists before writing into it.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Save the RSS feed to a file.
    with open(OUTPUT_DIR / 'warhammer_rss_feed.xml', 'wb') as f:
        f.write(rss_feed)

    # Save the parsed page next to the feed.
    with open(OUTPUT_DIR / 'page.html', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())

    print('RSS feed generated and saved as warhammer_rss_feed.xml')


# Run the function
scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')