commit 3d44106f02031d1ed2b2ceae31eb6e4efe458424
Author: Phil
Date:   Thu Oct 3 12:34:27 2024 -0600

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..189e234
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.xml
+.python-version
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..739e0c9
--- /dev/null
+++ b/main.py
@@ -0,0 +1,107 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+from feedgen.feed import FeedGenerator
+from datetime import datetime
+import pytz
+import time
+
+# Function to scrape articles using Playwright and generate an RSS feed
+def scrape_and_generate_rss(url):
+    articles = []
+    seen_urls = set()  # Set to track seen URLs and avoid duplicates
+
+    # Use Playwright to load the page
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        page = browser.new_page()
+
+        # Set a longer timeout for loading the page
+        page.set_default_navigation_timeout(60000)
+
+        # Load the Warhammer Community page
+        page.goto(url, wait_until="networkidle")
+
+        # Simulate scrolling to load more content if needed
+        for _ in range(10):
+            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
+            time.sleep(2)
+
+        # Get the fully rendered HTML content
+        html = page.content()
+        browser.close()
+
+    # Parse the HTML content with BeautifulSoup
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Define a timezone (UTC in this case)
+    timezone = pytz.UTC
+
+    # Find all articles in the page
+    for article in soup.find_all('article'):
+        # Extract the title
+        title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')
+        title = title_tag.text.strip() if title_tag else 'No title'
+
+        # Extract the link
+        link_tag = article.find('a', href=True)
+        link = link_tag['href'] if link_tag else None
+
+        # Skip this entry if the link is None or the URL has already been seen
+        if not link or link in seen_urls:
+            continue  # Skip duplicates or invalid entries
+
+        seen_urls.add(link)  # Add the URL to the set of seen URLs
+
+        # Extract the publication date and ignore reading time
+        date = None
+        for time_tag in article.find_all('time'):
+            raw_date = time_tag.text.strip()
+
+            # Ignore "min" time blocks (reading time)
+            if "min" not in raw_date.lower():
+                try:
+                    # Parse the actual date (e.g., "02 Oct 24")
+                    date = datetime.strptime(raw_date, '%d %b %y')
+                    date = timezone.localize(date)  # Localize with UTC
+                    break  # Stop after finding the correct date
+                except ValueError:
+                    continue
+
+        # If no valid date is found, use the current date as a fallback
+        if not date:
+            date = datetime.now(timezone)
+
+        # Add the article to the list with its publication date
+        articles.append({
+            'title': title,
+            'link': link,
+            'date': date
+        })
+
+    # Sort the articles by publication date (newest first)
+    articles.sort(key=lambda x: x['date'], reverse=True)
+
+    # Initialize the RSS feed generator
+    fg = FeedGenerator()
+    fg.title('Warhammer Community RSS Feed')
+    fg.link(href=url)
+    fg.description('Latest Warhammer Community Articles')
+
+    # Add the sorted articles to the RSS feed
+    for article in articles:
+        fe = fg.add_entry()
+        fe.title(article['title'])
+        fe.link(href=article['link'])
+        fe.pubDate(article['date'])
+
+    # Generate the RSS feed
+    rss_feed = fg.rss_str(pretty=True)
+
+    # Save the RSS feed to a file
+    with open('warhammer_rss_feed.xml', 'wb') as f:
+        f.write(rss_feed)
+
+    print('RSS feed generated and saved as warhammer_rss_feed.xml')
+
+# Run the function
+scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5febfa4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+requests
+bs4
+feedgen
+playwright
+pytz
\ No newline at end of file
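
The date handling in main.py rests on two details: reading-time strings such as "4 min" also appear in <time> tags, and real publication dates arrive in the "02 Oct 24" form that the '%d %b %y' format matches. A standalone sketch of that filter, with made-up sample strings for illustration:

    from datetime import datetime

    # Sample <time> contents: a reading time and a publication date (illustrative only)
    for raw in ('4 min', '02 Oct 24'):
        if 'min' in raw.lower():
            continue  # skip reading-time entries, as main.py does
        print(datetime.strptime(raw, '%d %b %y'))  # -> 2024-10-02 00:00:00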
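
One way to sanity-check the script's output is to read the generated file back with the standard library. This sketch is not part of the commit; it assumes main.py has already run and written warhammer_rss_feed.xml to the working directory:

    import xml.etree.ElementTree as ET

    # feedgen emits standard RSS 2.0: <rss><channel><item>...</item></channel></rss>
    channel = ET.parse('warhammer_rss_feed.xml').getroot().find('channel')
    print(channel.findtext('title'))  # Warhammer Community RSS Feed
    for item in channel.iter('item'):
        print(item.findtext('pubDate'), '|', item.findtext('title'))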