initial commit
commit 3d44106f02
.gitignore vendored Normal file (2 lines)
@@ -0,0 +1,2 @@
*.xml
.python-version
main.py Normal file (107 lines)
@@ -0,0 +1,107 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from datetime import datetime
import pytz
import time

# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url):
    articles = []
    seen_urls = set()  # Set to track seen URLs and avoid duplicates

    # Use Playwright to load the page
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Set a longer timeout for loading the page
        page.set_default_navigation_timeout(60000)

        # Load the Warhammer Community page
        page.goto(url, wait_until="networkidle")

        # Simulate scrolling to load more content if needed
        for _ in range(10):
            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            time.sleep(2)

        # Get the fully rendered HTML content
        html = page.content()
        browser.close()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Define a timezone (UTC in this case)
    timezone = pytz.UTC

    # Find all articles in the page
    for article in soup.find_all('article'):
        # Extract the title
        title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')
        title = title_tag.text.strip() if title_tag else 'No title'

        # Extract the link
        link_tag = article.find('a', href=True)
        link = link_tag['href'] if link_tag else None

        # Skip this entry if the link is None or the URL has already been seen
        if not link or link in seen_urls:
            continue  # Skip duplicates or invalid entries

        seen_urls.add(link)  # Add the URL to the set of seen URLs

        # Extract the publication date and ignore reading time
        date = None
        for time_tag in article.find_all('time'):
            raw_date = time_tag.text.strip()

            # Ignore "min" time blocks (reading time)
            if "min" not in raw_date.lower():
                try:
                    # Parse the actual date (e.g., "02 Oct 24")
                    date = datetime.strptime(raw_date, '%d %b %y')
                    date = timezone.localize(date)  # Localize with UTC
                    break  # Stop after finding the correct date
                except ValueError:
                    continue

        # If no valid date is found, use the current date as a fallback
        if not date:
            date = datetime.now(timezone)

        # Add the article to the list with its publication date
        articles.append({
            'title': title,
            'link': link,
            'date': date
        })

    # Sort the articles by publication date (newest first)
    articles.sort(key=lambda x: x['date'], reverse=True)

    # Initialize the RSS feed generator
    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    # Add the sorted articles to the RSS feed
    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    # Generate the RSS feed
    rss_feed = fg.rss_str(pretty=True)

    # Save the RSS feed to a file
    with open('warhammer_rss_feed.xml', 'wb') as f:
        f.write(rss_feed)

    print('RSS feed generated and saved as warhammer_rss_feed.xml')

# Run the function
scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
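For a quick sanity check of the output, the snippet below (an illustrative sketch, not part of the commit) parses the generated file with Python's standard library and prints each item, which should come out newest-first given the sort above:

import xml.etree.ElementTree as ET

# Walk the RSS structure (rss > channel > item) written by main.py
tree = ET.parse('warhammer_rss_feed.xml')
for item in tree.getroot().iter('item'):
    print(item.findtext('pubDate'), '-', item.findtext('title'))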
requirements.txt Normal file (5 lines)
@@ -0,0 +1,5 @@
requests
bs4
feedgen
playwright
pytz
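Note that Playwright needs browser binaries in addition to the pip package, so a fresh setup is typically pip install -r requirements.txt followed by playwright install chromium. Also, requests is pinned here but is never imported by main.py.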