110 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			110 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from playwright.sync_api import sync_playwright
 | |
| from bs4 import BeautifulSoup
 | |
| from feedgen.feed import FeedGenerator
 | |
| from datetime import datetime
 | |
| import pytz
 | |
| import time
 | |
| 
 | |
def _parse_published_date(article, tz):
    """Return the publication date found in *article*, or ``None``.

    Scans every ``<time>`` tag inside the article element. Tags whose text
    contains "min" are treated as reading-time badges and skipped; the first
    remaining tag whose text parses as ``'%d %b %y'`` (e.g. "02 Oct 24") wins.

    Args:
        article: A BeautifulSoup element for one ``<article>`` card.
        tz: A pytz timezone used to make the parsed date timezone-aware.

    Returns:
        A timezone-aware ``datetime``, or ``None`` when no tag parses.
    """
    for time_tag in article.find_all('time'):
        raw_date = time_tag.text.strip()
        if "min" in raw_date.lower():
            continue  # Reading time (e.g. "5 min"), not a date
        try:
            return tz.localize(datetime.strptime(raw_date, '%d %b %y'))
        except ValueError:
            continue  # Not in the expected format; try the next <time> tag
    return None


def scrape_and_generate_rss(url):
    """Scrape article cards from *url* and write them out as an RSS feed.

    The page is rendered in headless Chromium via Playwright, scrolled ten
    times to trigger lazy loading, and parsed with BeautifulSoup. Each
    ``<article>`` element with a link becomes one feed entry, newest first.

    Side effects:
        Writes the feed to ``/app/output/warhammer_rss_feed.xml`` and the
        prettified rendered HTML to ``/app/output/page.html``, then prints a
        confirmation message.

    Args:
        url: Address of the page to scrape. Also used as the feed's link and
            as the base for resolving relative article links.

    Returns:
        None.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    # Render the page with Playwright so client-side content is present
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            # Allow up to two minutes for navigation
            page.set_default_navigation_timeout(120000)
            page.goto(url, wait_until="networkidle")

            # Scroll repeatedly to load more content if needed
            for _ in range(10):
                page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
                # wait_for_timeout keeps Playwright's dispatcher running,
                # unlike time.sleep, which blocks it.
                page.wait_for_timeout(2000)

            # Get the fully rendered HTML content
            html = page.content()
        finally:
            # Close the browser even if navigation or scrolling failed
            browser.close()

    soup = BeautifulSoup(html, 'html.parser')

    utc = pytz.UTC
    # One shared fallback timestamp so all undated articles sort together
    fallback_date = datetime.now(utc)

    articles = []
    seen_urls = set()  # Absolute URLs already collected, to avoid duplicates

    for article in soup.find_all('article'):
        title_tag = (
            article.find('h3', class_='newsCard-title-sm')
            or article.find('h3', class_='newsCard-title-lg')
        )
        title = title_tag.text.strip() if title_tag else 'No title'

        link_tag = article.find('a', href=True)
        href = link_tag['href'] if link_tag else None
        if not href:
            continue  # No usable link; nothing to point the entry at

        # Resolve relative hrefs so the feed always carries absolute URLs,
        # and de-duplicate on the resolved form.
        link = urljoin(url, href)
        if link in seen_urls:
            continue
        seen_urls.add(link)

        articles.append({
            'title': title,
            'link': link,
            'date': _parse_published_date(article, utc) or fallback_date,
        })

    # Sort the articles by publication date (newest first)
    articles.sort(key=lambda entry: entry['date'], reverse=True)

    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    for article in articles:
        # add_entry() prepends by default, which would reverse the sorted
        # order; append to keep the newest article first in the output.
        fe = fg.add_entry(order='append')
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    rss_feed = fg.rss_str(pretty=True)

    # rss_str() returns bytes, hence binary mode
    with open('/app/output/warhammer_rss_feed.xml', 'wb') as f:
        f.write(rss_feed)

    # Keep a copy of the rendered page for debugging selector changes
    with open('/app/output/page.html', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())

    print('RSS feed generated and saved as warhammer_rss_feed.xml')
 | |
| 
 | |
# Run the scraper only when this file is executed as a script, so importing
# the module does not launch a browser or write files.
if __name__ == "__main__":
    scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')
 |