initial commit

.gitignore (vendored, normal file, 2 lines added)
@@ -0,0 +1,2 @@
*.xml
.python-version

main.py (normal file, 107 lines added)
@@ -0,0 +1,107 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from datetime import datetime
import pytz
import time

# Function to scrape articles using Playwright and generate an RSS feed
def scrape_and_generate_rss(url):
    articles = []
    seen_urls = set()  # Set to track seen URLs and avoid duplicates

    # Use Playwright to load the page
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Set a longer timeout for loading the page
        page.set_default_navigation_timeout(60000)

        # Load the Warhammer Community page
        page.goto(url, wait_until="networkidle")

        # Simulate scrolling to load more content if needed
        for _ in range(10):
            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            time.sleep(2)

        # Get the fully rendered HTML content
        html = page.content()
        browser.close()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Define a timezone (UTC in this case)
    timezone = pytz.UTC

    # Find all articles in the page
    for article in soup.find_all('article'):
        # Extract the title
        title_tag = article.find('h3', class_='newsCard-title-sm') or article.find('h3', class_='newsCard-title-lg')
        title = title_tag.text.strip() if title_tag else 'No title'

        # Extract the link
        link_tag = article.find('a', href=True)
        link = link_tag['href'] if link_tag else None

        # Skip this entry if the link is None or the URL has already been seen
        if not link or link in seen_urls:
            continue  # Skip duplicates or invalid entries

        seen_urls.add(link)  # Add the URL to the set of seen URLs

        # Extract the publication date and ignore reading time
        date = None
        for time_tag in article.find_all('time'):
            raw_date = time_tag.text.strip()

            # Ignore "min" time blocks (reading time)
            if "min" not in raw_date.lower():
                try:
                    # Parse the actual date (e.g., "02 Oct 24")
                    date = datetime.strptime(raw_date, '%d %b %y')
                    date = timezone.localize(date)  # Localize with UTC
                    break  # Stop after finding the correct date
                except ValueError:
                    continue

        # If no valid date is found, use the current date as a fallback
        if not date:
            date = datetime.now(timezone)

        # Add the article to the list with its publication date
        articles.append({
            'title': title,
            'link': link,
            'date': date
        })

    # Sort the articles by publication date (newest first)
    articles.sort(key=lambda x: x['date'], reverse=True)

    # Initialize the RSS feed generator
    fg = FeedGenerator()
    fg.title('Warhammer Community RSS Feed')
    fg.link(href=url)
    fg.description('Latest Warhammer Community Articles')

    # Add the sorted articles to the RSS feed
    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])

    # Generate the RSS feed
    rss_feed = fg.rss_str(pretty=True)

    # Save the RSS feed to a file
    with open('warhammer_rss_feed.xml', 'wb') as f:
        f.write(rss_feed)

    print('RSS feed generated and saved as warhammer_rss_feed.xml')

# Run the function
scrape_and_generate_rss('https://www.warhammer-community.com/en-gb/')

requirements.txt (normal file, 5 lines added)
@@ -0,0 +1,5 @@
requests
bs4
feedgen
playwright
pytz