"""Tests for parser module.

NOTE(review): this file reached review with its line structure collapsed and
with every HTML-looking span removed. The Python layout below is restored from
the surviving tokens. Places where content is missing or had to be
reconstructed are marked with NOTE(review) comments and must be re-checked
against version control before these tests are trusted.
"""

from datetime import datetime
from unittest.mock import patch

import pytest
import pytz

from src.rss_scraper.config import Config
from src.rss_scraper.exceptions import ParseError
from src.rss_scraper.parser import extract_articles_from_html, sanitize_text


class TestSanitizeText:
    """Test text sanitization functionality."""

    def test_sanitize_normal_text(self):
        """Plain text is returned unchanged."""
        text = "Normal article title"
        result = sanitize_text(text)
        assert result == "Normal article title"

    def test_sanitize_none_text(self):
        """None is replaced by the "No title" placeholder."""
        result = sanitize_text(None)
        assert result == "No title"

    def test_sanitize_empty_text(self):
        """An empty string is replaced by the "No title" placeholder."""
        result = sanitize_text("")
        assert result == "No title"

    def test_sanitize_whitespace_text(self):
        """Whitespace-only text is replaced by the "No title" placeholder."""
        # NOTE(review): runs of whitespace were collapsed in the copy under
        # review; the original literal may have held more than one space.
        result = sanitize_text(" ")
        assert result == "No title"

    def test_remove_dangerous_patterns(self):
        """Test removal of dangerous patterns."""
        # NOTE(review): the dangerous markup that belongs inside this literal
        # was stripped, so the input below is currently harmless and the
        # assertion is vacuous. Restore the original payload.
        dangerous_text = "Title with content"
        result = sanitize_text(dangerous_text)
        # NOTE(review): the original assertion was cut off right after
        # `assert "`. The check below is a reconstruction (presumably the
        # text that followed looked like a tag and was stripped) - confirm.
        assert "<script" not in result

    # NOTE(review): everything between the assertion above and the first HTML
    # fixture below is missing from the copy under review. Further
    # sanitize_text tests may have been lost here.


# NOTE(review): the class header, the first test's `def` line and its
# docstring were lost. The names used here are reconstructed from the naming
# pattern of the surviving tests - confirm against version control.
class TestExtractArticlesFromHtml:
    """Test article extraction from HTML.

    NOTE(review): the HTML fixtures in this class kept only their text nodes;
    all tags, attributes (including hrefs) and possibly date elements were
    stripped. The fixture text is reproduced exactly as it survived. Tests
    that expect articles to be found will fail until the markup is restored.
    """

    def test_extract_articles_valid_html(self):
        """Test extraction from valid HTML."""
        # NOTE(review): fixture markup lost; only text nodes remain.
        html = """

Test Article 1

Read more

Test Article 2

Read more
"""
        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 2
        assert articles[0]['title'] == "Test Article 2"  # Sorted by date, newest first
        assert articles[1]['title'] == "Test Article 1"
        assert "warhammer-community.com" in articles[0]['link']
        assert "warhammer-community.com" in articles[1]['link']

    def test_extract_articles_no_articles(self):
        """Test extraction from HTML with no articles."""
        # NOTE(review): fixture markup lost; only text nodes remain.
        html = """
No articles here
"""
        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 0

    def test_extract_articles_duplicate_links(self):
        """Test that duplicate links are filtered out."""
        # NOTE(review): fixture markup lost; the duplicated href is no longer
        # visible, only the text nodes remain.
        html = """

Test Article 1

Read more

Test Article 1 Duplicate

Read more
"""
        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Duplicate should be filtered out
        assert articles[0]['title'] == "Test Article 1"

    def test_extract_articles_invalid_links(self):
        """Test handling of articles with invalid links."""
        # NOTE(review): fixture markup lost; the valid, invalid and missing
        # links this test relies on are no longer visible.
        html = """

Valid Article

Read more

Invalid Article

Read more

No Link Article

"""
        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Only valid article should be included
        assert articles[0]['title'] == "Valid Article"

    def test_extract_articles_date_parsing(self):
        """Test parsing of various date formats."""
        # NOTE(review): fixture markup lost; the date strings this test
        # exercises are no longer visible.
        html = """

Article with good date

Read more

Article with bad date

Read more

Article with reading time

Read more
"""
        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 3

        # Check that dates are parsed correctly
        for article in articles:
            assert isinstance(article['date'], datetime)
            assert article['date'].tzinfo is not None

    def test_extract_articles_malformed_html(self):
        """Test handling of malformed HTML."""
        # NOTE(review): the unclosed tags that made this input malformed were
        # stripped; only the text survives, so the input is currently plain
        # text rather than malformed HTML.
        malformed_html = "Unclosed tags"
        base_url = "https://www.warhammer-community.com"

        # Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully
        articles = extract_articles_from_html(malformed_html, base_url)
        assert isinstance(articles, list)

    def test_extract_articles_invalid_html(self):
        """Test handling of completely invalid HTML."""
        # NOTE(review): patching 'bs4.BeautifulSoup' only affects code that
        # looks the name up on the bs4 module at call time. If the parser
        # module binds it with `from bs4 import BeautifulSoup`, the patch
        # target should be the parser module's own name - confirm.
        # NOTE(review): the first argument may originally have contained
        # markup that was stripped, leaving the empty string.
        with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")), \
                pytest.raises(ParseError):
            extract_articles_from_html("", "https://example.com")