"""Tests for parser module.""" import pytest from datetime import datetime import pytz from unittest.mock import patch from src.rss_scraper.parser import sanitize_text, extract_articles_from_html from src.rss_scraper.exceptions import ParseError from src.rss_scraper.config import Config class TestSanitizeText: """Test text sanitization functionality.""" def test_sanitize_normal_text(self): """Test sanitization of normal text.""" text = "Normal article title" result = sanitize_text(text) assert result == "Normal article title" def test_sanitize_none_text(self): """Test sanitization of None text.""" result = sanitize_text(None) assert result == "No title" def test_sanitize_empty_text(self): """Test sanitization of empty text.""" result = sanitize_text("") assert result == "No title" def test_sanitize_whitespace_text(self): """Test sanitization of whitespace-only text.""" result = sanitize_text(" ") assert result == "No title" def test_remove_dangerous_patterns(self): """Test removal of dangerous patterns.""" dangerous_text = "Title with content" result = sanitize_text(dangerous_text) assert "

Test Article 1

Test Article 2

Test Article 1

Test Article 1 Duplicate

Valid Article

Invalid Article

No Link Article

03 Jan 24

""" base_url = "https://www.warhammer-community.com" articles = extract_articles_from_html(html, base_url) assert len(articles) == 1 # Only valid article should be included assert articles[0]['title'] == "Valid Article" def test_extract_articles_date_parsing(self): """Test parsing of various date formats.""" html = """

Article with good date

Article with bad date

Article with reading time

Unclosed tags" base_url = "https://www.warhammer-community.com" # Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully articles = extract_articles_from_html(malformed_html, base_url) assert isinstance(articles, list) def test_extract_articles_invalid_html(self): """Test handling of completely invalid HTML.""" with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")): with pytest.raises(ParseError): extract_articles_from_html("", "https://example.com")