"""Tests for parser module."""
|
|
|
|
import pytest
|
|
from datetime import datetime
|
|
import pytz
|
|
from unittest.mock import patch
|
|
|
|
from src.rss_scraper.parser import sanitize_text, extract_articles_from_html
|
|
from src.rss_scraper.exceptions import ParseError
|
|
from src.rss_scraper.config import Config
|
|
|
|
|
|
class TestSanitizeText:
    """Test text sanitization functionality."""

    def test_sanitize_normal_text(self):
        """Test sanitization of normal text."""
        text = "Normal article title"
        result = sanitize_text(text)
        assert result == "Normal article title"

    def test_sanitize_none_text(self):
        """Test sanitization of None text."""
        result = sanitize_text(None)
        assert result == "No title"

    def test_sanitize_empty_text(self):
        """Test sanitization of empty text."""
        result = sanitize_text("")
        assert result == "No title"

    def test_sanitize_whitespace_text(self):
        """Test sanitization of whitespace-only text."""
        result = sanitize_text("   ")
        assert result == "No title"

    def test_remove_dangerous_patterns(self):
        """Test removal of dangerous patterns."""
        dangerous_text = "Title with <script>alert('xss')</script> content"
        result = sanitize_text(dangerous_text)
        assert "<script" not in result
        assert "</script" not in result
        assert "alert('xss')" in result  # Only script tags should be removed

    def test_length_limit(self):
        """Test that text is limited to max length."""
        long_text = "a" * 1000
        result = sanitize_text(long_text)
        assert len(result) <= Config.MAX_TITLE_LENGTH

    def test_case_insensitive_pattern_removal(self):
        """Test that dangerous patterns are removed case-insensitively."""
        text = "Title with <SCRIPT>alert('xss')</SCRIPT> and javascript: protocol"
        result = sanitize_text(text)
        assert "<SCRIPT" not in result
        assert "</SCRIPT" not in result
        assert "javascript:" not in result
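

# The None/empty/whitespace cases above could also be collapsed into one
# parametrized test; an equivalent sketch, not part of the original suite:
@pytest.mark.parametrize("value", [None, "", "   "])
def test_sanitize_blank_inputs_parametrized(value):
    """Parametrized equivalent of the blank-input checks in TestSanitizeText."""
    assert sanitize_text(value) == "No title"
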

class TestExtractArticlesFromHtml:
    """Test article extraction from HTML."""

    def test_extract_articles_valid_html(self):
        """Test extraction from valid HTML with articles."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Test Article 1</h3>
                    <a href="/article/test-1">Read more</a>
                    <time>01 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Test Article 2</h3>
                    <a href="https://www.warhammer-community.com/article/test-2">Read more</a>
                    <time>02 Jan 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 2
        assert articles[0]['title'] == "Test Article 2"  # Sorted by date, newest first
        assert articles[1]['title'] == "Test Article 1"
        assert "warhammer-community.com" in articles[0]['link']
        assert "warhammer-community.com" in articles[1]['link']

    def test_extract_articles_no_articles(self):
        """Test extraction from HTML with no articles."""
        html = """
        <html>
            <body>
                <div>No articles here</div>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 0

    def test_extract_articles_duplicate_links(self):
        """Test that duplicate links are filtered out."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Test Article 1</h3>
                    <a href="/article/test-1">Read more</a>
                    <time>01 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Test Article 1 Duplicate</h3>
                    <a href="/article/test-1">Read more</a>
                    <time>02 Jan 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Duplicate should be filtered out
        assert articles[0]['title'] == "Test Article 1"

    def test_extract_articles_invalid_links(self):
        """Test handling of articles with invalid links."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Valid Article</h3>
                    <a href="/article/valid">Read more</a>
                    <time>01 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Invalid Article</h3>
                    <a href="https://malicious-site.com/article">Read more</a>
                    <time>02 Jan 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-sm">No Link Article</h3>
                    <time>03 Jan 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 1  # Only valid article should be included
        assert articles[0]['title'] == "Valid Article"

    def test_extract_articles_date_parsing(self):
        """Test parsing of various date formats."""
        html = """
        <html>
            <body>
                <article>
                    <h3 class="newsCard-title-sm">Article with good date</h3>
                    <a href="/article/1">Read more</a>
                    <time>15 Mar 24</time>
                </article>
                <article>
                    <h3 class="newsCard-title-lg">Article with bad date</h3>
                    <a href="/article/2">Read more</a>
                    <time>Invalid Date Format</time>
                </article>
                <article>
                    <h3 class="newsCard-title-sm">Article with reading time</h3>
                    <a href="/article/3">Read more</a>
                    <time>5 min read</time>
                    <time>20 Mar 24</time>
                </article>
            </body>
        </html>
        """

        base_url = "https://www.warhammer-community.com"
        articles = extract_articles_from_html(html, base_url)

        assert len(articles) == 3

        # Check that dates are parsed correctly
        for article in articles:
            assert isinstance(article['date'], datetime)
            assert article['date'].tzinfo is not None

    def test_extract_articles_malformed_html(self):
        """Test handling of malformed HTML."""
        malformed_html = "<html><body><article><h3>Unclosed tags"

        base_url = "https://www.warhammer-community.com"

        # Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully
        articles = extract_articles_from_html(malformed_html, base_url)
        assert isinstance(articles, list)

    def test_extract_articles_invalid_html(self):
        """Test handling of completely invalid HTML."""
        # Force the underlying parser to fail so the ParseError path is exercised
        with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")):
            with pytest.raises(ParseError):
                extract_articles_from_html("<html></html>", "https://example.com")