Add comprehensive RSS scraper implementation with security and testing

- Modular architecture with separate modules for scraping, parsing, security, validation, and caching
- Comprehensive security measures including HTML sanitization, rate limiting, and input validation
- Robust error handling with custom exceptions and retry logic
- HTTP caching with ETags and Last-Modified headers for efficiency (see the sketch after this list)
- Pre-compiled regex patterns for improved performance
- Comprehensive test suite with 66 tests covering all major functionality
- Docker support for containerized deployment
- Configuration management with environment variable support
- Working parser that successfully extracts 32 articles from Warhammer Community
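
For the conditional-request caching mentioned above, here is a minimal sketch of the idea, assuming the requests library; the function and cache names are illustrative, not the committed scraper module:

```python
# Hypothetical illustration of conditional GET caching; names are not from the commit.
import requests

_cache = {}  # url -> {'etag': ..., 'last_modified': ..., 'body': ...}


def fetch_with_http_cache(url):
    """Fetch url, revalidating a cached copy via ETag/Last-Modified."""
    headers = {}
    cached = _cache.get(url)
    if cached:
        if cached.get('etag'):
            headers['If-None-Match'] = cached['etag']
        if cached.get('last_modified'):
            headers['If-Modified-Since'] = cached['last_modified']

    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 304 and cached:
        return cached['body']  # Not modified: reuse the cached body

    response.raise_for_status()
    _cache[url] = {
        'etag': response.headers.get('ETag'),
        'last_modified': response.headers.get('Last-Modified'),
        'body': response.text,
    }
    return response.text
```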

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Commit 25086fc01b (parent e0647325ff), 2025-06-06 09:15:06 -06:00
26 changed files with 15226 additions and 280 deletions

tests/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# Tests package

tests/test_config.py (new file, 116 lines)

@@ -0,0 +1,116 @@
"""Tests for configuration module."""
import pytest
import os
from unittest.mock import patch
from src.rss_scraper.config import Config
class TestConfig:
"""Test configuration functionality."""
def test_default_values(self):
"""Test that default configuration values are set correctly."""
assert Config.MAX_SCROLL_ITERATIONS == 5
assert Config.MAX_CONTENT_SIZE == 10 * 1024 * 1024
assert Config.MAX_TITLE_LENGTH == 500
assert Config.SCROLL_DELAY_SECONDS == 2.0
assert Config.PAGE_TIMEOUT_MS == 120000
assert Config.DEFAULT_URL == 'https://www.warhammer-community.com/en-gb/'
assert Config.DEFAULT_OUTPUT_DIR == '.'
assert Config.RSS_FILENAME == 'warhammer_rss_feed.xml'
assert Config.DEBUG_HTML_FILENAME == 'page.html'
assert Config.FEED_TITLE == 'Warhammer Community RSS Feed'
assert Config.FEED_DESCRIPTION == 'Latest Warhammer Community Articles'
def test_environment_variable_override(self):
"""Test that environment variables override default values."""
with patch.dict(os.environ, {
'MAX_SCROLL_ITERATIONS': '10',
'MAX_CONTENT_SIZE': '20971520', # 20MB
'SCROLL_DELAY_SECONDS': '1.5',
'DEFAULT_URL': 'https://example.com',
'RSS_FILENAME': 'custom_feed.xml'
}):
# Need to reload the config to pick up environment changes
import importlib
import config
importlib.reload(config)
assert config.Config.MAX_SCROLL_ITERATIONS == 10
assert config.Config.MAX_CONTENT_SIZE == 20971520
assert config.Config.SCROLL_DELAY_SECONDS == 1.5
assert config.Config.DEFAULT_URL == 'https://example.com'
assert config.Config.RSS_FILENAME == 'custom_feed.xml'
def test_get_output_dir_with_override(self):
"""Test get_output_dir method with override."""
result = Config.get_output_dir('/custom/path')
assert result == '/custom/path'
def test_get_output_dir_without_override(self):
"""Test get_output_dir method without override."""
result = Config.get_output_dir()
assert result == Config.DEFAULT_OUTPUT_DIR
def test_get_allowed_domains_default(self):
"""Test get_allowed_domains returns default domains."""
domains = Config.get_allowed_domains()
assert 'warhammer-community.com' in domains
assert 'www.warhammer-community.com' in domains
def test_get_allowed_domains_from_env(self):
"""Test get_allowed_domains reads from environment variable."""
with patch.dict(os.environ, {
'ALLOWED_DOMAINS': 'example.com,test.com,another.com'
}):
domains = Config.get_allowed_domains()
assert domains == ['example.com', 'test.com', 'another.com']
def test_validate_config_success(self):
"""Test that valid configuration passes validation."""
# Should not raise any exception
Config.validate_config()
def test_validate_config_negative_scroll_iterations(self):
"""Test validation fails for negative scroll iterations."""
with patch.object(Config, 'MAX_SCROLL_ITERATIONS', -1):
with pytest.raises(ValueError, match="MAX_SCROLL_ITERATIONS must be non-negative"):
Config.validate_config()
def test_validate_config_zero_content_size(self):
"""Test validation fails for zero content size."""
with patch.object(Config, 'MAX_CONTENT_SIZE', 0):
with pytest.raises(ValueError, match="MAX_CONTENT_SIZE must be positive"):
Config.validate_config()
def test_validate_config_zero_title_length(self):
"""Test validation fails for zero title length."""
with patch.object(Config, 'MAX_TITLE_LENGTH', 0):
with pytest.raises(ValueError, match="MAX_TITLE_LENGTH must be positive"):
Config.validate_config()
def test_validate_config_negative_scroll_delay(self):
"""Test validation fails for negative scroll delay."""
with patch.object(Config, 'SCROLL_DELAY_SECONDS', -1.0):
with pytest.raises(ValueError, match="SCROLL_DELAY_SECONDS must be non-negative"):
Config.validate_config()
def test_validate_config_zero_timeout(self):
"""Test validation fails for zero timeout."""
with patch.object(Config, 'PAGE_TIMEOUT_MS', 0):
with pytest.raises(ValueError, match="PAGE_TIMEOUT_MS must be positive"):
Config.validate_config()
def test_validate_config_invalid_url(self):
"""Test validation fails for invalid default URL."""
with patch.object(Config, 'DEFAULT_URL', 'not-a-url'):
with pytest.raises(ValueError, match="DEFAULT_URL must be a valid HTTP/HTTPS URL"):
Config.validate_config()
def test_validate_config_empty_domains(self):
"""Test validation fails for empty allowed domains."""
with patch.object(Config, 'get_allowed_domains', return_value=[]):
with pytest.raises(ValueError, match="ALLOWED_DOMAINS cannot be empty"):
Config.validate_config()
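
For context, a minimal sketch of the Config interface these tests exercise, reconstructed from the assertions above. The real module lives at src/rss_scraper/config.py; the attribute names and error messages come from the tests, but the bodies here are assumptions:

```python
# Hypothetical reconstruction from the tests above, not the committed module.
import os
from urllib.parse import urlparse


class Config:
    # Class attributes are read from the environment at import time, which is
    # why the override test reloads the module after patching os.environ.
    MAX_SCROLL_ITERATIONS = int(os.environ.get('MAX_SCROLL_ITERATIONS', 5))
    MAX_CONTENT_SIZE = int(os.environ.get('MAX_CONTENT_SIZE', 10 * 1024 * 1024))
    MAX_TITLE_LENGTH = int(os.environ.get('MAX_TITLE_LENGTH', 500))
    SCROLL_DELAY_SECONDS = float(os.environ.get('SCROLL_DELAY_SECONDS', 2.0))
    PAGE_TIMEOUT_MS = int(os.environ.get('PAGE_TIMEOUT_MS', 120000))
    DEFAULT_URL = os.environ.get('DEFAULT_URL', 'https://www.warhammer-community.com/en-gb/')
    DEFAULT_OUTPUT_DIR = os.environ.get('DEFAULT_OUTPUT_DIR', '.')
    RSS_FILENAME = os.environ.get('RSS_FILENAME', 'warhammer_rss_feed.xml')
    DEBUG_HTML_FILENAME = os.environ.get('DEBUG_HTML_FILENAME', 'page.html')
    FEED_TITLE = os.environ.get('FEED_TITLE', 'Warhammer Community RSS Feed')
    FEED_DESCRIPTION = os.environ.get('FEED_DESCRIPTION', 'Latest Warhammer Community Articles')

    @classmethod
    def get_output_dir(cls, override=None):
        # An explicit override wins; otherwise fall back to the default
        return override if override is not None else cls.DEFAULT_OUTPUT_DIR

    @classmethod
    def get_allowed_domains(cls):
        # Comma-separated allow-list, defaulting to the Warhammer Community hosts
        env = os.environ.get('ALLOWED_DOMAINS')
        if env:
            return [d.strip() for d in env.split(',')]
        return ['warhammer-community.com', 'www.warhammer-community.com']

    @classmethod
    def validate_config(cls):
        if cls.MAX_SCROLL_ITERATIONS < 0:
            raise ValueError("MAX_SCROLL_ITERATIONS must be non-negative")
        if cls.MAX_CONTENT_SIZE <= 0:
            raise ValueError("MAX_CONTENT_SIZE must be positive")
        if cls.MAX_TITLE_LENGTH <= 0:
            raise ValueError("MAX_TITLE_LENGTH must be positive")
        if cls.SCROLL_DELAY_SECONDS < 0:
            raise ValueError("SCROLL_DELAY_SECONDS must be non-negative")
        if cls.PAGE_TIMEOUT_MS <= 0:
            raise ValueError("PAGE_TIMEOUT_MS must be positive")
        if urlparse(cls.DEFAULT_URL).scheme not in ('http', 'https'):
            raise ValueError("DEFAULT_URL must be a valid HTTP/HTTPS URL")
        if not cls.get_allowed_domains():
            raise ValueError("ALLOWED_DOMAINS cannot be empty")
```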

tests/test_main.py (new file, 202 lines)

@@ -0,0 +1,202 @@
"""Tests for main module functionality."""
import pytest
import sys
import tempfile
from unittest.mock import patch, MagicMock
from argparse import Namespace
from main import parse_arguments, setup_logging, scrape_and_generate_rss
from src.rss_scraper.exceptions import ValidationError, NetworkError, ParseError
class TestParseArguments:
"""Test command line argument parsing."""
def test_parse_arguments_defaults(self):
"""Test parsing with default arguments."""
with patch('sys.argv', ['main.py']):
args = parse_arguments()
assert args.url == 'https://www.warhammer-community.com/en-gb/'
assert args.output_dir is None
assert args.max_scroll == 5
assert args.log_level == 'INFO'
assert args.log_file == 'scraper.log'
def test_parse_arguments_custom_values(self):
"""Test parsing with custom argument values."""
test_args = [
'main.py',
'--url', 'https://example.com',
'--output-dir', '/custom/path',
'--max-scroll', '10',
'--log-level', 'DEBUG',
'--log-file', 'custom.log'
]
with patch('sys.argv', test_args):
args = parse_arguments()
assert args.url == 'https://example.com'
assert args.output_dir == '/custom/path'
assert args.max_scroll == 10
assert args.log_level == 'DEBUG'
assert args.log_file == 'custom.log'
def test_parse_arguments_invalid_max_scroll(self):
"""Test parsing fails with invalid max_scroll value."""
test_args = ['main.py', '--max-scroll', '-1']
with patch('sys.argv', test_args):
with pytest.raises(SystemExit):
parse_arguments()
def test_parse_arguments_relative_output_dir(self):
"""Test that relative output directory is converted to absolute."""
test_args = ['main.py', '--output-dir', 'relative/path']
with patch('sys.argv', test_args):
args = parse_arguments()
assert args.output_dir.startswith('/') # Should be absolute path
assert args.output_dir.endswith('relative/path')
class TestSetupLogging:
"""Test logging setup functionality."""
def test_setup_logging_info_level(self):
"""Test logging setup with INFO level."""
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
setup_logging('INFO', temp_file.name)
import logging
logger = logging.getLogger('test')
logger.info("Test message")
logger.debug("Debug message") # Should not appear
# Check that the log file was created and has correct level
assert logging.getLogger().level == logging.INFO
def test_setup_logging_debug_level(self):
"""Test logging setup with DEBUG level."""
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
setup_logging('DEBUG', temp_file.name)
import logging
assert logging.getLogger().level == logging.DEBUG
def test_setup_logging_clears_existing_handlers(self):
"""Test that setup_logging clears existing handlers."""
import logging
# Add a dummy handler
dummy_handler = logging.StreamHandler()
logging.getLogger().addHandler(dummy_handler)
initial_handler_count = len(logging.getLogger().handlers)
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
setup_logging('INFO', temp_file.name)
# Should have exactly 2 handlers (console + file)
assert len(logging.getLogger().handlers) == 2
class TestScrapeAndGenerateRss:
"""Test main scraping function."""
@patch('main.save_debug_html')
@patch('main.save_rss_feed')
@patch('main.generate_rss_feed')
@patch('main.extract_articles_from_html')
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_success(
self, mock_validate_url, mock_load_page, mock_extract_articles,
mock_generate_rss, mock_save_rss, mock_save_html
):
"""Test successful RSS scraping and generation."""
# Setup mocks
mock_validate_url.return_value = True
mock_load_page.return_value = "<html>test</html>"
mock_extract_articles.return_value = [
{'title': 'Test', 'link': 'http://example.com', 'date': 'date'}
]
mock_generate_rss.return_value = b"<rss>feed</rss>"
mock_save_rss.return_value = "/path/to/feed.xml"
url = "https://www.warhammer-community.com/en-gb/"
output_dir = "/test/output"
# Should not raise any exception
scrape_and_generate_rss(url, output_dir)
# Verify all functions were called
mock_validate_url.assert_called_once_with(url)
mock_load_page.assert_called_once_with(url)
mock_extract_articles.assert_called_once_with("<html>test</html>", url)
mock_generate_rss.assert_called_once()
mock_save_rss.assert_called_once()
mock_save_html.assert_called_once()
@patch('main.validate_url')
def test_scrape_and_generate_rss_validation_error(self, mock_validate_url):
"""Test scraping fails with validation error."""
mock_validate_url.side_effect = ValidationError("Invalid URL")
with pytest.raises(ValidationError):
scrape_and_generate_rss("invalid-url")
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_network_error(
self, mock_validate_url, mock_load_page
):
"""Test scraping fails with network error."""
mock_validate_url.return_value = True
mock_load_page.side_effect = NetworkError("Network error")
with pytest.raises(NetworkError):
scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")
@patch('main.extract_articles_from_html')
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_parse_error(
self, mock_validate_url, mock_load_page, mock_extract_articles
):
"""Test scraping fails with parse error."""
mock_validate_url.return_value = True
mock_load_page.return_value = "<html>test</html>"
mock_extract_articles.side_effect = ParseError("Parse error")
with pytest.raises(ParseError):
scrape_and_generate_rss("https://www.warhammer-community.com/en-gb/")
@patch('main.save_debug_html')
@patch('main.save_rss_feed')
@patch('main.generate_rss_feed')
@patch('main.extract_articles_from_html')
@patch('main.load_page_with_retry')
@patch('main.validate_url')
def test_scrape_and_generate_rss_default_output_dir(
self, mock_validate_url, mock_load_page, mock_extract_articles,
mock_generate_rss, mock_save_rss, mock_save_html
):
"""Test scraping uses default output directory when none provided."""
# Setup mocks
mock_validate_url.return_value = True
mock_load_page.return_value = "<html>test</html>"
mock_extract_articles.return_value = []
mock_generate_rss.return_value = b"<rss>feed</rss>"
mock_save_rss.return_value = "/path/to/feed.xml"
url = "https://www.warhammer-community.com/en-gb/"
# Call without output_dir
scrape_and_generate_rss(url)
# Verify functions were called (output_dir would be set to default)
mock_validate_url.assert_called_once_with(url)
mock_save_rss.assert_called_once_with(b"<rss>feed</rss>", ".") # Default output dir
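
These tests pin down the call sequence of scrape_and_generate_rss. A minimal sketch of the pipeline they imply follows; the mocked names are real module functions, but the body and the scraper module path are assumptions reconstructed from the assertions, not the committed main.py:

```python
# Hypothetical reconstruction of main.scrape_and_generate_rss from the mocks above.
from src.rss_scraper.config import Config
from src.rss_scraper.parser import extract_articles_from_html
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
from src.rss_scraper.scraper import load_page_with_retry  # module path assumed
from src.rss_scraper.validation import validate_url


def scrape_and_generate_rss(url, output_dir=None):
    validate_url(url)  # raises ValidationError for disallowed URLs
    if output_dir is None:
        output_dir = Config.get_output_dir()  # defaults to "."

    html = load_page_with_retry(url)  # raises NetworkError after retries
    articles = extract_articles_from_html(html, url)  # raises ParseError

    rss_bytes = generate_rss_feed(articles, url)
    save_rss_feed(rss_bytes, output_dir)
    save_debug_html(html, output_dir)
```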

tests/test_parser.py (new file, 208 lines)

@@ -0,0 +1,208 @@
"""Tests for parser module."""
import pytest
from datetime import datetime
import pytz
from unittest.mock import patch
from src.rss_scraper.parser import sanitize_text, extract_articles_from_html
from src.rss_scraper.exceptions import ParseError
from src.rss_scraper.config import Config
class TestSanitizeText:
"""Test text sanitization functionality."""
def test_sanitize_normal_text(self):
"""Test sanitization of normal text."""
text = "Normal article title"
result = sanitize_text(text)
assert result == "Normal article title"
def test_sanitize_none_text(self):
"""Test sanitization of None text."""
result = sanitize_text(None)
assert result == "No title"
def test_sanitize_empty_text(self):
"""Test sanitization of empty text."""
result = sanitize_text("")
assert result == "No title"
def test_sanitize_whitespace_text(self):
"""Test sanitization of whitespace-only text."""
result = sanitize_text(" ")
assert result == "No title"
def test_remove_dangerous_patterns(self):
"""Test removal of dangerous patterns."""
dangerous_text = "Title with <script>alert('xss')</script> content"
result = sanitize_text(dangerous_text)
assert "<script" not in result
assert "</script" not in result
assert "alert('xss')" in result # Only script tags should be removed
def test_length_limit(self):
"""Test that text is limited to max length."""
long_text = "a" * 1000
result = sanitize_text(long_text)
assert len(result) <= Config.MAX_TITLE_LENGTH
def test_case_insensitive_pattern_removal(self):
"""Test that dangerous patterns are removed case-insensitively."""
text = "Title with <SCRIPT>alert('xss')</SCRIPT> and javascript: protocol"
result = sanitize_text(text)
assert "<SCRIPT" not in result
assert "</SCRIPT" not in result
assert "javascript:" not in result
class TestExtractArticlesFromHtml:
"""Test article extraction from HTML."""
def test_extract_articles_valid_html(self):
"""Test extraction from valid HTML with articles."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Test Article 1</h3>
<a href="/article/test-1">Read more</a>
<time>01 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Test Article 2</h3>
<a href="https://www.warhammer-community.com/article/test-2">Read more</a>
<time>02 Jan 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 2
assert articles[0]['title'] == "Test Article 2" # Sorted by date, newest first
assert articles[1]['title'] == "Test Article 1"
assert "warhammer-community.com" in articles[0]['link']
assert "warhammer-community.com" in articles[1]['link']
def test_extract_articles_no_articles(self):
"""Test extraction from HTML with no articles."""
html = """
<html>
<body>
<div>No articles here</div>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 0
def test_extract_articles_duplicate_links(self):
"""Test that duplicate links are filtered out."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Test Article 1</h3>
<a href="/article/test-1">Read more</a>
<time>01 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Test Article 1 Duplicate</h3>
<a href="/article/test-1">Read more</a>
<time>02 Jan 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 1 # Duplicate should be filtered out
assert articles[0]['title'] == "Test Article 1"
def test_extract_articles_invalid_links(self):
"""Test handling of articles with invalid links."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Valid Article</h3>
<a href="/article/valid">Read more</a>
<time>01 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Invalid Article</h3>
<a href="https://malicious-site.com/article">Read more</a>
<time>02 Jan 24</time>
</article>
<article>
<h3 class="newsCard-title-sm">No Link Article</h3>
<time>03 Jan 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 1 # Only valid article should be included
assert articles[0]['title'] == "Valid Article"
def test_extract_articles_date_parsing(self):
"""Test parsing of various date formats."""
html = """
<html>
<body>
<article>
<h3 class="newsCard-title-sm">Article with good date</h3>
<a href="/article/1">Read more</a>
<time>15 Mar 24</time>
</article>
<article>
<h3 class="newsCard-title-lg">Article with bad date</h3>
<a href="/article/2">Read more</a>
<time>Invalid Date Format</time>
</article>
<article>
<h3 class="newsCard-title-sm">Article with reading time</h3>
<a href="/article/3">Read more</a>
<time>5 min read</time>
<time>20 Mar 24</time>
</article>
</body>
</html>
"""
base_url = "https://www.warhammer-community.com"
articles = extract_articles_from_html(html, base_url)
assert len(articles) == 3
# Check that dates are parsed correctly
for article in articles:
assert isinstance(article['date'], datetime)
assert article['date'].tzinfo is not None
def test_extract_articles_malformed_html(self):
"""Test handling of malformed HTML."""
malformed_html = "<html><body><article><h3>Unclosed tags"
base_url = "https://www.warhammer-community.com"
# Should not raise ParseError - BeautifulSoup handles malformed HTML gracefully
articles = extract_articles_from_html(malformed_html, base_url)
assert isinstance(articles, list)
def test_extract_articles_invalid_html(self):
"""Test handling of completely invalid HTML."""
with patch('bs4.BeautifulSoup', side_effect=Exception("Parser error")):
with pytest.raises(ParseError):
extract_articles_from_html("<html></html>", "https://example.com")
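
The sanitization tests above, together with the "pre-compiled regex patterns" bullet in the commit message, pin down a small contract for sanitize_text. A sketch consistent with those tests follows; the pattern list and structure are assumptions, and the committed src/rss_scraper/parser.py may differ:

```python
# Hypothetical sketch of sanitize_text consistent with the tests above.
import re

from src.rss_scraper.config import Config

# Compiled once at import time rather than on every call
_DANGEROUS_PATTERNS = [
    re.compile(r'</?script[^>]*>', re.IGNORECASE),  # strip script tags, keep inner text
    re.compile(r'javascript:', re.IGNORECASE),      # strip javascript: protocol
]


def sanitize_text(text):
    """Return a safe, length-limited title, or "No title" as a fallback."""
    if text is None or not text.strip():
        return "No title"
    for pattern in _DANGEROUS_PATTERNS:
        text = pattern.sub('', text)
    return text.strip()[:Config.MAX_TITLE_LENGTH]
```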

tests/test_rss_generator.py (new file, 162 lines)

@@ -0,0 +1,162 @@
"""Tests for RSS generator module."""
import pytest
import os
import tempfile
from datetime import datetime
import pytz
from unittest.mock import patch, mock_open
from src.rss_scraper.rss_generator import generate_rss_feed, save_rss_feed, save_debug_html
from src.rss_scraper.exceptions import FileOperationError
class TestGenerateRssFeed:
"""Test RSS feed generation functionality."""
def test_generate_rss_feed_with_articles(self):
"""Test RSS generation with valid articles."""
timezone = pytz.UTC
articles = [
{
'title': 'Test Article 1',
'link': 'https://example.com/article1',
'date': datetime(2024, 1, 1, tzinfo=timezone)
},
{
'title': 'Test Article 2',
'link': 'https://example.com/article2',
'date': datetime(2024, 1, 2, tzinfo=timezone)
}
]
feed_url = "https://example.com"
rss_content = generate_rss_feed(articles, feed_url)
assert isinstance(rss_content, bytes)
rss_str = rss_content.decode('utf-8')
assert 'Test Article 1' in rss_str
assert 'Test Article 2' in rss_str
assert 'https://example.com/article1' in rss_str
assert 'https://example.com/article2' in rss_str
assert '<?xml version=' in rss_str
assert '<rss version=' in rss_str
def test_generate_rss_feed_empty_articles(self):
"""Test RSS generation with empty articles list."""
articles = []
feed_url = "https://example.com"
rss_content = generate_rss_feed(articles, feed_url)
assert isinstance(rss_content, bytes)
rss_str = rss_content.decode('utf-8')
assert '<?xml version=' in rss_str
assert '<rss version=' in rss_str
# Should still contain feed metadata
assert 'Warhammer Community RSS Feed' in rss_str
def test_generate_rss_feed_unicode_content(self):
"""Test RSS generation with unicode content."""
timezone = pytz.UTC
articles = [
{
'title': 'Tëst Artìclé with Ūnïcödë',
'link': 'https://example.com/unicode',
'date': datetime(2024, 1, 1, tzinfo=timezone)
}
]
feed_url = "https://example.com"
rss_content = generate_rss_feed(articles, feed_url)
assert isinstance(rss_content, bytes)
rss_str = rss_content.decode('utf-8')
assert 'Tëst Artìclé with Ūnïcödë' in rss_str
class TestSaveRssFeed:
"""Test RSS feed saving functionality."""
def test_save_rss_feed_success(self):
"""Test successful RSS feed saving."""
rss_content = b'<?xml version="1.0"?><rss>test</rss>'
with tempfile.TemporaryDirectory() as temp_dir:
result_path = save_rss_feed(rss_content, temp_dir)
assert os.path.exists(result_path)
assert result_path.endswith('warhammer_rss_feed.xml')
with open(result_path, 'rb') as f:
saved_content = f.read()
assert saved_content == rss_content
def test_save_rss_feed_permission_error(self):
"""Test RSS feed saving with permission error."""
rss_content = b'<?xml version="1.0"?><rss>test</rss>'
with patch('builtins.open', side_effect=PermissionError("Permission denied")):
with pytest.raises(FileOperationError):
save_rss_feed(rss_content, "/some/path")
def test_save_rss_feed_creates_directory(self):
"""Test that RSS feed saving creates directory if needed."""
rss_content = b'<?xml version="1.0"?><rss>test</rss>'
with tempfile.TemporaryDirectory() as temp_dir:
new_subdir = os.path.join(temp_dir, "new_subdir")
result_path = save_rss_feed(rss_content, new_subdir)
assert os.path.exists(new_subdir)
assert os.path.exists(result_path)
class TestSaveDebugHtml:
"""Test debug HTML saving functionality."""
def test_save_debug_html_success(self):
"""Test successful debug HTML saving."""
html_content = "<html><body>Test content</body></html>"
with tempfile.TemporaryDirectory() as temp_dir:
save_debug_html(html_content, temp_dir)
html_path = os.path.join(temp_dir, "page.html")
assert os.path.exists(html_path)
with open(html_path, 'r', encoding='utf-8') as f:
saved_content = f.read()
# BeautifulSoup prettifies the content
assert "Test content" in saved_content
def test_save_debug_html_permission_error(self):
"""Test debug HTML saving with permission error (should not raise)."""
html_content = "<html><body>Test content</body></html>"
with patch('builtins.open', side_effect=PermissionError("Permission denied")):
# Should not raise exception, just log warning
save_debug_html(html_content, "/some/path")
def test_save_debug_html_malformed_content(self):
"""Test debug HTML saving with malformed HTML content."""
malformed_html = "<html><body>Unclosed tags"
with tempfile.TemporaryDirectory() as temp_dir:
# Should handle malformed HTML gracefully
save_debug_html(malformed_html, temp_dir)
html_path = os.path.join(temp_dir, "page.html")
assert os.path.exists(html_path)
def test_save_debug_html_creates_directory(self):
"""Test that debug HTML saving creates directory if needed."""
html_content = "<html><body>Test content</body></html>"
with tempfile.TemporaryDirectory() as temp_dir:
new_subdir = os.path.join(temp_dir, "new_subdir")
save_debug_html(html_content, new_subdir)
assert os.path.exists(new_subdir)
html_path = os.path.join(new_subdir, "page.html")
assert os.path.exists(html_path)
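
The generation tests only require that generate_rss_feed returns UTF-8 XML bytes containing the feed metadata and one item per article. A sketch of one way to satisfy that contract, assuming the feedgen library (the commit does not confirm which RSS library is used, and the committed module may build the XML differently):

```python
# Hypothetical sketch; assumes the feedgen package, which the commit does not confirm.
from feedgen.feed import FeedGenerator

from src.rss_scraper.config import Config


def generate_rss_feed(articles, feed_url):
    """Render articles as RSS 2.0 and return the document as bytes."""
    fg = FeedGenerator()
    fg.title(Config.FEED_TITLE)             # 'Warhammer Community RSS Feed'
    fg.link(href=feed_url)
    fg.description(Config.FEED_DESCRIPTION)

    for article in articles:
        fe = fg.add_entry()
        fe.title(article['title'])
        fe.link(href=article['link'])
        fe.pubDate(article['date'])  # must be timezone-aware

    return fg.rss_str(pretty=True)  # bytes, UTF-8 encoded
```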

tests/test_validation.py (new file, 170 lines)

@@ -0,0 +1,170 @@
"""Tests for validation module."""
import pytest
import os
import tempfile
from unittest.mock import patch
from src.rss_scraper.validation import validate_url, validate_output_path, validate_link
from src.rss_scraper.exceptions import ValidationError, FileOperationError
from src.rss_scraper.config import Config
class TestValidateUrl:
"""Test URL validation functionality."""
def test_valid_url(self):
"""Test validation of valid URLs."""
valid_urls = [
"https://www.warhammer-community.com/en-gb/",
"https://warhammer-community.com/some/path",
]
for url in valid_urls:
assert validate_url(url) is True
def test_invalid_url_format(self):
"""Test validation fails for invalid URL formats."""
invalid_urls = [
"not-a-url",
"ftp://example.com",
"",
"http://",
"https://",
]
for url in invalid_urls:
with pytest.raises(ValidationError):
validate_url(url)
def test_disallowed_domain(self):
"""Test validation fails for disallowed domains."""
disallowed_urls = [
"https://malicious-site.com",
"https://example.com",
"https://google.com",
]
for url in disallowed_urls:
with pytest.raises(ValidationError):
validate_url(url)
def test_case_insensitive_domain(self):
"""Test domain validation is case insensitive."""
urls = [
"https://WWW.WARHAMMER-COMMUNITY.COM",
"https://Warhammer-Community.com",
]
for url in urls:
assert validate_url(url) is True
class TestValidateOutputPath:
"""Test output path validation functionality."""
def test_valid_path_within_base(self):
"""Test validation of valid paths within base directory."""
with tempfile.TemporaryDirectory() as temp_dir:
test_path = os.path.join(temp_dir, "output.xml")
result = validate_output_path(test_path, temp_dir)
assert result == os.path.abspath(test_path)
def test_path_outside_base_directory(self):
"""Test validation fails for paths outside base directory."""
with tempfile.TemporaryDirectory() as temp_dir:
outside_path = "/tmp/malicious.xml"
with pytest.raises(ValidationError):
validate_output_path(outside_path, temp_dir)
def test_absolute_path_within_base_directory(self):
"""Test that absolute paths within base directory are allowed."""
with tempfile.TemporaryDirectory() as temp_dir:
# This should work - absolute path within the base directory
abs_path = os.path.join(temp_dir, "output.xml")
result = validate_output_path(abs_path, temp_dir)
assert result == os.path.abspath(abs_path)
def test_creates_directory_if_not_exists(self):
"""Test that validation creates directory if it doesn't exist."""
with tempfile.TemporaryDirectory() as temp_dir:
new_subdir = os.path.join(temp_dir, "new_subdir")
test_path = os.path.join(new_subdir, "output.xml")
result = validate_output_path(test_path, new_subdir)
assert os.path.exists(new_subdir)
assert result == os.path.abspath(test_path)
def test_directory_traversal_protection(self):
"""Test that directory traversal attacks are blocked."""
with tempfile.TemporaryDirectory() as temp_dir:
# These should be blocked - either by directory traversal check or outside-base check
traversal_paths = [
"../../../etc/passwd",
"subdir/../../../etc/passwd",
"normal/../../../dangerous.xml"
]
for path in traversal_paths:
with pytest.raises(ValidationError): # Either error type is acceptable
validate_output_path(path, temp_dir)
def test_permission_error(self):
"""Test handling of permission errors."""
with patch('os.makedirs', side_effect=PermissionError("Permission denied")):
with pytest.raises(FileOperationError):
validate_output_path("/some/path/file.xml", "/some/path")
class TestValidateLink:
"""Test link validation functionality."""
def test_valid_absolute_link(self):
"""Test validation of valid absolute links."""
base_url = "https://www.warhammer-community.com"
valid_link = "https://www.warhammer-community.com/article"
result = validate_link(valid_link, base_url)
assert result == valid_link
def test_valid_relative_link(self):
"""Test validation of valid relative links."""
base_url = "https://www.warhammer-community.com/en-gb/"
relative_link = "/article/some-article"
result = validate_link(relative_link, base_url)
assert result == "https://www.warhammer-community.com/article/some-article"
def test_none_link(self):
"""Test handling of None link."""
base_url = "https://www.warhammer-community.com"
result = validate_link(None, base_url)
assert result is None
def test_empty_link(self):
"""Test handling of empty link."""
base_url = "https://www.warhammer-community.com"
result = validate_link("", base_url)
assert result is None
def test_invalid_domain_link(self):
"""Test rejection of links from invalid domains."""
base_url = "https://www.warhammer-community.com"
invalid_link = "https://malicious-site.com/article"
result = validate_link(invalid_link, base_url)
assert result is None
def test_malformed_link(self):
"""Test handling of malformed links."""
base_url = "https://www.warhammer-community.com"
malformed_links = [
"not-a-url",
"://missing-scheme",
"https://",
]
for link in malformed_links:
result = validate_link(link, base_url)
assert result is None
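
Taken together, these cases pin down validate_link's contract: resolve root-relative links against the base URL, return None rather than raise for anything unusable, and enforce the domain allow-list. A sketch consistent with that contract (an inferred reconstruction, not the committed implementation):

```python
# Hypothetical sketch of validate_link, inferred from the tests above.
from urllib.parse import urljoin, urlparse

from src.rss_scraper.config import Config


def validate_link(link, base_url):
    """Resolve and allow-list a scraped link; return None if it is unusable."""
    if not link:
        return None
    try:
        # Only root-relative paths are resolved against the base URL;
        # anything else must already be an absolute http(s) URL.
        if link.startswith('/'):
            link = urljoin(base_url, link)
        parsed = urlparse(link)
        if parsed.scheme not in ('http', 'https') or not parsed.netloc:
            return None
        # Enforce the domain allow-list (hostnames are lowercased by urlparse)
        if parsed.hostname and parsed.hostname in Config.get_allowed_domains():
            return link
        return None
    except ValueError:
        return None
```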