feat: Complete v1.1.1 Phases 2 & 3 - Enhancements and Polish
Phase 2 - Enhancements:
- Add performance monitoring infrastructure with MetricsBuffer
- Implement three-tier health checks (/health, /health?detailed, /admin/health)
- Enhance search with FTS5 fallback and XSS-safe highlighting
- Add Unicode slug generation with timestamp fallback
- Expose database pool statistics via /admin/metrics
- Create missing error templates (400, 401, 403, 405, 503)

Phase 3 - Polish:
- Implement RSS streaming optimization (memory O(n) → O(1))
- Add admin metrics dashboard with htmx and Chart.js
- Fix flaky migration race condition tests
- Create comprehensive operational documentation
- Add upgrade guide and troubleshooting guide

Testing: 632 tests passing, zero flaky tests
Documentation: Complete operational guides
Security: All security reviews passed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
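A note on the Phase 3 RSS item: the O(n) → O(1) memory claim comes from emitting feed items one at a time instead of rendering the whole document in memory. A minimal sketch of that pattern, using standard Flask streaming and hypothetical helper names (iter_published_notes, render_item) rather than the project's actual functions:

    from flask import Response, stream_with_context

    def rss_feed():
        def generate():
            yield "<?xml version='1.0' encoding='UTF-8'?><rss version='2.0'><channel>"
            for note in iter_published_notes():   # assumed streaming iterator over notes
                yield render_item(note)           # assumed per-item XML renderer
            yield "</channel></rss>"
        # Response accepts a generator, so items are written as they are produced
        return Response(stream_with_context(generate()), mimetype="application/rss+xml")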
@@ -6,39 +6,72 @@ This module provides FTS5-based search capabilities for notes. It handles:
- FTS index population and maintenance
- Graceful degradation when FTS5 is unavailable

Per developer Q&A Q5:
- FTS5 detection at startup with caching
- Fallback to LIKE queries if FTS5 unavailable
- Same function signature for both implementations

Per developer Q&A Q13:
- Search highlighting with XSS prevention using markupsafe.escape()
- Whitelist only <mark> tags

The FTS index is maintained by application code (not SQL triggers) because
note content is stored in external files that SQLite cannot access.
"""

import sqlite3
import logging
import re
from pathlib import Path
from typing import Optional
from flask import current_app
from markupsafe import escape, Markup

logger = logging.getLogger(__name__)

# Module-level cache for FTS5 availability (per developer Q&A Q5)
_fts5_available: Optional[bool] = None
_fts5_check_done: bool = False


def check_fts5_support(db_path: Path) -> bool:
    """
    Check if SQLite was compiled with FTS5 support

    Per developer Q&A Q5:
    - Detection happens at startup with caching
    - Cached result used for all subsequent calls
    - Logs which implementation is active

    Args:
        db_path: Path to SQLite database

    Returns:
        bool: True if FTS5 is available, False otherwise
    """
    global _fts5_available, _fts5_check_done

    # Return cached result if already checked
    if _fts5_check_done:
        return _fts5_available

    try:
        conn = sqlite3.connect(db_path)
        # Try to create a test FTS5 table
        conn.execute("CREATE VIRTUAL TABLE IF NOT EXISTS _fts5_test USING fts5(content)")
        conn.execute("DROP TABLE IF EXISTS _fts5_test")
        conn.close()

        _fts5_available = True
        _fts5_check_done = True
        logger.info("FTS5 support detected - using FTS5 search implementation")
        return True

    except sqlite3.OperationalError as e:
        if "no such module" in str(e).lower():
            _fts5_available = False
            _fts5_check_done = True
            logger.warning(f"FTS5 not available in SQLite - using fallback LIKE search: {e}")
            return False
        raise

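# Illustrative sketch (not part of this diff): because the FTS5 check caches its
# result in module-level globals, tests that exercise both the FTS5 and fallback
# paths need to reset that cache between cases. Assuming the module lives at
# starpunk.search (a hypothetical path), a pytest fixture along these lines works:
#
#     import pytest
#     import starpunk.search as search
#
#     @pytest.fixture(autouse=True)
#     def reset_fts5_cache():
#         search._fts5_available = None
#         search._fts5_check_done = False
#         yield
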
@@ -173,7 +206,91 @@ def rebuild_fts_index(db_path: Path, data_dir: Path):
    conn.close()


def highlight_search_terms(text: str, query: str) -> str:
    """
    Highlight search terms in text with XSS prevention

    Per developer Q&A Q13:
    - Uses markupsafe.escape() to prevent XSS
    - Whitelist only <mark> tags for highlighting
    - Returns safe Markup object

    Args:
        text: Text to highlight terms in
        query: Search query (terms to highlight)

    Returns:
        HTML-safe string with highlighted terms
    """
    # Escape the text first to prevent XSS
    safe_text = escape(text)

    # Extract individual search terms (split on whitespace)
    terms = query.strip().split()

    # Highlight each term (case-insensitive)
    result = str(safe_text)
    for term in terms:
        if not term:
            continue

        # Escape special regex characters in the search term
        escaped_term = re.escape(term)

        # Wrap each case-insensitive match in <mark> tags
        pattern = re.compile(f"({escaped_term})", re.IGNORECASE)
        result = pattern.sub(r"<mark>\1</mark>", result)

    # Return as Markup to indicate it's safe HTML
    return Markup(result)

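# Illustrative example (not part of this diff) of the XSS-safe behavior above:
# raw HTML in the note text is escaped before the whitelisted <mark> tags are
# inserted, so only the highlight markup survives as HTML.
#
#     >>> highlight_search_terms("<script>alert(1)</script> needle", "needle")
#     Markup('&lt;script&gt;alert(1)&lt;/script&gt; <mark>needle</mark>')
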

def generate_snippet(content: str, query: str, max_length: int = 200) -> str:
    """
    Generate a search snippet from content

    Finds the first occurrence of a search term and extracts
    surrounding context.

    Args:
        content: Full content to extract snippet from
        query: Search query
        max_length: Maximum snippet length

    Returns:
        Snippet with highlighted search terms
    """
    # Find first occurrence of any search term
    terms = query.strip().lower().split()
    content_lower = content.lower()

    best_pos = -1
    for term in terms:
        pos = content_lower.find(term)
        if pos >= 0 and (best_pos < 0 or pos < best_pos):
            best_pos = pos

    if best_pos < 0:
        # No match found, return start of content
        start = 0
        end = min(len(content), max_length)
        snippet = content[:end]
    else:
        # Extract context around match
        start = max(0, best_pos - max_length // 2)
        end = min(len(content), start + max_length)
        snippet = content[start:end]

    # Add ellipsis if truncated
    if start > 0:
        snippet = "..." + snippet
    if end < len(content):
        snippet = snippet + "..."

    # Highlight search terms
    return highlight_search_terms(snippet, query)

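# Illustrative example (not part of this diff) of the snippet behavior above,
# with a short max_length for readability:
#
#     >>> generate_snippet("The quick brown fox jumps over the lazy dog", "lazy", max_length=20)
#     Markup('... over the <mark>lazy</mark> dog')
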

def search_notes_fts5(
    query: str,
    db_path: Path,
    published_only: bool = True,
@@ -181,7 +298,9 @@ def search_notes(
    offset: int = 0
) -> list[dict]:
    """
    Search notes using FTS5 full-text search

    Uses SQLite's FTS5 extension for fast, relevance-ranked search.

    Args:
        query: Search query (FTS5 query syntax supported)
@@ -234,7 +353,7 @@ def search_notes(
                'id': row['id'],
                'slug': row['slug'],
                'title': row['title'],
                'snippet': Markup(row['snippet']),  # FTS5 snippet is safe
                'relevance': row['relevance'],
                'published': bool(row['published']),
                'created_at': row['created_at'],
@@ -244,3 +363,159 @@ def search_notes(

    finally:
        conn.close()


def search_notes_fallback(
    query: str,
    db_path: Path,
    published_only: bool = True,
    limit: int = 50,
    offset: int = 0
) -> list[dict]:
    """
    Search notes using LIKE queries (fallback when FTS5 unavailable)

    Per developer Q&A Q5:
    - Same function signature as FTS5 search
    - Uses LIKE queries for basic search
    - No relevance ranking (ordered by creation date)

    Args:
        query: Search query (words separated by spaces)
        db_path: Path to SQLite database
        published_only: If True, only return published notes
        limit: Maximum number of results
        offset: Number of results to skip (for pagination)

    Returns:
        List of dicts with keys: id, slug, title, snippet, relevance,
        published, created_at (compatible with FTS5 search results)

    Raises:
        sqlite3.Error: If search fails
    """
    from starpunk.utils import read_note_file

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    try:
        # Build the base query; the searchable content lives in external
        # files, so file_path is selected here and the files are read below
        sql = """
            SELECT
                id,
                slug,
                file_path,
                published,
                created_at
            FROM notes
            WHERE deleted_at IS NULL
        """

        params = []

        if published_only:
            sql += " AND published = 1"

        # Add basic slug filtering (can match without loading files)
        terms = query.strip().split()
        if terms:
            # Search in slug
            sql += " AND ("
            term_conditions = []
            for term in terms:
                term_conditions.append("slug LIKE ?")
                params.append(f"%{term}%")
            sql += " OR ".join(term_conditions)
            sql += ")"

        sql += " ORDER BY created_at DESC LIMIT ? OFFSET ?"
        params.extend([limit * 3, offset])  # Get more results for content filtering

        cursor = conn.execute(sql, params)

        # Load content and filter/score results
        results = []
        data_dir = Path(db_path).parent

        for row in cursor:
            try:
                # Load content from file
                file_path = data_dir / row['file_path']
                content = read_note_file(file_path)

                # Check if query matches content (case-insensitive)
                content_lower = content.lower()
                query_lower = query.lower()
                matches = query_lower in content_lower

                if not matches:
                    # Check individual terms
                    matches = any(term.lower() in content_lower for term in terms)

                if matches:
                    # Extract title from first line
                    lines = content.split('\n', 1)
                    title = lines[0].strip() if lines else row['slug']
                    if title.startswith('#'):
                        title = title.lstrip('#').strip()

                    results.append({
                        'id': row['id'],
                        'slug': row['slug'],
                        'title': title,
                        'snippet': generate_snippet(content, query),
                        'relevance': 0.0,  # No ranking in fallback mode
                        'published': bool(row['published']),
                        'created_at': row['created_at'],
                    })

                    # Stop when we have enough results
                    if len(results) >= limit:
                        break

            except Exception as e:
                logger.warning(f"Error reading note {row['slug']}: {e}")
                continue

        return results

    finally:
        conn.close()


def search_notes(
    query: str,
    db_path: Path,
    published_only: bool = True,
    limit: int = 50,
    offset: int = 0
) -> list[dict]:
    """
    Search notes with automatic FTS5 detection and fallback

    Per developer Q&A Q5:
    - Detects FTS5 support at startup and caches result
    - Uses FTS5 if available, otherwise falls back to LIKE queries
    - Same function signature for both implementations

    Args:
        query: Search query
        db_path: Path to SQLite database
        published_only: If True, only return published notes
        limit: Maximum number of results
        offset: Number of results to skip (for pagination)

    Returns:
        List of dicts with keys: id, slug, title, snippet, relevance,
        published, created_at

    Raises:
        sqlite3.Error: If search fails
    """
    # Check FTS5 availability (uses cached result after first check)
    if check_fts5_support(db_path) and has_fts_table(db_path):
        return search_notes_fts5(query, db_path, published_only, limit, offset)
    else:
        return search_notes_fallback(query, db_path, published_only, limit, offset)
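
# Illustrative usage sketch (not part of this diff): the dispatcher above is
# called the same way regardless of which backend is active. The database path
# below is an assumption, not the project's configured location.
#
#     from pathlib import Path
#
#     results = search_notes("fts5 fallback", Path("data/starpunk.db"), limit=10)
#     for hit in results:
#         print(hit["slug"], hit["title"], hit["relevance"])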