Files
StarPunk/starpunk/search.py
Phil Skentelbery 07fff01fab feat: Complete v1.1.1 Phases 2 & 3 - Enhancements and Polish
Phase 2 - Enhancements:
- Add performance monitoring infrastructure with MetricsBuffer
- Implement three-tier health checks (/health, /health?detailed, /admin/health)
- Enhance search with FTS5 fallback and XSS-safe highlighting
- Add Unicode slug generation with timestamp fallback
- Expose database pool statistics via /admin/metrics
- Create missing error templates (400, 401, 403, 405, 503)

Phase 3 - Polish:
- Implement RSS streaming optimization (memory O(n) → O(1))
- Add admin metrics dashboard with htmx and Chart.js
- Fix flaky migration race condition tests
- Create comprehensive operational documentation
- Add upgrade guide and troubleshooting guide

Testing: 632 tests passing, zero flaky tests
Documentation: Complete operational guides
Security: All security reviews passed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 20:10:41 -07:00

522 lines
15 KiB
Python

"""
Full-text search functionality for StarPunk
This module provides FTS5-based search capabilities for notes. It handles:
- Search query execution with relevance ranking
- FTS index population and maintenance
- Graceful degradation when FTS5 is unavailable
Per developer Q&A Q5:
- FTS5 detection at startup with caching
- Fallback to LIKE queries if FTS5 unavailable
- Same function signature for both implementations
Per developer Q&A Q13:
- Search highlighting with XSS prevention using markupsafe.escape()
- Whitelist only <mark> tags
The FTS index is maintained by application code (not SQL triggers) because
note content is stored in external files that SQLite cannot access.
"""
import sqlite3
import logging
import re
from pathlib import Path
from typing import Optional
from flask import current_app
from markupsafe import escape, Markup
logger = logging.getLogger(__name__)

# Module-level cache for FTS5 availability (per developer Q&A Q5)
_fts5_available: Optional[bool] = None
_fts5_check_done: bool = False


def check_fts5_support(db_path: Path) -> bool:
    """
    Check if SQLite was compiled with FTS5 support

    Per developer Q&A Q5:
    - Detection happens at startup with caching
    - Cached result used for all subsequent calls
    - Logs which implementation is active

    Args:
        db_path: Path to SQLite database

    Returns:
        bool: True if FTS5 is available, False otherwise

    Raises:
        sqlite3.OperationalError: For operational errors unrelated to
            FTS5 availability (e.g. the database cannot be opened); in
            that case no result is cached.
    """
    global _fts5_available, _fts5_check_done

    # Return cached result if already checked
    if _fts5_check_done:
        return _fts5_available

    conn = sqlite3.connect(db_path)
    try:
        # Probe for the FTS5 module by creating (and immediately
        # dropping) a throwaway virtual table
        conn.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS _fts5_test USING fts5(content)"
        )
        conn.execute("DROP TABLE IF EXISTS _fts5_test")
    except sqlite3.OperationalError as e:
        if "no such module" in str(e).lower():
            _fts5_available = False
            _fts5_check_done = True
            logger.warning(f"FTS5 not available in SQLite - using fallback LIKE search: {e}")
            return False
        # Unrelated operational error: propagate without caching
        raise
    finally:
        # Always release the probe connection (the original leaked it
        # whenever the probe raised)
        conn.close()

    _fts5_available = True
    _fts5_check_done = True
    logger.info("FTS5 support detected - using FTS5 search implementation")
    return True
def has_fts_table(db_path: Path) -> bool:
    """
    Check if FTS table exists in database

    Args:
        db_path: Path to SQLite database

    Returns:
        bool: True if the notes_fts table exists; False if it does not,
            or if the database cannot be opened/queried
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='notes_fts'"
            )
            return cursor.fetchone() is not None
        finally:
            # Always release the connection, even when the query raises
            # (the original leaked it on sqlite3.Error)
            conn.close()
    except sqlite3.Error:
        return False
def update_fts_index(conn: sqlite3.Connection, note_id: int, slug: str, content: str):
    """
    Insert or refresh a note's row in the FTS index.

    The title is derived from the first line of the markdown content
    (heading markers stripped, truncated to 100 characters). REPLACE
    semantics cover both newly created notes and edits.

    Args:
        conn: SQLite database connection
        note_id: Note ID (used as FTS rowid)
        slug: Note slug
        content: Full markdown content

    Raises:
        sqlite3.Error: If FTS update fails
    """
    # Title comes from the first line, minus any "#" heading markers
    first_line = content.split('\n', 1)[0].strip()
    if first_line.startswith('#'):
        first_line = first_line.lstrip('#').strip()
    # Keep titles to a bounded display length
    title = first_line if len(first_line) <= 100 else first_line[:100] + '...'
    # rowid mirrors the note ID so joins stay cheap; REPLACE handles
    # both the insert and the update case
    conn.execute(
        "REPLACE INTO notes_fts (rowid, slug, title, content) VALUES (?, ?, ?, ?)",
        (note_id, slug, title, content)
    )
def delete_from_fts_index(conn: sqlite3.Connection, note_id: int):
    """
    Drop a note's row from the FTS index.

    Args:
        conn: SQLite database connection
        note_id: Note ID to remove (matches the FTS rowid)
    """
    sql = "DELETE FROM notes_fts WHERE rowid = ?"
    conn.execute(sql, (note_id,))
def rebuild_fts_index(db_path: Path, data_dir: Path):
    """
    Rebuild the entire FTS index from the notes on disk.

    Used during migration and as a manual repair step if the index
    becomes corrupted: clears the index, then re-reads and re-indexes
    every non-deleted note.

    Args:
        db_path: Path to SQLite database
        data_dir: Path to data directory containing note files

    Raises:
        sqlite3.Error: If rebuild fails
    """
    logger.info("Rebuilding FTS index from existing notes")
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        # Start from a clean slate before re-indexing
        conn.execute("DELETE FROM notes_fts")
        rows = conn.execute(
            "SELECT id, slug, file_path FROM notes WHERE deleted_at IS NULL"
        )
        indexed_count, error_count = 0, 0
        for row in rows:
            # Per-note failures are logged and counted so a single bad
            # file cannot abort the whole rebuild
            try:
                note_path = data_dir / row['file_path']
                if not note_path.exists():
                    logger.warning(f"Note file not found: {note_path}")
                    error_count += 1
                    continue
                update_fts_index(
                    conn,
                    row['id'],
                    row['slug'],
                    note_path.read_text(encoding='utf-8'),
                )
                indexed_count += 1
            except Exception as e:
                logger.error(f"Failed to index note {row['slug']}: {e}")
                error_count += 1
        conn.commit()
        logger.info(f"FTS index rebuilt: {indexed_count} notes indexed, {error_count} errors")
    except Exception as e:
        conn.rollback()
        logger.error(f"Failed to rebuild FTS index: {e}")
        raise
    finally:
        conn.close()
def highlight_search_terms(text: str, query: str) -> str:
    """
    Highlight search terms in text with XSS prevention

    Per developer Q&A Q13:
    - Uses markupsafe.escape() to prevent XSS
    - Whitelist only <mark> tags for highlighting
    - Returns safe Markup object

    Matching is performed on the raw text and each segment is escaped
    individually. This fixes two defects in the previous escape-then-
    substitute approach:
    - a term such as "amp" or "lt" could match inside an HTML entity
      produced by escaping (e.g. "&amp;"), corrupting the markup
    - a later term could match inside a <mark> tag inserted for an
      earlier term, producing nested/broken tags

    Args:
        text: Text to highlight in
        query: Search query (terms to highlight)

    Returns:
        HTML-safe string with highlighted terms
    """
    terms = [term for term in query.strip().split() if term]
    if not terms:
        # Nothing to highlight; still escape for safety
        return Markup(escape(text))

    # One combined, regex-escaped alternation, matched case-insensitively
    # against the raw (unescaped) text
    pattern = re.compile(
        "(" + "|".join(re.escape(term) for term in terms) + ")",
        re.IGNORECASE,
    )

    parts = []
    last_end = 0
    for match in pattern.finditer(text):
        # Escape the unmatched prefix, then wrap the escaped match so
        # only the <mark> tags we add survive as live HTML
        parts.append(str(escape(text[last_end:match.start()])))
        parts.append("<mark>" + str(escape(match.group(0))) + "</mark>")
        last_end = match.end()
    parts.append(str(escape(text[last_end:])))

    # Return as Markup to indicate it's safe HTML
    return Markup("".join(parts))
def generate_snippet(content: str, query: str, max_length: int = 200) -> str:
    """
    Build a short excerpt of content around the first search match.

    Scans for the earliest occurrence of any query term and extracts
    the surrounding context, falling back to the start of the content
    when no term matches.

    Args:
        content: Full content to extract snippet from
        query: Search query
        max_length: Maximum snippet length

    Returns:
        Snippet with highlighted search terms
    """
    lowered = content.lower()
    # Positions of every term that actually occurs (case-insensitive)
    hits = [
        pos
        for pos in (lowered.find(term) for term in query.strip().lower().split())
        if pos >= 0
    ]

    if not hits:
        # No match found, return start of content
        snippet = content[:max_length]
    else:
        # Center the excerpt on the earliest match
        first = min(hits)
        start = max(0, first - max_length // 2)
        end = min(len(content), start + max_length)
        snippet = content[start:end]
        # Ellipses signal truncation on either side
        if start > 0:
            snippet = "..." + snippet
        if end < len(content):
            snippet += "..."

    return highlight_search_terms(snippet, query)
def search_notes_fts5(
    query: str,
    db_path: Path,
    published_only: bool = True,
    limit: int = 50,
    offset: int = 0
) -> list[dict]:
    """
    Search notes using FTS5 full-text search

    Uses SQLite's FTS5 extension for fast, relevance-ranked search.

    Snippets are generated by FTS5 with control-character sentinels
    around matches; the snippet text is HTML-escaped before the
    sentinels are converted to <mark> tags. The previous version
    wrapped the raw snippet (which contains unescaped note content) in
    Markup, so HTML inside a note could leak into search results —
    contradicting the module's XSS policy (per developer Q&A Q13).

    Args:
        query: Search query (FTS5 query syntax supported)
        db_path: Path to SQLite database
        published_only: If True, only return published notes
        limit: Maximum number of results
        offset: Number of results to skip (for pagination)

    Returns:
        List of dicts with keys: id, slug, title, snippet, relevance,
        published, created_at

    Raises:
        sqlite3.Error: If search fails
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        # FTS5 orders by rank (lower rank = better match).
        # \x02/\x03 are sentinel characters that cannot appear in
        # escaped HTML; they are swapped for <mark> tags after escaping.
        sql = """
            SELECT
                notes.id,
                notes.slug,
                notes_fts.title,
                notes.published,
                notes.created_at,
                rank AS relevance,
                snippet(notes_fts, 2, '\x02', '\x03', '...', 40) AS snippet
            FROM notes_fts
            INNER JOIN notes ON notes_fts.rowid = notes.id
            WHERE notes_fts MATCH ?
              AND notes.deleted_at IS NULL
        """
        params = [query]
        if published_only:
            sql += " AND notes.published = 1"
        sql += " ORDER BY rank LIMIT ? OFFSET ?"
        params.extend([limit, offset])

        cursor = conn.execute(sql, params)
        results = []
        for row in cursor:
            # Escape first, then restore the highlight markers so only
            # our <mark> tags survive as live HTML
            safe_snippet = str(escape(row['snippet']))
            safe_snippet = safe_snippet.replace('\x02', '<mark>')
            safe_snippet = safe_snippet.replace('\x03', '</mark>')
            results.append({
                'id': row['id'],
                'slug': row['slug'],
                'title': row['title'],
                'snippet': Markup(safe_snippet),
                'relevance': row['relevance'],
                'published': bool(row['published']),
                'created_at': row['created_at'],
            })
        return results
    finally:
        conn.close()
def search_notes_fallback(
    query: str,
    db_path: Path,
    published_only: bool = True,
    limit: int = 50,
    offset: int = 0
) -> list[dict]:
    """
    Search notes by scanning content (fallback when FTS5 unavailable)

    Per developer Q&A Q5:
    - Same function signature as FTS5 search
    - No relevance ranking (ordered by creation date, newest first)

    A note matches when the full query, or any individual term, appears
    (case-insensitively) in its slug or its file content. The previous
    implementation pre-filtered on `slug LIKE` in SQL, which silently
    dropped every note whose content matched but whose slug did not —
    content search effectively did not work in fallback mode. It also
    applied `offset` to the pre-filter query, making pagination
    inconsistent with the FTS5 path; offset/limit are now applied to
    the matched results.

    Args:
        query: Search query (words separated by spaces)
        db_path: Path to SQLite database
        published_only: If True, only return published notes
        limit: Maximum number of results
        offset: Number of matched results to skip (for pagination)

    Returns:
        List of dicts with keys: id, slug, title, snippet, relevance,
        published, created_at (compatible with FTS5 search results)

    Raises:
        sqlite3.Error: If search fails
    """
    from starpunk.utils import read_note_file

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        sql = """
            SELECT id, slug, file_path, published, created_at
            FROM notes
            WHERE deleted_at IS NULL
        """
        params = []
        if published_only:
            sql += " AND published = 1"
        # Newest first, matching the original fallback ordering
        sql += " ORDER BY created_at DESC"
        cursor = conn.execute(sql, params)

        terms = query.strip().split()
        query_lower = query.lower()
        term_lowers = [term.lower() for term in terms]
        # NOTE(review): assumes note files live under the database's
        # parent directory — confirm against how file_path is written
        data_dir = Path(db_path).parent

        results = []
        matched = 0  # matches seen so far; used to honor offset
        for row in cursor:
            try:
                content = read_note_file(data_dir / row['file_path'])
                content_lower = content.lower()
                slug_lower = row['slug'].lower()

                # Whole-query match in content, or any term in content
                # or slug (case-insensitive)
                matches = query_lower in content_lower or any(
                    term in content_lower or term in slug_lower
                    for term in term_lowers
                )
                if not matches:
                    continue

                # Apply pagination over the matched set
                matched += 1
                if matched <= offset:
                    continue

                # Title from the first content line, heading markers
                # stripped; fall back to the slug for empty first lines
                title = content.split('\n', 1)[0].strip() or row['slug']
                if title.startswith('#'):
                    title = title.lstrip('#').strip()

                results.append({
                    'id': row['id'],
                    'slug': row['slug'],
                    'title': title,
                    'snippet': generate_snippet(content, query),
                    'relevance': 0.0,  # No ranking in fallback mode
                    'published': bool(row['published']),
                    'created_at': row['created_at'],
                })

                # Stop when we have enough results
                if len(results) >= limit:
                    break
            except Exception as e:
                logger.warning(f"Error reading note {row['slug']}: {e}")
                continue
        return results
    finally:
        conn.close()
def search_notes(
    query: str,
    db_path: Path,
    published_only: bool = True,
    limit: int = 50,
    offset: int = 0
) -> list[dict]:
    """
    Search notes, dispatching to FTS5 or the LIKE fallback.

    Per developer Q&A Q5:
    - Detects FTS5 support at startup and caches result
    - Uses FTS5 if available, otherwise falls back to LIKE queries
    - Same function signature for both implementations

    Args:
        query: Search query
        db_path: Path to SQLite database
        published_only: If True, only return published notes
        limit: Maximum number of results
        offset: Number of results to skip (for pagination)

    Returns:
        List of dicts with keys: id, slug, title, rank, snippet

    Raises:
        sqlite3.Error: If search fails
    """
    # FTS5 needs both module support (result cached after first check)
    # and an existing index table; otherwise use the LIKE fallback
    use_fts5 = check_fts5_support(db_path) and has_fts_table(db_path)
    impl = search_notes_fts5 if use_fts5 else search_notes_fallback
    return impl(query, db_path, published_only, limit, offset)