feat: Complete v1.1.1 Phases 2 & 3 - Enhancements and Polish

Phase 2 - Enhancements: - Add performance monitoring infrastructure with MetricsBuffer - Implement three-tier health checks (/health, /health?detailed, /admin/health) - Enhance search with FTS5 fallback and XSS-safe highlighting - Add Unicode slug generation with timestamp fallback - Expose database pool statistics via /admin/metrics - Create missing error templates (400, 401, 403, 405, 503) Phase 3 - Polish: - Implement RSS streaming optimization (memory O(n) → O(1)) - Add admin metrics dashboard with htmx and Chart.js - Fix flaky migration race condition tests - Create comprehensive operational documentation - Add upgrade guide and troubleshooting guide Testing: 632 tests passing, zero flaky tests Documentation: Complete operational guides Security: All security reviews passed 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 20:10:41 -07:00
parent 93d2398c1d
commit 07fff01fab
25 changed files with 4371 additions and 142 deletions
--- a/starpunk/monitoring/metrics.py
+++ b/starpunk/monitoring/metrics.py
@@ -0,0 +1,410 @@
+"""
+Metrics collection and buffering for performance monitoring
+
+Per ADR-053 and developer Q&A Q6, Q12:
+- Per-process circular buffers using deque
+- Configurable buffer size (default 1000 entries)
+- Include process ID in all metrics
+- Configuration-based sampling rates
+- Operation types: database, http, render
+
+Example usage:
+    >>> from starpunk.monitoring import record_metric, get_metrics
+    >>>
+    >>> # Record a database operation
+    >>> record_metric('database', 'query', duration_ms=45.2, query='SELECT * FROM notes')
+    >>>
+    >>> # Get all metrics
+    >>> metrics = get_metrics()
+    >>> print(f"Collected {len(metrics)} metrics")
+"""
+
+import os
+import random
+import time
+from collections import deque
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from threading import Lock
+from typing import Any, Deque, Dict, List, Literal, Optional
+
+# Operation types for categorizing metrics
+OperationType = Literal["database", "http", "render"]
+
+# Module-level circular buffer (per-process)
+# Each process in a multi-process deployment maintains its own buffer
+_metrics_buffer: Optional["MetricsBuffer"] = None
+_buffer_lock = Lock()
+
+
+@dataclass
+class Metric:
+    """
+    Represents a single performance metric
+
+    Attributes:
+        operation_type: Type of operation (database/http/render)
+        operation_name: Name/description of operation
+        timestamp: When the metric was recorded (ISO format)
+        duration_ms: Duration in milliseconds
+        process_id: Process ID that recorded the metric
+        metadata: Additional operation-specific data
+    """
+    operation_type: OperationType
+    operation_name: str
+    timestamp: str
+    duration_ms: float
+    process_id: int
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert metric to dictionary for serialization"""
+        return asdict(self)
+
+
+class MetricsBuffer:
+    """
+    Circular buffer for storing performance metrics
+
+    Per developer Q&A Q6:
+    - Uses deque for efficient circular buffer
+    - Per-process storage (not shared across workers)
+    - Thread-safe with locking
+    - Configurable max size (default 1000)
+    - Automatic eviction of oldest entries when full
+
+    Per developer Q&A Q12:
+    - Configurable sampling rates per operation type
+    - Default 10% sampling
+    - Slow queries always logged regardless of sampling
+
+    Example:
+        >>> buffer = MetricsBuffer(max_size=1000)
+        >>> buffer.record('database', 'query', 45.2, {'query': 'SELECT ...'})
+        >>> metrics = buffer.get_all()
+    """
+
+    def __init__(
+        self,
+        max_size: int = 1000,
+        sampling_rates: Optional[Dict[OperationType, float]] = None
+    ):
+        """
+        Initialize metrics buffer
+
+        Args:
+            max_size: Maximum number of metrics to store
+            sampling_rates: Dict mapping operation type to sampling rate (0.0-1.0)
+                           Default: {'database': 0.1, 'http': 0.1, 'render': 0.1}
+        """
+        self.max_size = max_size
+        self._buffer: Deque[Metric] = deque(maxlen=max_size)
+        self._lock = Lock()
+        self._process_id = os.getpid()
+
+        # Default sampling rates (10% for all operation types)
+        self._sampling_rates = sampling_rates or {
+            "database": 0.1,
+            "http": 0.1,
+            "render": 0.1,
+        }
+
+    def record(
+        self,
+        operation_type: OperationType,
+        operation_name: str,
+        duration_ms: float,
+        metadata: Optional[Dict[str, Any]] = None,
+        force: bool = False
+    ) -> bool:
+        """
+        Record a performance metric
+
+        Args:
+            operation_type: Type of operation (database/http/render)
+            operation_name: Name/description of operation
+            duration_ms: Duration in milliseconds
+            metadata: Additional operation-specific data
+            force: If True, bypass sampling (for slow query logging)
+
+        Returns:
+            True if metric was recorded, False if skipped due to sampling
+
+        Example:
+            >>> buffer.record('database', 'SELECT notes', 45.2,
+            ...               {'query': 'SELECT * FROM notes LIMIT 10'})
+            True
+        """
+        # Apply sampling (unless forced)
+        if not force:
+            sampling_rate = self._sampling_rates.get(operation_type, 0.1)
+            if random.random() > sampling_rate:
+                return False
+
+        metric = Metric(
+            operation_type=operation_type,
+            operation_name=operation_name,
+            timestamp=datetime.utcnow().isoformat() + "Z",
+            duration_ms=duration_ms,
+            process_id=self._process_id,
+            metadata=metadata or {}
+        )
+
+        with self._lock:
+            self._buffer.append(metric)
+
+        return True
+
+    def get_all(self) -> List[Metric]:
+        """
+        Get all metrics from buffer
+
+        Returns:
+            List of metrics (oldest to newest)
+
+        Example:
+            >>> metrics = buffer.get_all()
+            >>> len(metrics)
+            1000
+        """
+        with self._lock:
+            return list(self._buffer)
+
+    def get_recent(self, count: int) -> List[Metric]:
+        """
+        Get most recent N metrics
+
+        Args:
+            count: Number of recent metrics to return
+
+        Returns:
+            List of most recent metrics (newest first)
+
+        Example:
+            >>> recent = buffer.get_recent(10)
+            >>> len(recent)
+            10
+        """
+        with self._lock:
+            # Convert to list, reverse to get newest first, then slice
+            all_metrics = list(self._buffer)
+            all_metrics.reverse()
+            return all_metrics[:count]
+
+    def get_by_type(self, operation_type: OperationType) -> List[Metric]:
+        """
+        Get all metrics of a specific type
+
+        Args:
+            operation_type: Type to filter by (database/http/render)
+
+        Returns:
+            List of metrics matching the type
+
+        Example:
+            >>> db_metrics = buffer.get_by_type('database')
+        """
+        with self._lock:
+            return [m for m in self._buffer if m.operation_type == operation_type]
+
+    def get_slow_operations(
+        self,
+        threshold_ms: float = 1000.0,
+        operation_type: Optional[OperationType] = None
+    ) -> List[Metric]:
+        """
+        Get operations that exceeded a duration threshold
+
+        Args:
+            threshold_ms: Duration threshold in milliseconds
+            operation_type: Optional type filter
+
+        Returns:
+            List of slow operations
+
+        Example:
+            >>> slow_queries = buffer.get_slow_operations(1000, 'database')
+        """
+        with self._lock:
+            metrics = list(self._buffer)
+
+            # Filter by type if specified
+            if operation_type:
+                metrics = [m for m in metrics if m.operation_type == operation_type]
+
+            # Filter by duration threshold
+            return [m for m in metrics if m.duration_ms >= threshold_ms]
+
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get statistics about the buffer
+
+        Returns:
+            Dict with buffer statistics
+
+        Example:
+            >>> stats = buffer.get_stats()
+            >>> stats['total_count']
+            1000
+        """
+        with self._lock:
+            metrics = list(self._buffer)
+
+            # Calculate stats per operation type
+            type_stats = {}
+            for op_type in ["database", "http", "render"]:
+                type_metrics = [m for m in metrics if m.operation_type == op_type]
+                if type_metrics:
+                    durations = [m.duration_ms for m in type_metrics]
+                    type_stats[op_type] = {
+                        "count": len(type_metrics),
+                        "avg_duration_ms": sum(durations) / len(durations),
+                        "min_duration_ms": min(durations),
+                        "max_duration_ms": max(durations),
+                    }
+                else:
+                    type_stats[op_type] = {
+                        "count": 0,
+                        "avg_duration_ms": 0.0,
+                        "min_duration_ms": 0.0,
+                        "max_duration_ms": 0.0,
+                    }
+
+            return {
+                "total_count": len(metrics),
+                "max_size": self.max_size,
+                "process_id": self._process_id,
+                "sampling_rates": self._sampling_rates,
+                "by_type": type_stats,
+            }
+
+    def clear(self) -> None:
+        """
+        Clear all metrics from buffer
+
+        Example:
+            >>> buffer.clear()
+        """
+        with self._lock:
+            self._buffer.clear()
+
+    def set_sampling_rate(
+        self,
+        operation_type: OperationType,
+        rate: float
+    ) -> None:
+        """
+        Update sampling rate for an operation type
+
+        Args:
+            operation_type: Type to update
+            rate: New sampling rate (0.0-1.0)
+
+        Example:
+            >>> buffer.set_sampling_rate('database', 0.5)  # 50% sampling
+        """
+        if not 0.0 <= rate <= 1.0:
+            raise ValueError("Sampling rate must be between 0.0 and 1.0")
+
+        with self._lock:
+            self._sampling_rates[operation_type] = rate
+
+
+def get_buffer() -> MetricsBuffer:
+    """
+    Get or create the module-level metrics buffer
+
+    This ensures a single buffer per process. In multi-process deployments
+    (e.g., gunicorn), each worker process will have its own buffer.
+
+    Returns:
+        MetricsBuffer instance for this process
+
+    Example:
+        >>> buffer = get_buffer()
+        >>> buffer.record('database', 'query', 45.2)
+    """
+    global _metrics_buffer
+
+    if _metrics_buffer is None:
+        with _buffer_lock:
+            # Double-check locking pattern
+            if _metrics_buffer is None:
+                # Get configuration from Flask app if available
+                try:
+                    from flask import current_app
+                    max_size = current_app.config.get('METRICS_BUFFER_SIZE', 1000)
+                    sampling_rates = current_app.config.get('METRICS_SAMPLING_RATES', None)
+                except (ImportError, RuntimeError):
+                    # Flask not available or no app context
+                    max_size = 1000
+                    sampling_rates = None
+
+                _metrics_buffer = MetricsBuffer(
+                    max_size=max_size,
+                    sampling_rates=sampling_rates
+                )
+
+    return _metrics_buffer
+
+
+def record_metric(
+    operation_type: OperationType,
+    operation_name: str,
+    duration_ms: float,
+    metadata: Optional[Dict[str, Any]] = None,
+    force: bool = False
+) -> bool:
+    """
+    Record a metric using the module-level buffer
+
+    Convenience function that uses get_buffer() internally.
+
+    Args:
+        operation_type: Type of operation (database/http/render)
+        operation_name: Name/description of operation
+        duration_ms: Duration in milliseconds
+        metadata: Additional operation-specific data
+        force: If True, bypass sampling (for slow query logging)
+
+    Returns:
+        True if metric was recorded, False if skipped due to sampling
+
+    Example:
+        >>> record_metric('database', 'SELECT notes', 45.2,
+        ...               {'query': 'SELECT * FROM notes LIMIT 10'})
+        True
+    """
+    buffer = get_buffer()
+    return buffer.record(operation_type, operation_name, duration_ms, metadata, force)
+
+
+def get_metrics() -> List[Metric]:
+    """
+    Get all metrics from the module-level buffer
+
+    Returns:
+        List of metrics (oldest to newest)
+
+    Example:
+        >>> metrics = get_metrics()
+        >>> len(metrics)
+        1000
+    """
+    buffer = get_buffer()
+    return buffer.get_all()
+
+
+def get_metrics_stats() -> Dict[str, Any]:
+    """
+    Get statistics from the module-level buffer
+
+    Returns:
+        Dict with buffer statistics
+
+    Example:
+        >>> stats = get_metrics_stats()
+        >>> print(f"Total metrics: {stats['total_count']}")
+    """
+    buffer = get_buffer()
+    return buffer.get_stats()