# NOTE(review): this file originally began with a pasted commit message
# ("Phase 2 - Enhancements ... Phase 3 - Polish ...") and repository
# web-UI metadata ("411 lines", "12 KiB", "Python"), which is not valid
# Python and breaks import. Condensed to this comment; the change history
# belongs in version control, not in the module.
"""
|
|
Metrics collection and buffering for performance monitoring
|
|
|
|
Per ADR-053 and developer Q&A Q6, Q12:
|
|
- Per-process circular buffers using deque
|
|
- Configurable buffer size (default 1000 entries)
|
|
- Include process ID in all metrics
|
|
- Configuration-based sampling rates
|
|
- Operation types: database, http, render
|
|
|
|
Example usage:
|
|
>>> from starpunk.monitoring import record_metric, get_metrics
|
|
>>>
|
|
>>> # Record a database operation
|
|
>>> record_metric('database', 'query', duration_ms=45.2, query='SELECT * FROM notes')
|
|
>>>
|
|
>>> # Get all metrics
|
|
>>> metrics = get_metrics()
|
|
>>> print(f"Collected {len(metrics)} metrics")
|
|
"""
|
|
|
|
import os
import random
import time
from collections import deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from threading import Lock
from typing import Any, Deque, Dict, List, Literal, Optional
|
|
|
|
# Operation types for categorizing metrics (the only valid values for
# Metric.operation_type and the sampling-rate keys).
OperationType = Literal["database", "http", "render"]

# Module-level circular buffer (per-process).
# Each process in a multi-process deployment (e.g. gunicorn workers)
# maintains its own buffer; _buffer_lock guards only the lazy
# initialization in get_buffer(), not individual record operations.
_metrics_buffer: Optional["MetricsBuffer"] = None
_buffer_lock = Lock()
|
|
|
|
|
|
@dataclass
class Metric:
    """
    A single recorded performance measurement.

    Attributes:
        operation_type: Category of the operation (database/http/render)
        operation_name: Human-readable name of the operation
        timestamp: ISO-8601 UTC timestamp of when the metric was recorded
        duration_ms: How long the operation took, in milliseconds
        process_id: PID of the worker process that recorded the metric
        metadata: Free-form, operation-specific extra data
    """

    # Forward-ref string keeps this dataclass importable even when the
    # OperationType alias is defined elsewhere in the module.
    operation_type: "OperationType"
    operation_name: str
    timestamp: str
    duration_ms: float
    process_id: int
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this metric (including nested metadata) to a plain dict."""
        return asdict(self)
|
|
|
|
|
|
class MetricsBuffer:
    """
    Thread-safe circular buffer for storing performance metrics.

    Per developer Q&A Q6:
    - Uses deque(maxlen=...) for an efficient circular buffer
    - Per-process storage (not shared across workers)
    - Thread-safe with locking
    - Configurable max size (default 1000)
    - Automatic eviction of oldest entries when full

    Per developer Q&A Q12:
    - Configurable sampling rates per operation type
    - Default 10% sampling
    - Slow queries always logged regardless of sampling (force=True)

    Example:
        >>> buffer = MetricsBuffer(max_size=1000)
        >>> buffer.record('database', 'query', 45.2, {'query': 'SELECT ...'})
        >>> metrics = buffer.get_all()
    """

    def __init__(
        self,
        max_size: int = 1000,
        sampling_rates: Optional[Dict["OperationType", float]] = None
    ):
        """
        Initialize metrics buffer.

        Args:
            max_size: Maximum number of metrics to store before the
                oldest entries are evicted.
            sampling_rates: Dict mapping operation type to sampling rate
                (0.0-1.0). Default: {'database': 0.1, 'http': 0.1, 'render': 0.1}
        """
        self.max_size = max_size
        # deque with maxlen gives circular-buffer semantics: appending to
        # a full deque silently discards the oldest entry.
        self._buffer: Deque["Metric"] = deque(maxlen=max_size)
        self._lock = Lock()
        self._process_id = os.getpid()

        # Default sampling rates (10% for all operation types)
        self._sampling_rates = sampling_rates or {
            "database": 0.1,
            "http": 0.1,
            "render": 0.1,
        }

    def record(
        self,
        operation_type: "OperationType",
        operation_name: str,
        duration_ms: float,
        metadata: Optional[Dict[str, Any]] = None,
        force: bool = False
    ) -> bool:
        """
        Record a performance metric.

        Args:
            operation_type: Type of operation (database/http/render)
            operation_name: Name/description of operation
            duration_ms: Duration in milliseconds
            metadata: Additional operation-specific data
            force: If True, bypass sampling (for slow query logging)

        Returns:
            True if metric was recorded, False if skipped due to sampling

        Example:
            >>> buffer.record('database', 'SELECT notes', 45.2,
            ...               {'query': 'SELECT * FROM notes LIMIT 10'})
            True
        """
        # Apply sampling (unless forced)
        if not force:
            sampling_rate = self._sampling_rates.get(operation_type, 0.1)
            if random.random() > sampling_rate:
                return False

        # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated
        # (Python 3.12+) and returns a naive datetime; normalize the
        # "+00:00" suffix to "Z" to preserve the original wire format.
        timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

        metric = Metric(
            operation_type=operation_type,
            operation_name=operation_name,
            timestamp=timestamp,
            duration_ms=duration_ms,
            process_id=self._process_id,
            metadata=metadata or {}
        )

        # Hold the lock only for the append; the deque evicts the oldest
        # entry automatically when full.
        with self._lock:
            self._buffer.append(metric)

        return True

    def get_all(self) -> List["Metric"]:
        """
        Get all metrics from buffer.

        Returns:
            List of metrics (oldest to newest), a snapshot copy that is
            safe to use without holding the lock.
        """
        with self._lock:
            return list(self._buffer)

    def get_recent(self, count: int) -> List["Metric"]:
        """
        Get most recent N metrics.

        Args:
            count: Number of recent metrics to return

        Returns:
            List of most recent metrics (newest first); fewer than
            ``count`` entries if the buffer holds fewer.
        """
        with self._lock:
            # Snapshot, newest first, then take the requested prefix
            all_metrics = list(self._buffer)
            all_metrics.reverse()
            return all_metrics[:count]

    def get_by_type(self, operation_type: "OperationType") -> List["Metric"]:
        """
        Get all metrics of a specific type.

        Args:
            operation_type: Type to filter by (database/http/render)

        Returns:
            List of metrics matching the type

        Example:
            >>> db_metrics = buffer.get_by_type('database')
        """
        with self._lock:
            return [m for m in self._buffer if m.operation_type == operation_type]

    def get_slow_operations(
        self,
        threshold_ms: float = 1000.0,
        operation_type: Optional["OperationType"] = None
    ) -> List["Metric"]:
        """
        Get operations that exceeded a duration threshold.

        Args:
            threshold_ms: Duration threshold in milliseconds (inclusive)
            operation_type: Optional type filter

        Returns:
            List of slow operations

        Example:
            >>> slow_queries = buffer.get_slow_operations(1000, 'database')
        """
        with self._lock:
            metrics = list(self._buffer)

        # Filter outside the lock: we work on a snapshot, so recording
        # can continue concurrently.
        if operation_type:
            metrics = [m for m in metrics if m.operation_type == operation_type]

        return [m for m in metrics if m.duration_ms >= threshold_ms]

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the buffer.

        Returns:
            Dict with total_count, max_size, process_id, sampling_rates,
            and per-type count/avg/min/max durations under 'by_type'.
        """
        with self._lock:
            metrics = list(self._buffer)

        # Calculate stats per operation type on the snapshot
        type_stats = {}
        for op_type in ["database", "http", "render"]:
            durations = [m.duration_ms for m in metrics if m.operation_type == op_type]
            if durations:
                type_stats[op_type] = {
                    "count": len(durations),
                    "avg_duration_ms": sum(durations) / len(durations),
                    "min_duration_ms": min(durations),
                    "max_duration_ms": max(durations),
                }
            else:
                type_stats[op_type] = {
                    "count": 0,
                    "avg_duration_ms": 0.0,
                    "min_duration_ms": 0.0,
                    "max_duration_ms": 0.0,
                }

        return {
            "total_count": len(metrics),
            "max_size": self.max_size,
            "process_id": self._process_id,
            # Copy so callers cannot mutate internal sampling state
            "sampling_rates": dict(self._sampling_rates),
            "by_type": type_stats,
        }

    def clear(self) -> None:
        """Clear all metrics from the buffer (sampling rates are kept)."""
        with self._lock:
            self._buffer.clear()

    def set_sampling_rate(
        self,
        operation_type: "OperationType",
        rate: float
    ) -> None:
        """
        Update sampling rate for an operation type.

        Args:
            operation_type: Type to update
            rate: New sampling rate (0.0-1.0)

        Raises:
            ValueError: If rate is outside the [0.0, 1.0] range.

        Example:
            >>> buffer.set_sampling_rate('database', 0.5)  # 50% sampling
        """
        if not 0.0 <= rate <= 1.0:
            raise ValueError("Sampling rate must be between 0.0 and 1.0")

        with self._lock:
            self._sampling_rates[operation_type] = rate
|
|
|
|
|
|
def get_buffer() -> "MetricsBuffer":
    """
    Return the per-process metrics buffer, creating it on first use.

    Exactly one buffer exists per process; in multi-process deployments
    (e.g. gunicorn) each worker process gets its own independent buffer.

    Returns:
        MetricsBuffer instance for this process

    Example:
        >>> buffer = get_buffer()
        >>> buffer.record('database', 'query', 45.2)
    """
    global _metrics_buffer

    # Fast path: already initialized, no lock needed
    if _metrics_buffer is not None:
        return _metrics_buffer

    with _buffer_lock:
        # Double-checked locking: another thread may have initialized the
        # buffer between the unlocked check above and acquiring the lock.
        if _metrics_buffer is None:
            # Pull configuration from the Flask app when one is active
            try:
                from flask import current_app
                max_size = current_app.config.get('METRICS_BUFFER_SIZE', 1000)
                sampling_rates = current_app.config.get('METRICS_SAMPLING_RATES', None)
            except (ImportError, RuntimeError):
                # Flask not installed, or no application context: defaults
                max_size = 1000
                sampling_rates = None

            _metrics_buffer = MetricsBuffer(
                max_size=max_size,
                sampling_rates=sampling_rates,
            )

    return _metrics_buffer
|
|
|
|
|
|
def record_metric(
    operation_type: "OperationType",
    operation_name: str,
    duration_ms: float,
    metadata: Optional[Dict[str, Any]] = None,
    force: bool = False
) -> bool:
    """
    Record a metric using the per-process module-level buffer.

    Convenience wrapper around ``get_buffer().record(...)``.

    Args:
        operation_type: Type of operation (database/http/render)
        operation_name: Name/description of operation
        duration_ms: Duration in milliseconds
        metadata: Additional operation-specific data
        force: If True, bypass sampling (for slow query logging)

    Returns:
        True if metric was recorded, False if skipped due to sampling

    Example:
        >>> record_metric('database', 'SELECT notes', 45.2,
        ...               {'query': 'SELECT * FROM notes LIMIT 10'})
        True
    """
    return get_buffer().record(
        operation_type,
        operation_name,
        duration_ms,
        metadata=metadata,
        force=force,
    )
|
|
|
|
|
|
def get_metrics() -> "List[Metric]":
    """
    Return every metric currently held by the per-process buffer.

    Returns:
        List of metrics (oldest to newest)

    Example:
        >>> metrics = get_metrics()
        >>> len(metrics)
        1000
    """
    return get_buffer().get_all()
|
|
|
|
|
|
def get_metrics_stats() -> Dict[str, Any]:
    """
    Return summary statistics for the per-process buffer.

    Returns:
        Dict with buffer statistics (counts, durations per type, etc.)

    Example:
        >>> stats = get_metrics_stats()
        >>> print(f"Total metrics: {stats['total_count']}")
    """
    return get_buffer().get_stats()
|