feat: Complete v1.1.1 Phases 2 & 3 - Enhancements and Polish
Phase 2 - Enhancements: - Add performance monitoring infrastructure with MetricsBuffer - Implement three-tier health checks (/health, /health?detailed, /admin/health) - Enhance search with FTS5 fallback and XSS-safe highlighting - Add Unicode slug generation with timestamp fallback - Expose database pool statistics via /admin/metrics - Create missing error templates (400, 401, 403, 405, 503) Phase 3 - Polish: - Implement RSS streaming optimization (memory O(n) → O(1)) - Add admin metrics dashboard with htmx and Chart.js - Fix flaky migration race condition tests - Create comprehensive operational documentation - Add upgrade guide and troubleshooting guide Testing: 632 tests passing, zero flaky tests Documentation: Complete operational guides Security: All security reviews passed 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
410
starpunk/monitoring/metrics.py
Normal file
410
starpunk/monitoring/metrics.py
Normal file
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
Metrics collection and buffering for performance monitoring
|
||||
|
||||
Per ADR-053 and developer Q&A Q6, Q12:
|
||||
- Per-process circular buffers using deque
|
||||
- Configurable buffer size (default 1000 entries)
|
||||
- Include process ID in all metrics
|
||||
- Configuration-based sampling rates
|
||||
- Operation types: database, http, render
|
||||
|
||||
Example usage:
|
||||
>>> from starpunk.monitoring import record_metric, get_metrics
|
||||
>>>
|
||||
>>> # Record a database operation
|
||||
>>> record_metric('database', 'query', duration_ms=45.2, query='SELECT * FROM notes')
|
||||
>>>
|
||||
>>> # Get all metrics
|
||||
>>> metrics = get_metrics()
|
||||
>>> print(f"Collected {len(metrics)} metrics")
|
||||
"""
|
||||
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from collections import deque
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
from threading import Lock
|
||||
from typing import Any, Deque, Dict, List, Literal, Optional
|
||||
|
||||
# Operation types for categorizing metrics
|
||||
OperationType = Literal["database", "http", "render"]
|
||||
|
||||
# Module-level circular buffer (per-process)
|
||||
# Each process in a multi-process deployment maintains its own buffer
|
||||
_metrics_buffer: Optional["MetricsBuffer"] = None
|
||||
_buffer_lock = Lock()
|
||||
|
||||
|
||||
@dataclass
|
||||
class Metric:
|
||||
"""
|
||||
Represents a single performance metric
|
||||
|
||||
Attributes:
|
||||
operation_type: Type of operation (database/http/render)
|
||||
operation_name: Name/description of operation
|
||||
timestamp: When the metric was recorded (ISO format)
|
||||
duration_ms: Duration in milliseconds
|
||||
process_id: Process ID that recorded the metric
|
||||
metadata: Additional operation-specific data
|
||||
"""
|
||||
operation_type: OperationType
|
||||
operation_name: str
|
||||
timestamp: str
|
||||
duration_ms: float
|
||||
process_id: int
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert metric to dictionary for serialization"""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
class MetricsBuffer:
|
||||
"""
|
||||
Circular buffer for storing performance metrics
|
||||
|
||||
Per developer Q&A Q6:
|
||||
- Uses deque for efficient circular buffer
|
||||
- Per-process storage (not shared across workers)
|
||||
- Thread-safe with locking
|
||||
- Configurable max size (default 1000)
|
||||
- Automatic eviction of oldest entries when full
|
||||
|
||||
Per developer Q&A Q12:
|
||||
- Configurable sampling rates per operation type
|
||||
- Default 10% sampling
|
||||
- Slow queries always logged regardless of sampling
|
||||
|
||||
Example:
|
||||
>>> buffer = MetricsBuffer(max_size=1000)
|
||||
>>> buffer.record('database', 'query', 45.2, {'query': 'SELECT ...'})
|
||||
>>> metrics = buffer.get_all()
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_size: int = 1000,
|
||||
sampling_rates: Optional[Dict[OperationType, float]] = None
|
||||
):
|
||||
"""
|
||||
Initialize metrics buffer
|
||||
|
||||
Args:
|
||||
max_size: Maximum number of metrics to store
|
||||
sampling_rates: Dict mapping operation type to sampling rate (0.0-1.0)
|
||||
Default: {'database': 0.1, 'http': 0.1, 'render': 0.1}
|
||||
"""
|
||||
self.max_size = max_size
|
||||
self._buffer: Deque[Metric] = deque(maxlen=max_size)
|
||||
self._lock = Lock()
|
||||
self._process_id = os.getpid()
|
||||
|
||||
# Default sampling rates (10% for all operation types)
|
||||
self._sampling_rates = sampling_rates or {
|
||||
"database": 0.1,
|
||||
"http": 0.1,
|
||||
"render": 0.1,
|
||||
}
|
||||
|
||||
def record(
|
||||
self,
|
||||
operation_type: OperationType,
|
||||
operation_name: str,
|
||||
duration_ms: float,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
force: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Record a performance metric
|
||||
|
||||
Args:
|
||||
operation_type: Type of operation (database/http/render)
|
||||
operation_name: Name/description of operation
|
||||
duration_ms: Duration in milliseconds
|
||||
metadata: Additional operation-specific data
|
||||
force: If True, bypass sampling (for slow query logging)
|
||||
|
||||
Returns:
|
||||
True if metric was recorded, False if skipped due to sampling
|
||||
|
||||
Example:
|
||||
>>> buffer.record('database', 'SELECT notes', 45.2,
|
||||
... {'query': 'SELECT * FROM notes LIMIT 10'})
|
||||
True
|
||||
"""
|
||||
# Apply sampling (unless forced)
|
||||
if not force:
|
||||
sampling_rate = self._sampling_rates.get(operation_type, 0.1)
|
||||
if random.random() > sampling_rate:
|
||||
return False
|
||||
|
||||
metric = Metric(
|
||||
operation_type=operation_type,
|
||||
operation_name=operation_name,
|
||||
timestamp=datetime.utcnow().isoformat() + "Z",
|
||||
duration_ms=duration_ms,
|
||||
process_id=self._process_id,
|
||||
metadata=metadata or {}
|
||||
)
|
||||
|
||||
with self._lock:
|
||||
self._buffer.append(metric)
|
||||
|
||||
return True
|
||||
|
||||
def get_all(self) -> List[Metric]:
|
||||
"""
|
||||
Get all metrics from buffer
|
||||
|
||||
Returns:
|
||||
List of metrics (oldest to newest)
|
||||
|
||||
Example:
|
||||
>>> metrics = buffer.get_all()
|
||||
>>> len(metrics)
|
||||
1000
|
||||
"""
|
||||
with self._lock:
|
||||
return list(self._buffer)
|
||||
|
||||
def get_recent(self, count: int) -> List[Metric]:
|
||||
"""
|
||||
Get most recent N metrics
|
||||
|
||||
Args:
|
||||
count: Number of recent metrics to return
|
||||
|
||||
Returns:
|
||||
List of most recent metrics (newest first)
|
||||
|
||||
Example:
|
||||
>>> recent = buffer.get_recent(10)
|
||||
>>> len(recent)
|
||||
10
|
||||
"""
|
||||
with self._lock:
|
||||
# Convert to list, reverse to get newest first, then slice
|
||||
all_metrics = list(self._buffer)
|
||||
all_metrics.reverse()
|
||||
return all_metrics[:count]
|
||||
|
||||
def get_by_type(self, operation_type: OperationType) -> List[Metric]:
|
||||
"""
|
||||
Get all metrics of a specific type
|
||||
|
||||
Args:
|
||||
operation_type: Type to filter by (database/http/render)
|
||||
|
||||
Returns:
|
||||
List of metrics matching the type
|
||||
|
||||
Example:
|
||||
>>> db_metrics = buffer.get_by_type('database')
|
||||
"""
|
||||
with self._lock:
|
||||
return [m for m in self._buffer if m.operation_type == operation_type]
|
||||
|
||||
def get_slow_operations(
|
||||
self,
|
||||
threshold_ms: float = 1000.0,
|
||||
operation_type: Optional[OperationType] = None
|
||||
) -> List[Metric]:
|
||||
"""
|
||||
Get operations that exceeded a duration threshold
|
||||
|
||||
Args:
|
||||
threshold_ms: Duration threshold in milliseconds
|
||||
operation_type: Optional type filter
|
||||
|
||||
Returns:
|
||||
List of slow operations
|
||||
|
||||
Example:
|
||||
>>> slow_queries = buffer.get_slow_operations(1000, 'database')
|
||||
"""
|
||||
with self._lock:
|
||||
metrics = list(self._buffer)
|
||||
|
||||
# Filter by type if specified
|
||||
if operation_type:
|
||||
metrics = [m for m in metrics if m.operation_type == operation_type]
|
||||
|
||||
# Filter by duration threshold
|
||||
return [m for m in metrics if m.duration_ms >= threshold_ms]
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get statistics about the buffer
|
||||
|
||||
Returns:
|
||||
Dict with buffer statistics
|
||||
|
||||
Example:
|
||||
>>> stats = buffer.get_stats()
|
||||
>>> stats['total_count']
|
||||
1000
|
||||
"""
|
||||
with self._lock:
|
||||
metrics = list(self._buffer)
|
||||
|
||||
# Calculate stats per operation type
|
||||
type_stats = {}
|
||||
for op_type in ["database", "http", "render"]:
|
||||
type_metrics = [m for m in metrics if m.operation_type == op_type]
|
||||
if type_metrics:
|
||||
durations = [m.duration_ms for m in type_metrics]
|
||||
type_stats[op_type] = {
|
||||
"count": len(type_metrics),
|
||||
"avg_duration_ms": sum(durations) / len(durations),
|
||||
"min_duration_ms": min(durations),
|
||||
"max_duration_ms": max(durations),
|
||||
}
|
||||
else:
|
||||
type_stats[op_type] = {
|
||||
"count": 0,
|
||||
"avg_duration_ms": 0.0,
|
||||
"min_duration_ms": 0.0,
|
||||
"max_duration_ms": 0.0,
|
||||
}
|
||||
|
||||
return {
|
||||
"total_count": len(metrics),
|
||||
"max_size": self.max_size,
|
||||
"process_id": self._process_id,
|
||||
"sampling_rates": self._sampling_rates,
|
||||
"by_type": type_stats,
|
||||
}
|
||||
|
||||
def clear(self) -> None:
|
||||
"""
|
||||
Clear all metrics from buffer
|
||||
|
||||
Example:
|
||||
>>> buffer.clear()
|
||||
"""
|
||||
with self._lock:
|
||||
self._buffer.clear()
|
||||
|
||||
def set_sampling_rate(
|
||||
self,
|
||||
operation_type: OperationType,
|
||||
rate: float
|
||||
) -> None:
|
||||
"""
|
||||
Update sampling rate for an operation type
|
||||
|
||||
Args:
|
||||
operation_type: Type to update
|
||||
rate: New sampling rate (0.0-1.0)
|
||||
|
||||
Example:
|
||||
>>> buffer.set_sampling_rate('database', 0.5) # 50% sampling
|
||||
"""
|
||||
if not 0.0 <= rate <= 1.0:
|
||||
raise ValueError("Sampling rate must be between 0.0 and 1.0")
|
||||
|
||||
with self._lock:
|
||||
self._sampling_rates[operation_type] = rate
|
||||
|
||||
|
||||
def get_buffer() -> MetricsBuffer:
|
||||
"""
|
||||
Get or create the module-level metrics buffer
|
||||
|
||||
This ensures a single buffer per process. In multi-process deployments
|
||||
(e.g., gunicorn), each worker process will have its own buffer.
|
||||
|
||||
Returns:
|
||||
MetricsBuffer instance for this process
|
||||
|
||||
Example:
|
||||
>>> buffer = get_buffer()
|
||||
>>> buffer.record('database', 'query', 45.2)
|
||||
"""
|
||||
global _metrics_buffer
|
||||
|
||||
if _metrics_buffer is None:
|
||||
with _buffer_lock:
|
||||
# Double-check locking pattern
|
||||
if _metrics_buffer is None:
|
||||
# Get configuration from Flask app if available
|
||||
try:
|
||||
from flask import current_app
|
||||
max_size = current_app.config.get('METRICS_BUFFER_SIZE', 1000)
|
||||
sampling_rates = current_app.config.get('METRICS_SAMPLING_RATES', None)
|
||||
except (ImportError, RuntimeError):
|
||||
# Flask not available or no app context
|
||||
max_size = 1000
|
||||
sampling_rates = None
|
||||
|
||||
_metrics_buffer = MetricsBuffer(
|
||||
max_size=max_size,
|
||||
sampling_rates=sampling_rates
|
||||
)
|
||||
|
||||
return _metrics_buffer
|
||||
|
||||
|
||||
def record_metric(
|
||||
operation_type: OperationType,
|
||||
operation_name: str,
|
||||
duration_ms: float,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
force: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Record a metric using the module-level buffer
|
||||
|
||||
Convenience function that uses get_buffer() internally.
|
||||
|
||||
Args:
|
||||
operation_type: Type of operation (database/http/render)
|
||||
operation_name: Name/description of operation
|
||||
duration_ms: Duration in milliseconds
|
||||
metadata: Additional operation-specific data
|
||||
force: If True, bypass sampling (for slow query logging)
|
||||
|
||||
Returns:
|
||||
True if metric was recorded, False if skipped due to sampling
|
||||
|
||||
Example:
|
||||
>>> record_metric('database', 'SELECT notes', 45.2,
|
||||
... {'query': 'SELECT * FROM notes LIMIT 10'})
|
||||
True
|
||||
"""
|
||||
buffer = get_buffer()
|
||||
return buffer.record(operation_type, operation_name, duration_ms, metadata, force)
|
||||
|
||||
|
||||
def get_metrics() -> List[Metric]:
|
||||
"""
|
||||
Get all metrics from the module-level buffer
|
||||
|
||||
Returns:
|
||||
List of metrics (oldest to newest)
|
||||
|
||||
Example:
|
||||
>>> metrics = get_metrics()
|
||||
>>> len(metrics)
|
||||
1000
|
||||
"""
|
||||
buffer = get_buffer()
|
||||
return buffer.get_all()
|
||||
|
||||
|
||||
def get_metrics_stats() -> Dict[str, Any]:
|
||||
"""
|
||||
Get statistics from the module-level buffer
|
||||
|
||||
Returns:
|
||||
Dict with buffer statistics
|
||||
|
||||
Example:
|
||||
>>> stats = get_metrics_stats()
|
||||
>>> print(f"Total metrics: {stats['total_count']}")
|
||||
"""
|
||||
buffer = get_buffer()
|
||||
return buffer.get_stats()
|
||||
Reference in New Issue
Block a user