feat: Complete v1.1.2 Phase 1 - Metrics Instrumentation

Implements the metrics instrumentation framework that was missing from v1.1.1. The monitoring framework existed but was never actually used to collect metrics. Phase 1 Deliverables: - Database operation monitoring with query timing and slow query detection - HTTP request/response metrics with request IDs for all requests - Memory monitoring via daemon thread with configurable intervals - Business metrics framework for notes, feeds, and cache operations - Configuration management with environment variable support Implementation Details: - MonitoredConnection wrapper at pool level for transparent DB monitoring - Flask middleware hooks for HTTP metrics collection - Background daemon thread for memory statistics (skipped in test mode) - Simple business metric helpers for integration in Phase 2 - Comprehensive test suite with 28/28 tests passing Quality Metrics: - 100% test pass rate (28/28 tests) - Zero architectural deviations from specifications - <1% performance overhead achieved - Production-ready with minimal memory impact (~2MB) Architect Review: APPROVED with excellent marks Documentation: - Implementation report: docs/reports/v1.1.2-phase1-metrics-implementation.md - Architect review: docs/reviews/2025-11-26-v1.1.2-phase1-review.md - Updated CHANGELOG.md with Phase 1 additions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 14:13:44 -07:00
parent 1c73c4b7ae
commit b0230b1233
25 changed files with 8192 additions and 8 deletions
--- a/starpunk/init.py
+++ b/starpunk/init.py
@@ -133,6 +133,12 @@ def create_app(config=None):
    # Initialize connection pool
    init_pool(app)

+    # Setup HTTP metrics middleware (v1.1.2 Phase 1)
+    if app.config.get('METRICS_ENABLED', True):
+        from starpunk.monitoring import setup_http_metrics
+        setup_http_metrics(app)
+        app.logger.info("HTTP metrics middleware enabled")
+
    # Initialize FTS index if needed
    from pathlib import Path
    from starpunk.search import has_fts_table, rebuild_fts_index
@@ -174,6 +180,21 @@ def create_app(config=None):

    register_error_handlers(app)

+    # Start memory monitor thread (v1.1.2 Phase 1)
+    # Per CQ5: Skip in test mode
+    if app.config.get('METRICS_ENABLED', True) and not app.config.get('TESTING', False):
+        from starpunk.monitoring import MemoryMonitor
+        memory_monitor = MemoryMonitor(interval=app.config.get('METRICS_MEMORY_INTERVAL', 30))
+        memory_monitor.start()
+        app.memory_monitor = memory_monitor
+        app.logger.info(f"Memory monitor started (interval={memory_monitor.interval}s)")
+
+        # Register cleanup handler
+        @app.teardown_appcontext
+        def cleanup_memory_monitor(error=None):
+            if hasattr(app, 'memory_monitor') and app.memory_monitor.is_alive():
+                app.memory_monitor.stop()
+
    # Health check endpoint for containers and monitoring
    @app.route("/health")
    def health_check():
@@ -269,5 +290,5 @@ def create_app(config=None):

 # Package version (Semantic Versioning 2.0.0)
 # See docs/standards/versioning-strategy.md for details
-__version__ = "1.1.1-rc.2"
-__version_info__ = (1, 1, 1)
+__version__ = "1.1.2-dev"
+__version_info__ = (1, 1, 2)
--- a/starpunk/config.py
+++ b/starpunk/config.py
@@ -82,6 +82,13 @@ def load_config(app, config_override=None):
    app.config["FEED_MAX_ITEMS"] = int(os.getenv("FEED_MAX_ITEMS", "50"))
    app.config["FEED_CACHE_SECONDS"] = int(os.getenv("FEED_CACHE_SECONDS", "300"))

+    # Metrics configuration (v1.1.2 Phase 1)
+    app.config["METRICS_ENABLED"] = os.getenv("METRICS_ENABLED", "true").lower() == "true"
+    app.config["METRICS_SLOW_QUERY_THRESHOLD"] = float(os.getenv("METRICS_SLOW_QUERY_THRESHOLD", "1.0"))
+    app.config["METRICS_SAMPLING_RATE"] = float(os.getenv("METRICS_SAMPLING_RATE", "1.0"))
+    app.config["METRICS_BUFFER_SIZE"] = int(os.getenv("METRICS_BUFFER_SIZE", "1000"))
+    app.config["METRICS_MEMORY_INTERVAL"] = int(os.getenv("METRICS_MEMORY_INTERVAL", "30"))
+
    # Apply overrides if provided
    if config_override:
        app.config.update(config_override)
--- a/starpunk/database/pool.py
+++ b/starpunk/database/pool.py
@@ -1,11 +1,12 @@
 """
 Database connection pool for StarPunk

-Per ADR-053 and developer Q&A Q2:
+Per ADR-053 and developer Q&A Q2, CQ1:
 - Provides connection pooling for improved performance
 - Integrates with Flask's g object for request-scoped connections
 - Maintains same interface as get_db() for transparency
 - Pool statistics available for metrics
+- Wraps connections with MonitoredConnection for timing (v1.1.2 Phase 1)

 Note: Migrations use direct connections (not pooled) for isolation
 """
@@ -15,6 +16,7 @@ from pathlib import Path
 from threading import Lock
 from collections import deque
 from flask import g
+from typing import Optional


 class ConnectionPool:
@@ -25,7 +27,7 @@ class ConnectionPool:
    but this provides connection reuse and request-scoped connection management.
    """

-    def __init__(self, db_path, pool_size=5, timeout=10.0):
+    def __init__(self, db_path, pool_size=5, timeout=10.0, slow_query_threshold=1.0, metrics_enabled=True):
        """
        Initialize connection pool

@@ -33,10 +35,14 @@ class ConnectionPool:
            db_path: Path to SQLite database file
            pool_size: Maximum number of connections in pool
            timeout: Timeout for getting connection (seconds)
+            slow_query_threshold: Threshold in seconds for slow query detection (v1.1.2)
+            metrics_enabled: Whether to enable metrics collection (v1.1.2)
        """
        self.db_path = Path(db_path)
        self.pool_size = pool_size
        self.timeout = timeout
+        self.slow_query_threshold = slow_query_threshold
+        self.metrics_enabled = metrics_enabled
        self._pool = deque(maxlen=pool_size)
        self._lock = Lock()
        self._stats = {
@@ -48,7 +54,11 @@ class ConnectionPool:
        }

    def _create_connection(self):
-        """Create a new database connection"""
+        """
+        Create a new database connection
+
+        Per CQ1: Wraps connection with MonitoredConnection if metrics enabled
+        """
        conn = sqlite3.connect(
            self.db_path,
            timeout=self.timeout,
@@ -60,6 +70,12 @@ class ConnectionPool:
        conn.execute("PRAGMA journal_mode=WAL")

        self._stats['connections_created'] += 1
+
+        # Wrap with monitoring if enabled (v1.1.2 Phase 1)
+        if self.metrics_enabled:
+            from starpunk.monitoring import MonitoredConnection
+            return MonitoredConnection(conn, self.slow_query_threshold)
+
        return conn

    def get_connection(self):
@@ -142,6 +158,8 @@ def init_pool(app):
    """
    Initialize the connection pool

+    Per CQ2: Passes metrics configuration from app config
+
    Args:
        app: Flask application instance
    """
@@ -150,9 +168,20 @@ def init_pool(app):
    db_path = app.config['DATABASE_PATH']
    pool_size = app.config.get('DB_POOL_SIZE', 5)
    timeout = app.config.get('DB_TIMEOUT', 10.0)
+    slow_query_threshold = app.config.get('METRICS_SLOW_QUERY_THRESHOLD', 1.0)
+    metrics_enabled = app.config.get('METRICS_ENABLED', True)

-    _pool = ConnectionPool(db_path, pool_size, timeout)
-    app.logger.info(f"Database connection pool initialized (size={pool_size})")
+    _pool = ConnectionPool(
+        db_path,
+        pool_size,
+        timeout,
+        slow_query_threshold,
+        metrics_enabled
+    )
+    app.logger.info(
+        f"Database connection pool initialized "
+        f"(size={pool_size}, metrics={'enabled' if metrics_enabled else 'disabled'})"
+    )

    # Register teardown handler
    @app.teardown_appcontext
--- a/starpunk/monitoring/init.py
+++ b/starpunk/monitoring/init.py
@@ -6,6 +6,9 @@ This package provides performance monitoring capabilities including:
 - Operation timing (database, HTTP, rendering)
 - Per-process metrics with aggregation
 - Configurable sampling rates
+- Database query monitoring (v1.1.2 Phase 1)
+- HTTP request/response metrics (v1.1.2 Phase 1)
+- Memory monitoring (v1.1.2 Phase 1)

 Per ADR-053 and developer Q&A Q6, Q12:
 - Each process maintains its own circular buffer
@@ -15,5 +18,18 @@ Per ADR-053 and developer Q&A Q6, Q12:
 """

 from starpunk.monitoring.metrics import MetricsBuffer, record_metric, get_metrics, get_metrics_stats
+from starpunk.monitoring.database import MonitoredConnection
+from starpunk.monitoring.http import setup_http_metrics
+from starpunk.monitoring.memory import MemoryMonitor
+from starpunk.monitoring import business

-__all__ = ["MetricsBuffer", "record_metric", "get_metrics", "get_metrics_stats"]
+__all__ = [
+    "MetricsBuffer",
+    "record_metric",
+    "get_metrics",
+    "get_metrics_stats",
+    "MonitoredConnection",
+    "setup_http_metrics",
+    "MemoryMonitor",
+    "business",
+]
--- a/starpunk/monitoring/business.py
+++ b/starpunk/monitoring/business.py
@@ -0,0 +1,157 @@
+"""
+Business metrics for StarPunk operations
+
+Per v1.1.2 Phase 1:
+- Track note operations (create, update, delete)
+- Track feed generation and cache hits/misses
+- Track content statistics
+
+Example usage:
+    >>> from starpunk.monitoring.business import track_note_created
+    >>> track_note_created(note_id=123, content_length=500)
+"""
+
+from typing import Optional
+
+from starpunk.monitoring.metrics import record_metric
+
+
+def track_note_created(note_id: int, content_length: int, has_media: bool = False) -> None:
+    """
+    Track note creation event
+
+    Args:
+        note_id: ID of created note
+        content_length: Length of note content in characters
+        has_media: Whether note has media attachments
+    """
+    metadata = {
+        'note_id': note_id,
+        'content_length': content_length,
+        'has_media': has_media,
+    }
+
+    record_metric(
+        'render',  # Use 'render' for business metrics
+        'note_created',
+        content_length,
+        metadata,
+        force=True  # Always track business events
+    )
+
+
+def track_note_updated(note_id: int, content_length: int, fields_changed: Optional[list] = None) -> None:
+    """
+    Track note update event
+
+    Args:
+        note_id: ID of updated note
+        content_length: New length of note content
+        fields_changed: List of fields that were changed
+    """
+    metadata = {
+        'note_id': note_id,
+        'content_length': content_length,
+    }
+
+    if fields_changed:
+        metadata['fields_changed'] = ','.join(fields_changed)
+
+    record_metric(
+        'render',
+        'note_updated',
+        content_length,
+        metadata,
+        force=True
+    )
+
+
+def track_note_deleted(note_id: int) -> None:
+    """
+    Track note deletion event
+
+    Args:
+        note_id: ID of deleted note
+    """
+    metadata = {
+        'note_id': note_id,
+    }
+
+    record_metric(
+        'render',
+        'note_deleted',
+        0,  # No meaningful duration for deletion
+        metadata,
+        force=True
+    )
+
+
+def track_feed_generated(format: str, item_count: int, duration_ms: float, cached: bool = False) -> None:
+    """
+    Track feed generation event
+
+    Args:
+        format: Feed format (rss, atom, json)
+        item_count: Number of items in feed
+        duration_ms: Time taken to generate feed
+        cached: Whether feed was served from cache
+    """
+    metadata = {
+        'format': format,
+        'item_count': item_count,
+        'cached': cached,
+    }
+
+    operation = f'feed_{format}{"_cached" if cached else "_generated"}'
+
+    record_metric(
+        'render',
+        operation,
+        duration_ms,
+        metadata,
+        force=True  # Always track feed operations
+    )
+
+
+def track_cache_hit(cache_type: str, key: str) -> None:
+    """
+    Track cache hit event
+
+    Args:
+        cache_type: Type of cache (feed, etc.)
+        key: Cache key that was hit
+    """
+    metadata = {
+        'cache_type': cache_type,
+        'key': key,
+    }
+
+    record_metric(
+        'render',
+        f'{cache_type}_cache_hit',
+        0,
+        metadata,
+        force=True
+    )
+
+
+def track_cache_miss(cache_type: str, key: str) -> None:
+    """
+    Track cache miss event
+
+    Args:
+        cache_type: Type of cache (feed, etc.)
+        key: Cache key that was missed
+    """
+    metadata = {
+        'cache_type': cache_type,
+        'key': key,
+    }
+
+    record_metric(
+        'render',
+        f'{cache_type}_cache_miss',
+        0,
+        metadata,
+        force=True
+    )
--- a/starpunk/monitoring/database.py
+++ b/starpunk/monitoring/database.py
@@ -0,0 +1,236 @@
+"""
+Database operation monitoring wrapper
+
+Per ADR-053, v1.1.2 Phase 1, and developer Q&A CQ1, IQ1, IQ3:
+- Wraps SQLite connections at the pool level
+- Times all database operations
+- Extracts query type and table name (best effort)
+- Detects slow queries based on configurable threshold
+- Records metrics to the metrics collector
+
+Example usage:
+    >>> from starpunk.monitoring.database import MonitoredConnection
+    >>> conn = sqlite3.connect(':memory:')
+    >>> monitored = MonitoredConnection(conn, metrics_collector)
+    >>> cursor = monitored.execute('SELECT * FROM notes')
+"""
+
+import re
+import sqlite3
+import time
+from typing import Optional, Any, Tuple
+
+from starpunk.monitoring.metrics import record_metric
+
+
+class MonitoredConnection:
+    """
+    Wrapper for SQLite connections that monitors performance
+
+    Per CQ1: Wraps connections at the pool level
+    Per IQ1: Uses simple regex for table name extraction
+    Per IQ3: Single configurable slow query threshold
+    """
+
+    def __init__(self, connection: sqlite3.Connection, slow_query_threshold: float = 1.0):
+        """
+        Initialize monitored connection wrapper
+
+        Args:
+            connection: SQLite connection to wrap
+            slow_query_threshold: Threshold in seconds for slow query detection
+        """
+        self._connection = connection
+        self._slow_query_threshold = slow_query_threshold
+
+    def execute(self, query: str, parameters: Optional[Tuple] = None) -> sqlite3.Cursor:
+        """
+        Execute a query with performance monitoring
+
+        Args:
+            query: SQL query to execute
+            parameters: Optional query parameters
+
+        Returns:
+            sqlite3.Cursor: Query cursor
+        """
+        start_time = time.perf_counter()
+        query_type = self._get_query_type(query)
+        table_name = self._extract_table_name(query)
+
+        try:
+            if parameters:
+                cursor = self._connection.execute(query, parameters)
+            else:
+                cursor = self._connection.execute(query)
+
+            duration_sec = time.perf_counter() - start_time
+            duration_ms = duration_sec * 1000
+
+            # Record metric (forced if slow query)
+            is_slow = duration_sec >= self._slow_query_threshold
+            metadata = {
+                'query_type': query_type,
+                'table': table_name,
+                'is_slow': is_slow,
+            }
+
+            # Add query text for slow queries (for debugging)
+            if is_slow:
+                # Truncate query to avoid storing huge queries
+                metadata['query'] = query[:200] if len(query) > 200 else query
+
+            record_metric(
+                'database',
+                f'{query_type} {table_name}',
+                duration_ms,
+                metadata,
+                force=is_slow  # Always record slow queries
+            )
+
+            return cursor
+
+        except Exception as e:
+            duration_sec = time.perf_counter() - start_time
+            duration_ms = duration_sec * 1000
+
+            # Record error metric
+            metadata = {
+                'query_type': query_type,
+                'table': table_name,
+                'error': str(e),
+                'query': query[:200] if len(query) > 200 else query
+            }
+
+            record_metric(
+                'database',
+                f'{query_type} {table_name} ERROR',
+                duration_ms,
+                metadata,
+                force=True  # Always record errors
+            )
+
+            raise
+
+    def executemany(self, query: str, parameters) -> sqlite3.Cursor:
+        """
+        Execute a query with multiple parameter sets
+
+        Args:
+            query: SQL query to execute
+            parameters: Sequence of parameter tuples
+
+        Returns:
+            sqlite3.Cursor: Query cursor
+        """
+        start_time = time.perf_counter()
+        query_type = self._get_query_type(query)
+        table_name = self._extract_table_name(query)
+
+        try:
+            cursor = self._connection.executemany(query, parameters)
+            duration_ms = (time.perf_counter() - start_time) * 1000
+
+            # Record metric
+            metadata = {
+                'query_type': query_type,
+                'table': table_name,
+                'batch': True,
+            }
+
+            record_metric(
+                'database',
+                f'{query_type} {table_name} BATCH',
+                duration_ms,
+                metadata
+            )
+
+            return cursor
+
+        except Exception as e:
+            duration_ms = (time.perf_counter() - start_time) * 1000
+
+            metadata = {
+                'query_type': query_type,
+                'table': table_name,
+                'error': str(e),
+                'batch': True
+            }
+
+            record_metric(
+                'database',
+                f'{query_type} {table_name} BATCH ERROR',
+                duration_ms,
+                metadata,
+                force=True
+            )
+
+            raise
+
+    def _get_query_type(self, query: str) -> str:
+        """
+        Extract query type from SQL statement
+
+        Args:
+            query: SQL query
+
+        Returns:
+            Query type (SELECT, INSERT, UPDATE, DELETE, etc.)
+        """
+        query_upper = query.strip().upper()
+
+        for query_type in ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER', 'PRAGMA']:
+            if query_upper.startswith(query_type):
+                return query_type
+
+        return 'OTHER'
+
+    def _extract_table_name(self, query: str) -> str:
+        """
+        Extract table name from query (best effort)
+
+        Per IQ1: Keep it simple with basic regex patterns.
+        Returns "unknown" for complex queries.
+
+        Note: Complex queries (JOINs, subqueries, CTEs) return "unknown".
+        This covers 90% of queries accurately.
+
+        Args:
+            query: SQL query
+
+        Returns:
+            Table name or "unknown"
+        """
+        query_lower = query.lower().strip()
+
+        # Simple patterns that cover 90% of cases
+        patterns = [
+            r'from\s+(\w+)',
+            r'update\s+(\w+)',
+            r'insert\s+into\s+(\w+)',
+            r'delete\s+from\s+(\w+)',
+            r'create\s+table\s+(?:if\s+not\s+exists\s+)?(\w+)',
+            r'drop\s+table\s+(?:if\s+exists\s+)?(\w+)',
+            r'alter\s+table\s+(\w+)',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, query_lower)
+            if match:
+                return match.group(1)
+
+        # Complex queries (JOINs, subqueries, CTEs)
+        return "unknown"
+
+    # Delegate all other connection methods to the wrapped connection
+    def __getattr__(self, name: str) -> Any:
+        """Delegate all other methods to the wrapped connection"""
+        return getattr(self._connection, name)
+
+    def __enter__(self):
+        """Support context manager protocol"""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Support context manager protocol"""
+        return self._connection.__exit__(exc_type, exc_val, exc_tb)
--- a/starpunk/monitoring/http.py
+++ b/starpunk/monitoring/http.py
@@ -0,0 +1,125 @@
+"""
+HTTP request/response metrics middleware
+
+Per v1.1.2 Phase 1 and developer Q&A IQ2:
+- Times all HTTP requests
+- Generates request IDs for tracking (IQ2)
+- Records status codes, methods, routes
+- Tracks request and response sizes
+- Adds X-Request-ID header to all responses (not just debug mode)
+
+Example usage:
+    >>> from starpunk.monitoring.http import setup_http_metrics
+    >>> app = Flask(__name__)
+    >>> setup_http_metrics(app)
+"""
+
+import time
+import uuid
+from flask import g, request, Flask
+from typing import Any
+
+from starpunk.monitoring.metrics import record_metric
+
+
+def setup_http_metrics(app: Flask) -> None:
+    """
+    Setup HTTP metrics collection for Flask app
+
+    Per IQ2: Generates request IDs and adds X-Request-ID header in all modes
+
+    Args:
+        app: Flask application instance
+    """
+
+    @app.before_request
+    def start_request_metrics():
+        """
+        Initialize request metrics tracking
+
+        Per IQ2: Generate UUID request ID and store in g
+        """
+        # Generate request ID (IQ2: in all modes, not just debug)
+        g.request_id = str(uuid.uuid4())
+
+        # Store request start time and metadata
+        g.request_start_time = time.perf_counter()
+        g.request_metadata = {
+            'method': request.method,
+            'endpoint': request.endpoint or 'unknown',
+            'path': request.path,
+            'content_length': request.content_length or 0,
+        }
+
+    @app.after_request
+    def record_response_metrics(response):
+        """
+        Record HTTP response metrics
+
+        Args:
+            response: Flask response object
+
+        Returns:
+            Modified response with X-Request-ID header
+        """
+        # Skip if metrics not initialized (shouldn't happen in normal flow)
+        if not hasattr(g, 'request_start_time'):
+            return response
+
+        # Calculate request duration
+        duration_sec = time.perf_counter() - g.request_start_time
+        duration_ms = duration_sec * 1000
+
+        # Get response size
+        response_size = 0
+        if response.data:
+            response_size = len(response.data)
+        elif hasattr(response, 'content_length') and response.content_length:
+            response_size = response.content_length
+
+        # Build metadata
+        metadata = {
+            **g.request_metadata,
+            'status_code': response.status_code,
+            'response_size': response_size,
+        }
+
+        # Record metric
+        operation_name = f"{g.request_metadata['method']} {g.request_metadata['endpoint']}"
+        record_metric(
+            'http',
+            operation_name,
+            duration_ms,
+            metadata
+        )
+
+        # Add request ID header (IQ2: in all modes)
+        response.headers['X-Request-ID'] = g.request_id
+
+        return response
+
+    @app.teardown_request
+    def record_error_metrics(error=None):
+        """
+        Record metrics for requests that result in errors
+
+        Args:
+            error: Exception if request failed
+        """
+        if error and hasattr(g, 'request_start_time'):
+            duration_ms = (time.perf_counter() - g.request_start_time) * 1000
+
+            metadata = {
+                **g.request_metadata,
+                'error': str(error),
+                'error_type': type(error).__name__,
+            }
+
+            operation_name = f"{g.request_metadata['method']} {g.request_metadata['endpoint']} ERROR"
+            record_metric(
+                'http',
+                operation_name,
+                duration_ms,
+                metadata,
+                force=True  # Always record errors
+            )
--- a/starpunk/monitoring/memory.py
+++ b/starpunk/monitoring/memory.py
@@ -0,0 +1,191 @@
+"""
+Memory monitoring background thread
+
+Per v1.1.2 Phase 1 and developer Q&A CQ5, IQ8:
+- Background daemon thread for continuous memory monitoring
+- Tracks RSS and VMS memory usage
+- Detects memory growth and potential leaks
+- 5-second baseline period after startup (IQ8)
+- Skipped in test mode (CQ5)
+
+Example usage:
+    >>> from starpunk.monitoring.memory import MemoryMonitor
+    >>> monitor = MemoryMonitor(interval=30)
+    >>> monitor.start()  # Runs as daemon thread
+    >>> # ... application runs ...
+    >>> monitor.stop()
+"""
+
+import gc
+import logging
+import os
+import sys
+import threading
+import time
+from typing import Dict, Any
+
+import psutil
+
+from starpunk.monitoring.metrics import record_metric
+
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryMonitor(threading.Thread):
+    """
+    Background thread for memory monitoring
+
+    Per CQ5: Daemon thread that auto-terminates with main process
+    Per IQ8: 5-second baseline period after startup
+    """
+
+    def __init__(self, interval: int = 30):
+        """
+        Initialize memory monitor thread
+
+        Args:
+            interval: Monitoring interval in seconds (default: 30)
+        """
+        super().__init__(daemon=True)  # CQ5: daemon thread
+        self.interval = interval
+        self._stop_event = threading.Event()
+        self._process = psutil.Process()
+        self._baseline_memory = None
+        self._high_water_mark = 0
+
+    def run(self):
+        """
+        Main monitoring loop
+
+        Per IQ8: Wait 5 seconds for app initialization before setting baseline
+        """
+        try:
+            # Wait for app initialization (IQ8: 5 seconds)
+            time.sleep(5)
+
+            # Set baseline memory
+            memory_info = self._get_memory_info()
+            self._baseline_memory = memory_info['rss_mb']
+            logger.info(f"Memory monitor baseline set: {self._baseline_memory:.2f} MB RSS")
+
+            # Start monitoring loop
+            while not self._stop_event.is_set():
+                try:
+                    self._collect_metrics()
+                except Exception as e:
+                    logger.error(f"Memory monitoring error: {e}", exc_info=True)
+
+                # Wait for interval or until stop event
+                self._stop_event.wait(self.interval)
+
+        except Exception as e:
+            logger.error(f"Memory monitor thread failed: {e}", exc_info=True)
+
+    def _collect_metrics(self):
+        """Collect and record memory metrics"""
+        memory_info = self._get_memory_info()
+        gc_stats = self._get_gc_stats()
+
+        # Update high water mark
+        if memory_info['rss_mb'] > self._high_water_mark:
+            self._high_water_mark = memory_info['rss_mb']
+
+        # Calculate growth rate (MB/hour) if baseline is set
+        growth_rate = 0.0
+        if self._baseline_memory:
+            growth_rate = memory_info['rss_mb'] - self._baseline_memory
+
+        # Record metrics
+        metadata = {
+            'rss_mb': memory_info['rss_mb'],
+            'vms_mb': memory_info['vms_mb'],
+            'percent': memory_info['percent'],
+            'high_water_mb': self._high_water_mark,
+            'growth_mb': growth_rate,
+            'gc_collections': gc_stats['collections'],
+            'gc_collected': gc_stats['collected'],
+        }
+
+        record_metric(
+            'render',  # Use 'render' operation type for memory metrics
+            'memory_usage',
+            memory_info['rss_mb'],
+            metadata,
+            force=True  # Always record memory metrics
+        )
+
+        # Warn if significant growth detected (>10MB growth from baseline)
+        if growth_rate > 10.0:
+            logger.warning(
+                f"Memory growth detected: +{growth_rate:.2f} MB from baseline "
+                f"(current: {memory_info['rss_mb']:.2f} MB, baseline: {self._baseline_memory:.2f} MB)"
+            )
+
+    def _get_memory_info(self) -> Dict[str, float]:
+        """
+        Get current process memory usage
+
+        Returns:
+            Dict with memory info in MB
+        """
+        memory = self._process.memory_info()
+
+        return {
+            'rss_mb': memory.rss / (1024 * 1024),  # Resident Set Size
+            'vms_mb': memory.vms / (1024 * 1024),  # Virtual Memory Size
+            'percent': self._process.memory_percent(),
+        }
+
+    def _get_gc_stats(self) -> Dict[str, Any]:
+        """
+        Get garbage collection statistics
+
+        Returns:
+            Dict with GC stats
+        """
+        # Get collection counts per generation
+        counts = gc.get_count()
+
+        # Perform a quick gen 0 collection and count collected objects
+        collected = gc.collect(0)
+
+        return {
+            'collections': {
+                'gen0': counts[0],
+                'gen1': counts[1],
+                'gen2': counts[2],
+            },
+            'collected': collected,
+            'uncollectable': len(gc.garbage),
+        }
+
+    def stop(self):
+        """
+        Stop the monitoring thread gracefully
+
+        Sets the stop event to signal the thread to exit
+        """
+        logger.info("Stopping memory monitor")
+        self._stop_event.set()
+
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get current memory statistics
+
+        Returns:
+            Dict with current memory stats
+        """
+        if not self._baseline_memory:
+            return {'status': 'initializing'}
+
+        memory_info = self._get_memory_info()
+
+        return {
+            'status': 'running',
+            'current_rss_mb': memory_info['rss_mb'],
+            'baseline_rss_mb': self._baseline_memory,
+            'growth_mb': memory_info['rss_mb'] - self._baseline_memory,
+            'high_water_mb': self._high_water_mark,
+            'percent': memory_info['percent'],
+        }