StarPunk/starpunk/migrations.py
Phil Skentelbery 8352c3ab7c refactor: Rename SCHEMA_SQL to INITIAL_SCHEMA_SQL
This aligns with ADR-033's migration system redesign. The initial schema
represents the v1.0.0 baseline and should not be modified. All schema
changes after v1.0.0 must go in migration files.

Changes:
- Renamed SCHEMA_SQL → INITIAL_SCHEMA_SQL in database.py
- Updated all references in migrations.py comments
- Added comment: "DO NOT MODIFY - This represents the v1.0.0 schema state"
- No functional changes, purely documentation improvement

Part of v1.1.0 migration system redesign (Phase 2).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 09:59:17 -07:00


"""
Database migration runner for StarPunk
Automatically discovers and applies pending migrations on startup.
Migrations are numbered SQL files in the migrations/ directory.
Fresh Database Detection:
- If schema_migrations table is empty AND schema is current
- Marks all migrations as applied (skip execution)
- This handles databases created with current INITIAL_SCHEMA_SQL
Existing Database Behavior:
- Applies only pending migrations
- Migrations already in schema_migrations are skipped
Concurrency Protection:
- Uses database-level locking (BEGIN IMMEDIATE) to prevent race conditions
- Multiple workers can start simultaneously; only one applies migrations
- Other workers wait and verify completion using exponential backoff retry
"""

import logging
import random
import sqlite3
import time
from pathlib import Path


class MigrationError(Exception):
    """Raised when a migration fails to apply"""


def create_migrations_table(conn):
    """
    Create schema_migrations tracking table if it doesn't exist

    Args:
        conn: SQLite connection
    """
    conn.execute("""
        CREATE TABLE IF NOT EXISTS schema_migrations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            migration_name TEXT UNIQUE NOT NULL,
            applied_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.execute("""
        CREATE INDEX IF NOT EXISTS idx_schema_migrations_name
        ON schema_migrations(migration_name)
    """)
    conn.commit()
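

# Illustrative sketch (added for exposition; `_demo_tracking_table` is a
# hypothetical name, not part of the module): CREATE TABLE IF NOT EXISTS
# makes create_migrations_table() safe to call on every startup.
def _demo_tracking_table():
    conn = sqlite3.connect(":memory:")
    create_migrations_table(conn)
    create_migrations_table(conn)  # second call is a harmless no-op
    assert table_exists(conn, "schema_migrations")
    conn.close()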


def is_schema_current(conn):
    """
    Check if database schema is current (matches INITIAL_SCHEMA_SQL + all migrations)

    Uses a heuristic: check for the presence of the latest schema features.

    Checks for:
    - code_verifier column NOT in auth_state (removed in migration 003)
    - authorization_codes table (migration 002 or INITIAL_SCHEMA_SQL >= v1.0.0-rc.1)
    - token_hash column in tokens table (migration 002)
    - Token indexes (migration 002 only, removed from INITIAL_SCHEMA_SQL in v1.0.0-rc.2)

    Args:
        conn: SQLite connection

    Returns:
        bool: True if schema is fully current (all tables, columns, AND indexes exist);
        False if any piece is missing (a legacy database needing migrations)
    """
    try:
        # Check for code_verifier column NOT in auth_state (removed in
        # migration 003). If it still exists, the schema is outdated.
        if column_exists(conn, 'auth_state', 'code_verifier'):
            return False

        # Check for authorization_codes table (added in migration 002)
        if not table_exists(conn, 'authorization_codes'):
            return False

        # Check for token_hash column in tokens table (migration 002)
        if not column_exists(conn, 'tokens', 'token_hash'):
            return False

        # Check for token indexes (created by migration 002 ONLY).
        # These indexes were removed from INITIAL_SCHEMA_SQL in v1.0.0-rc.2
        # to prevent conflicts when migrations run.
        # A database with tables/columns but no indexes means:
        # - INITIAL_SCHEMA_SQL was run (creating tables/columns)
        # - But migration 002 hasn't run yet (no indexes)
        # So it's NOT fully current and needs migrations.
        if not index_exists(conn, 'idx_tokens_hash'):
            return False
        if not index_exists(conn, 'idx_tokens_me'):
            return False
        if not index_exists(conn, 'idx_tokens_expires'):
            return False

        return True
    except sqlite3.OperationalError:
        # Schema check failed - definitely not current
        return False


def table_exists(conn, table_name):
    """
    Check if table exists in database

    Args:
        conn: SQLite connection
        table_name: Name of table to check

    Returns:
        bool: True if table exists
    """
    cursor = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,)
    )
    return cursor.fetchone() is not None


def column_exists(conn, table_name, column_name):
    """
    Check if column exists in table

    Args:
        conn: SQLite connection
        table_name: Name of table
        column_name: Name of column

    Returns:
        bool: True if column exists
    """
    try:
        # PRAGMA statements cannot use bound parameters, so the table name is
        # interpolated directly; callers only pass known, hard-coded table names.
        cursor = conn.execute(f"PRAGMA table_info({table_name})")
        columns = [row[1] for row in cursor.fetchall()]
        return column_name in columns
    except sqlite3.OperationalError:
        return False


def index_exists(conn, index_name):
    """
    Check if index exists in database

    Args:
        conn: SQLite connection
        index_name: Name of index to check

    Returns:
        bool: True if index exists
    """
    cursor = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='index' AND name=?",
        (index_name,)
    )
    return cursor.fetchone() is not None
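

# Illustrative sketch (added for exposition; `_demo_introspection` is a
# hypothetical name, not part of the module): exercising the helpers above
# against a throwaway in-memory database.
def _demo_introspection():
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE tokens (id INTEGER PRIMARY KEY, token_hash TEXT)")
    conn.execute("CREATE INDEX idx_tokens_hash ON tokens(token_hash)")
    assert table_exists(conn, "tokens")
    assert column_exists(conn, "tokens", "token_hash")
    assert index_exists(conn, "idx_tokens_hash")
    # With no authorization_codes table, the schema heuristic reports "not current"
    assert not is_schema_current(conn)
    conn.close()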


def is_migration_needed(conn, migration_name):
    """
    Check if a specific migration is needed based on database state

    This is used for fresh databases where INITIAL_SCHEMA_SQL may have already
    included some migration features. We check the actual database state
    rather than just applying all migrations blindly.

    Args:
        conn: SQLite connection
        migration_name: Migration filename to check

    Returns:
        bool: True if the migration should be applied, False if it already
        arrived via INITIAL_SCHEMA_SQL
    """
    # Migration 001: Adds code_verifier column to auth_state
    if migration_name == "001_add_code_verifier_to_auth_state.sql":
        # Check if the column already exists (it was added to INITIAL_SCHEMA_SQL in v0.8.0)
        return not column_exists(conn, 'auth_state', 'code_verifier')

    # Migration 002: Creates new tokens/authorization_codes tables with indexes
    if migration_name == "002_secure_tokens_and_authorization_codes.sql":
        # This migration drops and recreates the tokens table, so we check if:
        # 1. The new tokens table structure exists (token_hash column)
        # 2. The authorization_codes table exists
        # 3. The indexes exist
        # If tables/columns are missing, this is a truly legacy database - migration needed
        if not table_exists(conn, 'authorization_codes'):
            return True
        if not column_exists(conn, 'tokens', 'token_hash'):
            return True

        # If the tables exist with the correct structure, check the indexes.
        # If indexes are missing but tables exist, this is a fresh database from
        # INITIAL_SCHEMA_SQL that just needs indexes. We CANNOT run the full migration
        # (it would fail trying to CREATE TABLE). Instead, we mark it as not needed
        # and apply the indexes separately.
        has_all_indexes = (
            index_exists(conn, 'idx_tokens_hash') and
            index_exists(conn, 'idx_tokens_me') and
            index_exists(conn, 'idx_tokens_expires') and
            index_exists(conn, 'idx_auth_codes_hash') and
            index_exists(conn, 'idx_auth_codes_expires')
        )
        if not has_all_indexes:
            # Tables exist but indexes are missing - a fresh database from
            # INITIAL_SCHEMA_SQL. We need to create just the indexes, not run
            # the full migration. Return False (don't run the migration) and
            # handle the indexes separately.
            return False

        # All features exist - migration not needed
        return False

    # Migration 003: Removes code_verifier column from auth_state
    if migration_name == "003_remove_code_verifier_from_auth_state.sql":
        # Check if the column still exists (it should be removed)
        return column_exists(conn, 'auth_state', 'code_verifier')

    # Unknown migration - assume it's needed
    return True
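

# Illustrative sketch (added for exposition; `_demo_migration_002_decision`
# is a hypothetical name, not part of the module): a database that already
# has the migration-002 tables and columns but not its indexes reports the
# migration as "not needed" - the runner then creates the indexes separately
# rather than replaying a migration whose CREATE TABLE would fail.
def _demo_migration_002_decision():
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE tokens (id INTEGER PRIMARY KEY, token_hash TEXT)")
    conn.execute("CREATE TABLE authorization_codes (id INTEGER PRIMARY KEY, code_hash TEXT)")
    assert not is_migration_needed(conn, "002_secure_tokens_and_authorization_codes.sql")
    conn.close()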


def get_applied_migrations(conn):
    """
    Get set of already-applied migration names

    Args:
        conn: SQLite connection

    Returns:
        set: Set of migration filenames that have been applied
    """
    cursor = conn.execute(
        "SELECT migration_name FROM schema_migrations ORDER BY id"
    )
    return {row[0] for row in cursor.fetchall()}


def discover_migration_files(migrations_dir):
    """
    Discover all migration files in the migrations directory

    Args:
        migrations_dir: Path to migrations directory

    Returns:
        list: Sorted list of (filename, full_path) tuples
    """
    if not migrations_dir.exists():
        return []

    migration_files = [
        (file_path.name, file_path) for file_path in migrations_dir.glob("*.sql")
    ]
    # Sort by filename (the zero-padded numeric prefix ensures correct order)
    migration_files.sort(key=lambda x: x[0])
    return migration_files
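

# Illustrative sketch (added for exposition; the filenames below are made up):
# the sort above is plain lexicographic, which matches numeric order only
# because the prefixes are zero-padded - an unpadded "10_" would sort
# before "2_".
def _demo_migration_ordering():
    names = ["010_later.sql", "002_middle.sql", "001_first.sql"]
    assert sorted(names) == ["001_first.sql", "002_middle.sql", "010_later.sql"]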


def apply_migration(conn, migration_name, migration_path, logger=None):
    """
    Apply a single migration file

    Args:
        conn: SQLite connection
        migration_name: Filename of migration
        migration_path: Full path to migration file
        logger: Optional logger for output

    Raises:
        MigrationError: If migration fails to apply
    """
    try:
        # Read migration SQL
        migration_sql = migration_path.read_text()

        if logger:
            logger.debug(f"Applying migration: {migration_name}")

        # Execute the migration. Note: under the default transaction control,
        # executescript() commits any pending transaction before running the
        # script, so the explicit BEGIN covers only the bookkeeping INSERT below.
        conn.execute("BEGIN")
        conn.executescript(migration_sql)

        # Record migration as applied
        conn.execute(
            "INSERT INTO schema_migrations (migration_name) VALUES (?)",
            (migration_name,)
        )
        conn.commit()

        if logger:
            logger.info(f"Applied migration: {migration_name}")
    except Exception as e:
        conn.rollback()
        error_msg = f"Migration {migration_name} failed: {e}"
        if logger:
            logger.error(error_msg)
        raise MigrationError(error_msg) from e
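

# Illustrative sketch (added for exposition; `_demo_apply_migration` is a
# hypothetical name, not part of the module): applying a one-statement
# migration file to a temporary database.
def _demo_apply_migration():
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        migration = Path(tmp) / "001_create_example.sql"
        migration.write_text("CREATE TABLE example (id INTEGER PRIMARY KEY);")

        conn = sqlite3.connect(str(Path(tmp) / "demo.db"))
        create_migrations_table(conn)
        apply_migration(conn, migration.name, migration)

        assert table_exists(conn, "example")
        assert migration.name in get_applied_migrations(conn)
        conn.close()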


def run_migrations(db_path, logger=None):
    """
    Run all pending database migrations with concurrency protection

    Uses database-level locking (BEGIN IMMEDIATE) to prevent race conditions
    when multiple workers start simultaneously. Only one worker will apply
    migrations; others will wait and verify completion.

    Called automatically during database initialization. Discovers migration
    files, checks which have been applied, and applies any pending migrations
    in order.

    Fresh Database Behavior:
    - If the schema_migrations table is empty AND the schema is current,
      all migrations are marked as applied (execution is skipped)
    - This handles databases created with the current INITIAL_SCHEMA_SQL

    Existing Database Behavior:
    - Applies only pending migrations
    - Migrations already in schema_migrations are skipped

    Concurrency Protection:
    - Uses BEGIN IMMEDIATE for database-level locking
    - Implements exponential backoff retry (10 attempts, up to 120s total)
    - Graduated logging (DEBUG → INFO → WARNING) based on retry count
    - Creates a new connection for each retry attempt

    Args:
        db_path: Path to SQLite database file
        logger: Optional logger for output

    Raises:
        MigrationError: If any migration fails to apply or the lock cannot be acquired
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # Determine migrations directory.
    # Assumes migrations/ is in the project root, sibling to starpunk/
    migrations_dir = Path(__file__).parent.parent / "migrations"

    if not migrations_dir.exists():
        logger.warning(f"Migrations directory not found: {migrations_dir}")
        return

    # Retry configuration for lock acquisition
    max_retries = 10
    retry_count = 0
    base_delay = 0.1  # 100ms
    start_time = time.time()
    max_total_time = 120  # 2 minutes absolute maximum
    while retry_count < max_retries and (time.time() - start_time) < max_total_time:
        conn = None
        try:
            # Connect with a longer timeout for lock contention:
            # 30s per attempt allows one worker to complete migrations
            conn = sqlite3.connect(db_path, timeout=30.0)

            # Attempt to acquire an exclusive lock for migrations.
            # BEGIN IMMEDIATE acquires a RESERVED lock, preventing other writes
            # but allowing reads. It escalates to EXCLUSIVE during actual writes.
            conn.execute("BEGIN IMMEDIATE")

            try:
                # Ensure migrations tracking table exists
                create_migrations_table(conn)

                # Quick check: have migrations already been applied by another worker?
                cursor = conn.execute("SELECT COUNT(*) FROM schema_migrations")
                migration_count = cursor.fetchone()[0]

                # Discover migration files
                migration_files = discover_migration_files(migrations_dir)

                if not migration_files:
                    conn.commit()
                    logger.info("No migration files found")
                    return

                # If migrations exist and we're not the first worker, verify and exit
                if migration_count > 0:
                    # Check if all migrations are applied
                    applied = get_applied_migrations(conn)
                    pending = [m for m, _ in migration_files if m not in applied]
                    if not pending:
                        conn.commit()
                        logger.debug("All migrations already applied by another worker")
                        return
                    # If there are pending migrations, we continue to apply them
                    logger.info(f"Found {len(pending)} pending migrations to apply")

                # Fresh database detection (original logic preserved)
                if migration_count == 0:
                    if is_schema_current(conn):
                        # Schema is current - mark all migrations as applied
                        for migration_name, _ in migration_files:
                            conn.execute(
                                "INSERT INTO schema_migrations (migration_name) VALUES (?)",
                                (migration_name,)
                            )
                        conn.commit()
                        logger.info(
                            f"Fresh database detected: marked {len(migration_files)} "
                            f"migrations as applied (schema already current)"
                        )
                        return
                    else:
                        logger.info("Fresh database with partial schema: applying needed migrations")

                # Get already-applied migrations
                applied = get_applied_migrations(conn)

                # Apply pending migrations (original logic preserved)
                pending_count = 0
                skipped_count = 0
                for migration_name, migration_path in migration_files:
                    if migration_name not in applied:
                        # Check if the migration is actually needed.
                        # For fresh databases (migration_count == 0), check all migrations.
                        # For migration 002, ALWAYS check (handles partially migrated databases).
                        should_check_needed = (
                            migration_count == 0 or
                            migration_name == "002_secure_tokens_and_authorization_codes.sql"
                        )
                        if should_check_needed and not is_migration_needed(conn, migration_name):
                            # Special handling for migration 002: if the tables exist
                            # but the indexes don't, create just the indexes
                            if migration_name == "002_secure_tokens_and_authorization_codes.sql":
                                # Check if we need to create indexes
                                indexes_to_create = []
                                if not index_exists(conn, 'idx_tokens_hash'):
                                    indexes_to_create.append("CREATE INDEX idx_tokens_hash ON tokens(token_hash)")
                                if not index_exists(conn, 'idx_tokens_me'):
                                    indexes_to_create.append("CREATE INDEX idx_tokens_me ON tokens(me)")
                                if not index_exists(conn, 'idx_tokens_expires'):
                                    indexes_to_create.append("CREATE INDEX idx_tokens_expires ON tokens(expires_at)")
                                if not index_exists(conn, 'idx_auth_codes_hash'):
                                    indexes_to_create.append("CREATE INDEX idx_auth_codes_hash ON authorization_codes(code_hash)")
                                if not index_exists(conn, 'idx_auth_codes_expires'):
                                    indexes_to_create.append("CREATE INDEX idx_auth_codes_expires ON authorization_codes(expires_at)")

                                if indexes_to_create:
                                    for index_sql in indexes_to_create:
                                        conn.execute(index_sql)
                                    logger.info(f"Created {len(indexes_to_create)} missing indexes from migration 002")

                            # Mark as applied without executing the full migration
                            # (INITIAL_SCHEMA_SQL already has the table changes)
                            conn.execute(
                                "INSERT INTO schema_migrations (migration_name) VALUES (?)",
                                (migration_name,)
                            )
                            skipped_count += 1
                            logger.debug(f"Skipped migration {migration_name} (already in INITIAL_SCHEMA_SQL)")
                        else:
                            # Apply the migration
                            try:
                                # Read migration SQL
                                migration_sql = migration_path.read_text()
                                logger.debug(f"Applying migration: {migration_name}")

                                # Execute the migration script. Note: executescript()
                                # commits any pending transaction first, so the script
                                # itself runs outside the BEGIN IMMEDIATE lock window.
                                conn.executescript(migration_sql)

                                # Record migration as applied
                                conn.execute(
                                    "INSERT INTO schema_migrations (migration_name) VALUES (?)",
                                    (migration_name,)
                                )
                                logger.info(f"Applied migration: {migration_name}")
                                pending_count += 1
                            except Exception as e:
                                # Roll back the transaction - handled by the outer exception handler
                                raise MigrationError(f"Migration {migration_name} failed: {e}") from e

                # Commit the transaction (migrations applied via executescript() have
                # already been committed; this finalizes the bookkeeping INSERTs)
                conn.commit()

                # Summary
                total_count = len(migration_files)
                if pending_count > 0 or skipped_count > 0:
                    if skipped_count > 0:
                        logger.info(
                            f"Migrations complete: {pending_count} applied, {skipped_count} skipped "
                            f"(already in INITIAL_SCHEMA_SQL), {total_count} total"
                        )
                    else:
                        logger.info(
                            f"Migrations complete: {pending_count} applied, "
                            f"{total_count} total"
                        )
                else:
                    logger.info(f"All migrations up to date ({total_count} total)")

                return  # Success!

            except MigrationError:
                # Migration error - roll back and re-raise
                try:
                    conn.rollback()
                except Exception as rollback_error:
                    logger.critical(f"FATAL: Rollback failed: {rollback_error}")
                    raise SystemExit(1)
                raise
            except Exception as e:
                # Unexpected error during migration - roll back and wrap
                try:
                    conn.rollback()
                except Exception as rollback_error:
                    logger.critical(f"FATAL: Rollback failed: {rollback_error}")
                    raise SystemExit(1)
                raise MigrationError(f"Migration system error: {e}") from e

        except sqlite3.OperationalError as e:
            if "database is locked" in str(e).lower():
                # Another worker has the lock, retry with exponential backoff
                retry_count += 1
                if retry_count < max_retries:
                    # Exponential backoff with jitter to prevent a thundering herd
                    delay = base_delay * (2 ** retry_count) + random.uniform(0, 0.1)
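                    # With base_delay = 0.1s, the nominal delays before jitter are
                    # 0.2s, 0.4s, 0.8s, ... 51.2s for retries 1-9: roughly 102s of
                    # cumulative waiting, just inside the 120s max_total_time cap.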
                    # Graduated logging based on retry count
                    if retry_count <= 3:
                        # Normal operation - DEBUG level
                        logger.debug(
                            f"Database locked by another worker, retry {retry_count}/{max_retries} "
                            f"in {delay:.2f}s"
                        )
                    elif retry_count <= 7:
                        # Getting concerning - INFO level
                        logger.info(
                            f"Database locked by another worker, retry {retry_count}/{max_retries} "
                            f"in {delay:.2f}s"
                        )
                    else:
                        # Abnormal - WARNING level
                        logger.warning(
                            f"Database locked by another worker, retry {retry_count}/{max_retries} "
                            f"in {delay:.2f}s (approaching max retries)"
                        )

                    time.sleep(delay)
                    continue
                else:
                    # Retries exhausted
                    elapsed = time.time() - start_time
                    raise MigrationError(
                        f"Failed to acquire migration lock after {max_retries} attempts over {elapsed:.1f}s. "
                        f"Possible causes:\n"
                        f"1. Another process is stuck in migration (check logs)\n"
                        f"2. Database file permissions issue\n"
                        f"3. Disk I/O problems\n"
                        f"Action: Restart container with a single worker to diagnose"
                    )
            else:
                # Non-lock related database error
                error_msg = f"Database error during migration: {e}"
                logger.error(error_msg)
                raise MigrationError(error_msg) from e

        except MigrationError:
            # Re-raise migration errors (already logged)
            raise
        except Exception as e:
            # Unexpected error
            error_msg = f"Unexpected error during migration: {e}"
            logger.error(error_msg)
            raise MigrationError(error_msg) from e

        finally:
            if conn:
                try:
                    conn.close()
                except Exception:
                    pass  # Ignore errors during cleanup

    # Should only reach here if the time limit was exceeded
    elapsed = time.time() - start_time
    raise MigrationError(
        f"Migration timeout: Failed to acquire lock within {max_total_time}s limit "
        f"(elapsed: {elapsed:.1f}s, retries: {retry_count})"
    )
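

# Illustrative usage sketch (added for exposition; the path and logging setup
# below are assumptions - in StarPunk this function runs automatically during
# database initialization):
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    run_migrations("data/starpunk.db")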