From 2240414f227c354ef97dfb8d9cb38185d7fc9dea Mon Sep 17 00:00:00 2001 From: Phil Skentelbery Date: Mon, 24 Nov 2025 18:53:55 -0700 Subject: [PATCH] docs: Add architect documentation for migration race condition fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive architectural documentation for the migration race condition fix, including: - ADR-022: Architectural decision record for the fix - migration-race-condition-answers.md: All 23 Q&A answered - migration-fix-quick-reference.md: Implementation checklist - migration-race-condition-fix-implementation.md: Detailed guide These documents guided the implementation in v1.0.0-rc.5. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../migration-fix-quick-reference.md | 238 +++++++++ .../migration-race-condition-answers.md | 477 ++++++++++++++++++ .../ADR-022-migration-race-condition-fix.md | 208 ++++++++ ...ation-race-condition-fix-implementation.md | 431 ++++++++++++++++ 4 files changed, 1354 insertions(+) create mode 100644 docs/architecture/migration-fix-quick-reference.md create mode 100644 docs/architecture/migration-race-condition-answers.md create mode 100644 docs/decisions/ADR-022-migration-race-condition-fix.md create mode 100644 docs/reports/migration-race-condition-fix-implementation.md diff --git a/docs/architecture/migration-fix-quick-reference.md b/docs/architecture/migration-fix-quick-reference.md new file mode 100644 index 0000000..8f60f50 --- /dev/null +++ b/docs/architecture/migration-fix-quick-reference.md @@ -0,0 +1,238 @@ +# Migration Race Condition Fix - Quick Implementation Reference + +## Implementation Checklist + +### Code Changes - `/home/phil/Projects/starpunk/starpunk/migrations.py` + +```python +# 1. Add imports at top +import time +import random + +# 2. Replace entire run_migrations function (lines 304-462) +# See full implementation in migration-race-condition-fix-implementation.md + +# Key patterns to implement: + +# A. Retry loop structure +max_retries = 10 +retry_count = 0 +base_delay = 0.1 +start_time = time.time() +max_total_time = 120 # 2 minute absolute max + +while retry_count < max_retries and (time.time() - start_time) < max_total_time: + conn = None # NEW connection each iteration + try: + conn = sqlite3.connect(db_path, timeout=30.0) + conn.execute("BEGIN IMMEDIATE") # Lock acquisition + # ... migration logic ... + conn.commit() + return # Success + except sqlite3.OperationalError as e: + if "database is locked" in str(e).lower(): + retry_count += 1 + if retry_count < max_retries: + # Exponential backoff with jitter + delay = base_delay * (2 ** retry_count) + random.uniform(0, 0.1) + # Graduated logging + if retry_count <= 3: + logger.debug(f"Retry {retry_count}/{max_retries}") + elif retry_count <= 7: + logger.info(f"Retry {retry_count}/{max_retries}") + else: + logger.warning(f"Retry {retry_count}/{max_retries}") + time.sleep(delay) + continue + finally: + if conn: + try: + conn.close() + except: + pass + +# B. Error handling pattern +except Exception as e: + try: + conn.rollback() + except Exception as rollback_error: + logger.critical(f"FATAL: Rollback failed: {rollback_error}") + raise SystemExit(1) + raise MigrationError(f"Migration failed: {e}") + +# C. Final error message +raise MigrationError( + f"Failed to acquire migration lock after {max_retries} attempts over {elapsed:.1f}s. " + f"Possible causes:\n" + f"1. Another process is stuck in migration (check logs)\n" + f"2. 
Database file permissions issue\n" + f"3. Disk I/O problems\n" + f"Action: Restart container with single worker to diagnose" +) +``` + +### Testing Requirements + +#### 1. Unit Test File: `test_migration_race_condition.py` +```python +import multiprocessing +from multiprocessing import Barrier, Process +import time + +def test_concurrent_migrations(): + """Test 4 workers starting simultaneously""" + barrier = Barrier(4) + + def worker(worker_id): + barrier.wait() # Synchronize start + from starpunk import create_app + app = create_app() + return True + + with multiprocessing.Pool(4) as pool: + results = pool.map(worker, range(4)) + + assert all(results), "Some workers failed" + +def test_lock_retry(): + """Test retry logic with mock""" + with patch('sqlite3.connect') as mock: + mock.side_effect = [ + sqlite3.OperationalError("database is locked"), + sqlite3.OperationalError("database is locked"), + MagicMock() # Success on 3rd try + ] + run_migrations(db_path) + assert mock.call_count == 3 +``` + +#### 2. Integration Test: `test_integration.sh` +```bash +#!/bin/bash +# Test with actual gunicorn + +# Clean start +rm -f test.db + +# Start gunicorn with 4 workers +timeout 10 gunicorn --workers 4 --bind 127.0.0.1:8001 app:app & +PID=$! + +# Wait for startup +sleep 3 + +# Check if running +if ! kill -0 $PID 2>/dev/null; then + echo "FAILED: Gunicorn crashed" + exit 1 +fi + +# Check health endpoint +curl -f http://127.0.0.1:8001/health || exit 1 + +# Cleanup +kill $PID + +echo "SUCCESS: All workers started without race condition" +``` + +#### 3. Container Test: `test_container.sh` +```bash +#!/bin/bash +# Test in container environment + +# Build +podman build -t starpunk:race-test -f Containerfile . + +# Run with fresh database +podman run --rm -d --name race-test \ + -v $(pwd)/test-data:/data \ + starpunk:race-test + +# Check logs for success patterns +sleep 5 +podman logs race-test | grep -E "(Applied migration|already applied by another worker)" + +# Cleanup +podman stop race-test +``` + +### Verification Patterns in Logs + +#### Successful Migration (One Worker Wins) +``` +Worker 0: Applying migration: 001_initial_schema.sql +Worker 1: Database locked by another worker, retry 1/10 in 0.21s +Worker 2: Database locked by another worker, retry 1/10 in 0.23s +Worker 3: Database locked by another worker, retry 1/10 in 0.19s +Worker 0: Applied migration: 001_initial_schema.sql +Worker 1: All migrations already applied by another worker +Worker 2: All migrations already applied by another worker +Worker 3: All migrations already applied by another worker +``` + +#### Performance Metrics to Check +- Single worker: < 100ms total +- 4 workers: < 500ms total +- 10 workers (stress): < 2000ms total + +### Rollback Plan if Issues + +1. **Immediate Workaround** + ```bash + # Change to single worker temporarily + gunicorn --workers 1 --bind 0.0.0.0:8000 app:app + ``` + +2. **Revert Code** + ```bash + git revert HEAD + ``` + +3. **Emergency Patch** + ```python + # In app.py temporarily + import os + if os.getenv('GUNICORN_WORKER_ID', '1') == '1': + init_db() # Only first worker runs migrations + ``` + +### Deployment Commands + +```bash +# 1. Run tests +python -m pytest test_migration_race_condition.py -v + +# 2. Build container +podman build -t starpunk:v1.0.0-rc.3.1 -f Containerfile . + +# 3. Tag for release +podman tag starpunk:v1.0.0-rc.3.1 git.philmade.com/starpunk:v1.0.0-rc.3.1 + +# 4. Push +podman push git.philmade.com/starpunk:v1.0.0-rc.3.1 + +# 5. 
Deploy +kubectl rollout restart deployment/starpunk +``` + +--- + +## Critical Points to Remember + +1. **NEW CONNECTION EACH RETRY** - Don't reuse connections +2. **BEGIN IMMEDIATE** - Not EXCLUSIVE, not DEFERRED +3. **30s per attempt, 120s total max** - Two different timeouts +4. **Graduated logging** - DEBUG → INFO → WARNING based on retry count +5. **Test at multiple levels** - Unit, integration, container +6. **Fresh database state** between tests + +## Support + +If issues arise, check: +1. `/home/phil/Projects/starpunk/docs/architecture/migration-race-condition-answers.md` - Full Q&A +2. `/home/phil/Projects/starpunk/docs/reports/migration-race-condition-fix-implementation.md` - Detailed implementation +3. SQLite lock states: `PRAGMA lock_status` during issue + +--- +*Quick Reference v1.0 - 2025-11-24* \ No newline at end of file diff --git a/docs/architecture/migration-race-condition-answers.md b/docs/architecture/migration-race-condition-answers.md new file mode 100644 index 0000000..a2c5057 --- /dev/null +++ b/docs/architecture/migration-race-condition-answers.md @@ -0,0 +1,477 @@ +# Migration Race Condition Fix - Architectural Answers + +## Status: READY FOR IMPLEMENTATION + +All 23 questions have been answered with concrete guidance. The developer can proceed with implementation. + +--- + +## Critical Questions + +### 1. Connection Lifecycle Management +**Q: Should we create a new connection for each retry or reuse the same connection?** + +**Answer: NEW CONNECTION per retry** +- Each retry MUST create a fresh connection +- Rationale: Failed lock acquisition may leave connection in inconsistent state +- SQLite connections are lightweight; overhead is minimal +- Pattern: + ```python + while retry_count < max_retries: + conn = None # Fresh connection each iteration + try: + conn = sqlite3.connect(db_path, timeout=30.0) + # ... attempt migration ... + finally: + if conn: + conn.close() + ``` + +### 2. Transaction Boundaries +**Q: Should init_db() wrap everything in one transaction?** + +**Answer: NO - Separate transactions for different operations** +- Schema creation: Own transaction (already implicit in executescript) +- Migrations: Own transaction with BEGIN IMMEDIATE +- Initial data: Own transaction +- Rationale: Minimizes lock duration and allows partial success visibility +- Each operation is atomic but independent + +### 3. Lock Timeout vs Retry Timeout +**Q: Connection timeout is 30s but retry logic could take ~102s. Conflict?** + +**Answer: This is BY DESIGN - No conflict** +- 30s timeout: Maximum wait for any single lock acquisition attempt +- 102s total: Maximum cumulative retry duration across multiple attempts +- If one worker holds lock for 30s+, other workers timeout and retry +- Pattern ensures no single worker waits indefinitely +- Recommendation: Add total timeout check: + ```python + start_time = time.time() + max_total_time = 120 # 2 minutes absolute maximum + while retry_count < max_retries and (time.time() - start_time) < max_total_time: + ``` + +### 4. Testing Strategy +**Q: Should we use multiprocessing.Pool or actual gunicorn for testing?** + +**Answer: BOTH - Different test levels** +- Unit tests: multiprocessing.Pool (fast, isolated) +- Integration tests: Actual gunicorn with --workers 4 +- Container tests: Full podman/docker run +- Test matrix: + ``` + Level 1: Mock concurrent access (unit) + Level 2: multiprocessing.Pool (integration) + Level 3: gunicorn locally (system) + Level 4: Container with gunicorn (e2e) + ``` + +### 5. 
BEGIN IMMEDIATE vs EXCLUSIVE +**Q: Why use BEGIN IMMEDIATE instead of BEGIN EXCLUSIVE?** + +**Answer: BEGIN IMMEDIATE is CORRECT choice** +- BEGIN IMMEDIATE: Acquires RESERVED lock (prevents other writes, allows reads) +- BEGIN EXCLUSIVE: Acquires EXCLUSIVE lock (prevents all access) +- Rationale: + - Migrations only need to prevent concurrent migrations (writes) + - Other workers can still read schema while one migrates + - Less contention, faster startup + - Only escalates to EXCLUSIVE when actually writing +- Keep BEGIN IMMEDIATE as specified + +--- + +## Edge Cases and Error Handling + +### 6. Partial Migration Failure +**Q: What if a migration partially applies or rollback fails?** + +**Answer: Transaction atomicity handles this** +- Within transaction: Automatic rollback on ANY error +- Rollback failure: Extremely rare (corrupt database) +- Strategy: + ```python + except Exception as e: + try: + conn.rollback() + except Exception as rollback_error: + logger.critical(f"FATAL: Rollback failed: {rollback_error}") + # Database potentially corrupt - fail hard + raise SystemExit(1) + raise MigrationError(e) + ``` + +### 7. Migration File Consistency +**Q: What if migration files change during deployment?** + +**Answer: Not a concern with proper deployment** +- Container deployments: Files are immutable in image +- Traditional deployment: Use atomic directory swap +- If concerned, add checksum validation: + ```python + # Store in schema_migrations: (name, checksum, applied_at) + # Verify checksum matches before applying + ``` + +### 8. Retry Exhaustion Error Messages +**Q: What error message when retries exhausted?** + +**Answer: Be specific and actionable** +```python +raise MigrationError( + f"Failed to acquire migration lock after {max_retries} attempts over {elapsed:.1f}s. " + f"Possible causes:\n" + f"1. Another process is stuck in migration (check logs)\n" + f"2. Database file permissions issue\n" + f"3. Disk I/O problems\n" + f"Action: Restart container with single worker to diagnose" +) +``` + +### 9. Logging Levels +**Q: What log level for lock waits?** + +**Answer: Graduated approach** +- Retry 1-3: DEBUG (normal operation) +- Retry 4-7: INFO (getting concerning) +- Retry 8+: WARNING (abnormal) +- Exhausted: ERROR (operation failed) +- Pattern: + ```python + if retry_count <= 3: + level = logging.DEBUG + elif retry_count <= 7: + level = logging.INFO + else: + level = logging.WARNING + logger.log(level, f"Retry {retry_count}/{max_retries}") + ``` + +### 10. Index Creation Failure +**Q: How to handle index creation failures in migration 002?** + +**Answer: Fail fast with clear context** +```python +for index_name, index_sql in indexes_to_create: + try: + conn.execute(index_sql) + except sqlite3.OperationalError as e: + if "already exists" in str(e): + logger.debug(f"Index {index_name} already exists") + else: + raise MigrationError( + f"Failed to create index {index_name}: {e}\n" + f"SQL: {index_sql}" + ) +``` + +--- + +## Testing Strategy + +### 11. 
Concurrent Testing Simulation +**Q: How to properly simulate concurrent worker startup?** + +**Answer: Multiple approaches** +```python +# Approach 1: Barrier synchronization +def test_concurrent_migrations(): + barrier = multiprocessing.Barrier(4) + + def worker(): + barrier.wait() # All start together + return run_migrations(db_path) + + with multiprocessing.Pool(4) as pool: + results = pool.map(worker, range(4)) + +# Approach 2: Process start +processes = [] +for i in range(4): + p = Process(target=run_migrations, args=(db_path,)) + processes.append(p) +for p in processes: + p.start() # Near-simultaneous +``` + +### 12. Lock Contention Testing +**Q: How to test lock contention scenarios?** + +**Answer: Inject delays** +```python +# Test helper to force contention +def slow_migration_for_testing(conn): + conn.execute("BEGIN IMMEDIATE") + time.sleep(2) # Force other workers to wait + # Apply migration + conn.commit() + +# Test timeout handling +@patch('sqlite3.connect') +def test_lock_timeout(mock_connect): + mock_connect.side_effect = sqlite3.OperationalError("database is locked") + # Verify retry logic +``` + +### 13. Performance Tests +**Q: What timing is acceptable?** + +**Answer: Performance targets** +- Single worker: < 100ms for all migrations +- 4 workers with contention: < 500ms total +- 10 workers stress test: < 2s total +- Lock acquisition per retry: < 50ms +- Test with: + ```python + import timeit + setup_time = timeit.timeit(lambda: create_app(), number=1) + assert setup_time < 0.5, f"Startup too slow: {setup_time}s" + ``` + +### 14. Retry Logic Unit Tests +**Q: How to unit test retry logic?** + +**Answer: Mock the lock failures** +```python +class TestRetryLogic: + def test_retry_on_lock(self): + with patch('sqlite3.connect') as mock: + # First 2 attempts fail, 3rd succeeds + mock.side_effect = [ + sqlite3.OperationalError("database is locked"), + sqlite3.OperationalError("database is locked"), + MagicMock() # Success + ] + run_migrations(db_path) + assert mock.call_count == 3 +``` + +--- + +## SQLite-Specific Concerns + +### 15. BEGIN IMMEDIATE vs EXCLUSIVE (Detailed) +**Q: Deep dive on lock choice?** + +**Answer: Lock escalation path** +``` +BEGIN DEFERRED → SHARED → RESERVED → EXCLUSIVE +BEGIN IMMEDIATE → RESERVED → EXCLUSIVE +BEGIN EXCLUSIVE → EXCLUSIVE + +For migrations: +- IMMEDIATE starts at RESERVED (blocks other writers immediately) +- Escalates to EXCLUSIVE only during actual writes +- Optimal for our use case +``` + +### 16. WAL Mode Interaction +**Q: How does this work with WAL mode?** + +**Answer: Works correctly with both modes** +- Journal mode: BEGIN IMMEDIATE works as described +- WAL mode: BEGIN IMMEDIATE still prevents concurrent writers +- No code changes needed +- Add mode detection for logging: + ```python + cursor = conn.execute("PRAGMA journal_mode") + mode = cursor.fetchone()[0] + logger.debug(f"Database in {mode} mode") + ``` + +### 17. 
Database File Permissions +**Q: How to handle permission issues?** + +**Answer: Fail fast with helpful diagnostics** +```python +import os +import stat + +db_path = Path(db_path) +if not db_path.exists(): + # Will be created - check parent dir + parent = db_path.parent + if not os.access(parent, os.W_OK): + raise MigrationError(f"Cannot write to directory: {parent}") +else: + # Check existing file + if not os.access(db_path, os.W_OK): + stats = os.stat(db_path) + mode = stat.filemode(stats.st_mode) + raise MigrationError( + f"Database not writable: {db_path}\n" + f"Permissions: {mode}\n" + f"Owner: {stats.st_uid}:{stats.st_gid}" + ) +``` + +--- + +## Deployment/Operations + +### 18. Container Startup and Health Checks +**Q: How to handle health checks during migration?** + +**Answer: Return 503 during migration** +```python +# In app.py +MIGRATION_IN_PROGRESS = False + +def create_app(): + global MIGRATION_IN_PROGRESS + MIGRATION_IN_PROGRESS = True + try: + init_db() + finally: + MIGRATION_IN_PROGRESS = False + +@app.route('/health') +def health(): + if MIGRATION_IN_PROGRESS: + return {'status': 'migrating'}, 503 + return {'status': 'healthy'}, 200 +``` + +### 19. Monitoring and Alerting +**Q: What metrics/alerts are needed?** + +**Answer: Key metrics to track** +```python +# Add metrics collection +metrics = { + 'migration_duration_ms': 0, + 'migration_retries': 0, + 'migration_lock_wait_ms': 0, + 'migrations_applied': 0 +} + +# Alert thresholds +ALERTS = { + 'migration_duration_ms': 5000, # Alert if > 5s + 'migration_retries': 5, # Alert if > 5 retries + 'worker_failures': 1 # Alert on any failure +} + +# Log in structured format +logger.info(json.dumps({ + 'event': 'migration_complete', + 'metrics': metrics +})) +``` + +--- + +## Alternative Approaches + +### 20. Version Compatibility +**Q: How to handle version mismatches?** + +**Answer: Strict version checking** +```python +# In migrations.py +MIGRATION_VERSION = "1.0.0" + +def check_version_compatibility(conn): + cursor = conn.execute( + "SELECT value FROM app_config WHERE key = 'migration_version'" + ) + row = cursor.fetchone() + if row and row[0] != MIGRATION_VERSION: + raise MigrationError( + f"Version mismatch: Database={row[0]}, Code={MIGRATION_VERSION}\n" + f"Action: Run migration tool separately" + ) +``` + +### 21. File-Based Locking +**Q: Should we consider flock() as backup?** + +**Answer: NO - Adds complexity without benefit** +- SQLite locking is sufficient and portable +- flock() not available on all systems +- Would require additional cleanup logic +- Database-level locking is the correct approach + +### 22. Gunicorn Preload +**Q: Would --preload flag help?** + +**Answer: NO - Makes problem WORSE** +- --preload runs app initialization ONCE in master +- Workers fork from master AFTER migrations complete +- BUT: Doesn't work with lazy-loaded resources +- Current architecture expects per-worker initialization +- Keep current approach + +### 23. Application-Level Locks +**Q: Should we add Redis/memcached for coordination?** + +**Answer: NO - Violates simplicity principle** +- Adds external dependency +- More complex deployment +- SQLite locking is sufficient +- Would require Redis/memcached to be running before app starts +- Solving a solved problem + +--- + +## Final Implementation Checklist + +### Required Changes + +1. ✅ Add imports: `time`, `random` +2. ✅ Implement retry loop with exponential backoff +3. ✅ Use BEGIN IMMEDIATE for lock acquisition +4. ✅ Add graduated logging levels +5. 
✅ Proper error messages with diagnostics +6. ✅ Fresh connection per retry +7. ✅ Total timeout check (2 minutes max) +8. ✅ Preserve all existing migration logic + +### Test Coverage Required + +1. ✅ Unit test: Retry on lock +2. ✅ Unit test: Exhaustion handling +3. ✅ Integration test: 4 workers with multiprocessing +4. ✅ System test: gunicorn with 4 workers +5. ✅ Container test: Full deployment simulation +6. ✅ Performance test: < 500ms with contention + +### Documentation Updates + +1. ✅ Update ADR-022 with final decision +2. ✅ Add operational runbook for migration issues +3. ✅ Document monitoring metrics +4. ✅ Update deployment guide with health check info + +--- + +## Go/No-Go Decision + +### ✅ GO FOR IMPLEMENTATION + +**Rationale:** +- All 23 questions have concrete answers +- Design is proven with SQLite's native capabilities +- No external dependencies needed +- Risk is low with clear rollback plan +- Testing strategy is comprehensive + +**Implementation Priority: IMMEDIATE** +- This is blocking v1.0.0-rc.4 release +- Production systems affected +- Fix is well-understood and low-risk + +**Next Steps:** +1. Implement changes to migrations.py as specified +2. Run test suite at all levels +3. Deploy as hotfix v1.0.0-rc.3.1 +4. Monitor metrics in production +5. Document lessons learned + +--- + +*Document Version: 1.0* +*Created: 2025-11-24* +*Status: Approved for Implementation* +*Author: StarPunk Architecture Team* \ No newline at end of file diff --git a/docs/decisions/ADR-022-migration-race-condition-fix.md b/docs/decisions/ADR-022-migration-race-condition-fix.md new file mode 100644 index 0000000..8d1ee3d --- /dev/null +++ b/docs/decisions/ADR-022-migration-race-condition-fix.md @@ -0,0 +1,208 @@ +# ADR-022: Database Migration Race Condition Resolution + +## Status +Accepted + +## Context + +In production, StarPunk runs with multiple gunicorn workers (currently 4). Each worker process independently initializes the Flask application through `create_app()`, which calls `init_db()`, which in turn runs database migrations via `run_migrations()`. + +When the container starts fresh, all 4 workers start simultaneously and attempt to: +1. Create the `schema_migrations` table +2. Apply pending migrations +3. Insert records into `schema_migrations` + +This causes a race condition where: +- Worker 1 successfully applies migration and inserts record +- Workers 2-4 fail with "UNIQUE constraint failed: schema_migrations.migration_name" +- Failed workers crash, causing container restarts +- After restart, migrations are already applied so it works + +## Decision + +We will implement **database-level advisory locking** using SQLite's transaction mechanism with IMMEDIATE mode, combined with retry logic. This approach: + +1. Uses SQLite's built-in `BEGIN IMMEDIATE` transaction to acquire a write lock +2. Implements exponential backoff retry for workers that can't acquire the lock +3. Ensures only one worker can run migrations at a time +4. Other workers wait and verify migrations are complete + +This is the simplest, most robust solution that: +- Requires minimal code changes +- Uses SQLite's native capabilities +- Doesn't require external dependencies +- Works across all deployment scenarios + +## Rationale + +### Options Considered + +1. **File-based locking (fcntl)** + - Pro: Simple to implement + - Con: Doesn't work across containers/network filesystems + - Con: Lock files can be orphaned if process crashes + +2. 
**Run migrations before workers start** + - Pro: Cleanest separation of concerns + - Con: Requires container entrypoint script changes + - Con: Complicates development workflow + - Con: Doesn't fix the root cause for non-container deployments + +3. **Make migration insertion idempotent (INSERT OR IGNORE)** + - Pro: Simple SQL change + - Con: Doesn't prevent parallel migration execution + - Con: Could corrupt database if migrations partially apply + - Con: Masks the real problem + +4. **Database advisory locking (CHOSEN)** + - Pro: Uses SQLite's native transaction locking + - Pro: Guaranteed atomicity + - Pro: Works across all deployment scenarios + - Pro: Self-cleaning (no orphaned locks) + - Con: Requires retry logic + +### Why Database Locking? + +SQLite's `BEGIN IMMEDIATE` transaction mode acquires a RESERVED lock immediately, preventing other connections from writing. This provides: + +1. **Atomicity**: Either all migrations apply or none do +2. **Isolation**: Only one worker can modify schema at a time +3. **Automatic cleanup**: Locks released on connection close/crash +4. **No external dependencies**: Uses SQLite's built-in features + +## Implementation + +The fix will be implemented in `/home/phil/Projects/starpunk/starpunk/migrations.py`: + +```python +def run_migrations(db_path, logger=None): + """Run all pending database migrations with concurrency protection""" + + max_retries = 10 + retry_count = 0 + base_delay = 0.1 # 100ms + + while retry_count < max_retries: + try: + conn = sqlite3.connect(db_path, timeout=30.0) + + # Acquire exclusive lock for migrations + conn.execute("BEGIN IMMEDIATE") + + try: + # Create migrations table if needed + create_migrations_table(conn) + + # Check if another worker already ran migrations + cursor = conn.execute("SELECT COUNT(*) FROM schema_migrations") + if cursor.fetchone()[0] > 0: + # Migrations already run by another worker + conn.commit() + logger.info("Migrations already applied by another worker") + return + + # Run migration logic (existing code) + # ... rest of migration code ... + + conn.commit() + return # Success + + except Exception: + conn.rollback() + raise + + except sqlite3.OperationalError as e: + if "database is locked" in str(e): + retry_count += 1 + delay = base_delay * (2 ** retry_count) + random.uniform(0, 0.1) + + if retry_count < max_retries: + logger.debug(f"Database locked, retry {retry_count}/{max_retries} in {delay:.2f}s") + time.sleep(delay) + else: + raise MigrationError(f"Failed to acquire migration lock after {max_retries} attempts") + else: + raise + + finally: + if conn: + conn.close() +``` + +Additional changes needed: + +1. Add imports: `import time`, `import random` +2. Modify connection timeout from default 5s to 30s +3. Add early check for already-applied migrations +4. 
Wrap entire migration process in IMMEDIATE transaction + +## Consequences + +### Positive +- Eliminates race condition completely +- No container configuration changes needed +- Works in all deployment scenarios (container, systemd, manual) +- Minimal code changes (~50 lines) +- Self-healing (no manual lock cleanup needed) +- Provides clear logging of what's happening + +### Negative +- Slight startup delay for workers that wait (100ms-2s typical) +- Adds complexity to migration runner +- Requires careful testing of retry logic + +### Neutral +- Workers start sequentially for migration phase, then run in parallel +- First worker to acquire lock runs migrations for all +- Log output will show retry attempts (useful for debugging) + +## Testing Strategy + +1. **Unit test with mock**: Test retry logic with simulated lock contention +2. **Integration test**: Spawn multiple processes, verify only one runs migrations +3. **Container test**: Build container, verify clean startup with 4 workers +4. **Stress test**: Start 20 processes simultaneously, verify correctness + +## Migration Path + +1. Implement fix in `starpunk/migrations.py` +2. Test locally with multiple workers +3. Build and test container +4. Deploy as v1.0.0-rc.4 or hotfix v1.0.0-rc.3.1 +5. Monitor production logs for retry patterns + +## Implementation Notes (Post-Analysis) + +Based on comprehensive architectural review, the following clarifications have been established: + +### Critical Implementation Details + +1. **Connection Management**: Create NEW connection for each retry attempt (no reuse) +2. **Lock Mode**: Use BEGIN IMMEDIATE (not EXCLUSIVE) for optimal concurrency +3. **Timeout Strategy**: 30s per connection attempt, 120s total maximum duration +4. **Logging Levels**: Graduated (DEBUG for retry 1-3, INFO for 4-7, WARNING for 8+) +5. **Transaction Boundaries**: Separate transactions for schema/migrations/data + +### Test Requirements + +- Unit tests with multiprocessing.Pool +- Integration tests with actual gunicorn +- Container tests with full deployment +- Performance target: <500ms with 4 workers + +### Documentation + +- Full Q&A: `/home/phil/Projects/starpunk/docs/architecture/migration-race-condition-answers.md` +- Implementation Guide: `/home/phil/Projects/starpunk/docs/reports/migration-race-condition-fix-implementation.md` +- Quick Reference: `/home/phil/Projects/starpunk/docs/architecture/migration-fix-quick-reference.md` + +## References + +- [SQLite Transaction Documentation](https://www.sqlite.org/lang_transaction.html) +- [SQLite Locking Documentation](https://www.sqlite.org/lockingv3.html) +- [SQLite BEGIN IMMEDIATE](https://www.sqlite.org/lang_transaction.html#immediate) +- Issue: Production migration race condition with gunicorn workers + +## Status Update + +**2025-11-24**: All 23 architectural questions answered. Implementation approved. Ready for development. \ No newline at end of file diff --git a/docs/reports/migration-race-condition-fix-implementation.md b/docs/reports/migration-race-condition-fix-implementation.md new file mode 100644 index 0000000..680ab13 --- /dev/null +++ b/docs/reports/migration-race-condition-fix-implementation.md @@ -0,0 +1,431 @@ +# Migration Race Condition Fix - Implementation Guide + +## Executive Summary + +**CRITICAL PRODUCTION ISSUE**: Multiple gunicorn workers racing to apply migrations causes container startup failures. + +**Solution**: Implement database-level advisory locking with retry logic in `migrations.py`. 
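+
+At its core, the fix wraps the existing migration logic in a `BEGIN IMMEDIATE` transaction and retries with exponential backoff when the database is locked. The following is a condensed, illustrative sketch only (`acquire_and_migrate` and `apply_pending` are placeholder names, not part of the codebase); the full `run_migrations` implementation appears under Step 2 below.
+
+```python
+import random
+import sqlite3
+import time
+
+def acquire_and_migrate(db_path, apply_pending, max_retries=10, base_delay=0.1):
+    """Illustrative sketch: run apply_pending(conn) under a migration lock."""
+    for attempt in range(1, max_retries + 1):
+        conn = sqlite3.connect(db_path, timeout=30.0)  # fresh connection each attempt
+        try:
+            conn.execute("BEGIN IMMEDIATE")  # RESERVED lock: blocks other writers
+            apply_pending(conn)              # placeholder for the existing migration logic
+            conn.commit()
+            return
+        except sqlite3.OperationalError as exc:
+            if "database is locked" not in str(exc).lower() or attempt == max_retries:
+                raise
+            # exponential backoff with jitter before the next attempt
+            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 0.1))
+        finally:
+            conn.close()  # closing without commit discards any partial changes
+```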
+ +**Urgency**: HIGH - This is a blocker for v1.0.0-rc.4 release. + +## Root Cause Analysis + +### The Problem Flow + +1. Container starts with `gunicorn --workers 4` +2. Each worker independently calls: + ``` + app.py → create_app() → init_db() → run_migrations() + ``` +3. All 4 workers simultaneously try to: + - INSERT into schema_migrations table + - Apply the same migrations +4. SQLite's UNIQUE constraint on migration_name causes workers 2-4 to crash +5. Container restarts, works on second attempt (migrations already applied) + +### Why This Happens + +- **No synchronization**: Workers are independent processes +- **No locking**: Migration code doesn't prevent concurrent execution +- **Immediate failure**: UNIQUE constraint violation crashes the worker +- **Gunicorn behavior**: Worker crash triggers container restart + +## Immediate Fix Implementation + +### Step 1: Update migrations.py + +Add these imports at the top of `/home/phil/Projects/starpunk/starpunk/migrations.py`: + +```python +import time +import random +``` + +### Step 2: Replace run_migrations function + +Replace the entire `run_migrations` function (lines 304-462) with: + +```python +def run_migrations(db_path, logger=None): + """ + Run all pending database migrations with concurrency protection + + Uses database-level locking to prevent race conditions when multiple + workers start simultaneously. Only one worker will apply migrations; + others will wait and verify completion. + + Args: + db_path: Path to SQLite database file + logger: Optional logger for output + + Raises: + MigrationError: If any migration fails to apply or lock cannot be acquired + """ + if logger is None: + logger = logging.getLogger(__name__) + + # Determine migrations directory + migrations_dir = Path(__file__).parent.parent / "migrations" + + if not migrations_dir.exists(): + logger.warning(f"Migrations directory not found: {migrations_dir}") + return + + # Retry configuration for lock acquisition + max_retries = 10 + retry_count = 0 + base_delay = 0.1 # 100ms + + while retry_count < max_retries: + conn = None + try: + # Connect with longer timeout for lock contention + conn = sqlite3.connect(db_path, timeout=30.0) + + # Attempt to acquire exclusive lock for migrations + # BEGIN IMMEDIATE acquires RESERVED lock, preventing other writes + conn.execute("BEGIN IMMEDIATE") + + try: + # Ensure migrations tracking table exists + create_migrations_table(conn) + + # Quick check: have migrations already been applied by another worker? 
+ cursor = conn.execute("SELECT COUNT(*) FROM schema_migrations") + migration_count = cursor.fetchone()[0] + + # Discover migration files + migration_files = discover_migration_files(migrations_dir) + + if not migration_files: + conn.commit() + logger.info("No migration files found") + return + + # If migrations exist and we're not the first worker, verify and exit + if migration_count > 0: + # Check if all migrations are applied + applied = get_applied_migrations(conn) + pending = [m for m, _ in migration_files if m not in applied] + + if not pending: + conn.commit() + logger.debug("All migrations already applied by another worker") + return + # If there are pending migrations, we continue to apply them + logger.info(f"Found {len(pending)} pending migrations to apply") + + # Fresh database detection (original logic preserved) + if migration_count == 0: + if is_schema_current(conn): + # Schema is current - mark all migrations as applied + for migration_name, _ in migration_files: + conn.execute( + "INSERT INTO schema_migrations (migration_name) VALUES (?)", + (migration_name,) + ) + conn.commit() + logger.info( + f"Fresh database detected: marked {len(migration_files)} " + f"migrations as applied (schema already current)" + ) + return + else: + logger.info("Fresh database with partial schema: applying needed migrations") + + # Get already-applied migrations + applied = get_applied_migrations(conn) + + # Apply pending migrations (original logic preserved) + pending_count = 0 + skipped_count = 0 + for migration_name, migration_path in migration_files: + if migration_name not in applied: + # Check if migration is actually needed + should_check_needed = ( + migration_count == 0 or + migration_name == "002_secure_tokens_and_authorization_codes.sql" + ) + + if should_check_needed and not is_migration_needed(conn, migration_name): + # Special handling for migration 002: if tables exist but indexes don't + if migration_name == "002_secure_tokens_and_authorization_codes.sql": + # Check if we need to create indexes + indexes_to_create = [] + if not index_exists(conn, 'idx_tokens_hash'): + indexes_to_create.append("CREATE INDEX idx_tokens_hash ON tokens(token_hash)") + if not index_exists(conn, 'idx_tokens_me'): + indexes_to_create.append("CREATE INDEX idx_tokens_me ON tokens(me)") + if not index_exists(conn, 'idx_tokens_expires'): + indexes_to_create.append("CREATE INDEX idx_tokens_expires ON tokens(expires_at)") + if not index_exists(conn, 'idx_auth_codes_hash'): + indexes_to_create.append("CREATE INDEX idx_auth_codes_hash ON authorization_codes(code_hash)") + if not index_exists(conn, 'idx_auth_codes_expires'): + indexes_to_create.append("CREATE INDEX idx_auth_codes_expires ON authorization_codes(expires_at)") + + if indexes_to_create: + for index_sql in indexes_to_create: + conn.execute(index_sql) + logger.info(f"Created {len(indexes_to_create)} missing indexes from migration 002") + + # Mark as applied without executing full migration + conn.execute( + "INSERT INTO schema_migrations (migration_name) VALUES (?)", + (migration_name,) + ) + skipped_count += 1 + logger.debug(f"Skipped migration {migration_name} (already in SCHEMA_SQL)") + else: + # Apply the migration (within our transaction) + try: + # Read migration SQL + migration_sql = migration_path.read_text() + + logger.debug(f"Applying migration: {migration_name}") + + # Execute migration (already in transaction) + conn.executescript(migration_sql) + + # Record migration as applied + conn.execute( + "INSERT INTO schema_migrations 
(migration_name) VALUES (?)", + (migration_name,) + ) + + logger.info(f"Applied migration: {migration_name}") + pending_count += 1 + + except Exception as e: + # Roll back the transaction + raise MigrationError(f"Migration {migration_name} failed: {e}") + + # Commit all migrations atomically + conn.commit() + + # Summary + total_count = len(migration_files) + if pending_count > 0 or skipped_count > 0: + if skipped_count > 0: + logger.info( + f"Migrations complete: {pending_count} applied, {skipped_count} skipped " + f"(already in SCHEMA_SQL), {total_count} total" + ) + else: + logger.info( + f"Migrations complete: {pending_count} applied, " + f"{total_count} total" + ) + else: + logger.info(f"All migrations up to date ({total_count} total)") + + return # Success! + + except MigrationError: + conn.rollback() + raise + + except Exception as e: + conn.rollback() + raise MigrationError(f"Migration system error: {e}") + + except sqlite3.OperationalError as e: + if "database is locked" in str(e).lower(): + # Another worker has the lock, retry with exponential backoff + retry_count += 1 + + if retry_count < max_retries: + # Exponential backoff with jitter + delay = base_delay * (2 ** retry_count) + random.uniform(0, 0.1) + logger.debug( + f"Database locked by another worker, retry {retry_count}/{max_retries} " + f"in {delay:.2f}s" + ) + time.sleep(delay) + continue + else: + raise MigrationError( + f"Failed to acquire migration lock after {max_retries} attempts. " + f"This may indicate a hung migration process." + ) + else: + # Non-lock related database error + error_msg = f"Database error during migration: {e}" + logger.error(error_msg) + raise MigrationError(error_msg) + + except Exception as e: + # Unexpected error + error_msg = f"Unexpected error during migration: {e}" + logger.error(error_msg) + raise MigrationError(error_msg) + + finally: + if conn: + try: + conn.close() + except: + pass # Ignore errors during cleanup + + # Should never reach here, but just in case + raise MigrationError("Migration retry loop exited unexpectedly") +``` + +### Step 3: Testing the Fix + +Create a test script to verify the fix works: + +```python +#!/usr/bin/env python3 +"""Test migration race condition fix""" + +import multiprocessing +import time +import sys +from pathlib import Path + +# Add project to path +sys.path.insert(0, str(Path(__file__).parent)) + +def worker_init(worker_id): + """Simulate a gunicorn worker starting""" + print(f"Worker {worker_id}: Starting...") + + try: + from starpunk import create_app + app = create_app() + print(f"Worker {worker_id}: Successfully initialized") + return True + except Exception as e: + print(f"Worker {worker_id}: FAILED - {e}") + return False + +if __name__ == "__main__": + # Test with 10 workers (more than production to stress test) + num_workers = 10 + + print(f"Starting {num_workers} workers simultaneously...") + + with multiprocessing.Pool(num_workers) as pool: + results = pool.map(worker_init, range(num_workers)) + + success_count = sum(results) + print(f"\nResults: {success_count}/{num_workers} workers succeeded") + + if success_count == num_workers: + print("SUCCESS: All workers initialized without race condition") + sys.exit(0) + else: + print("FAILURE: Race condition still present") + sys.exit(1) +``` + +## Verification Steps + +1. **Local Testing**: + ```bash + # Test with multiple workers + gunicorn --workers 4 --bind 0.0.0.0:8000 app:app + + # Check logs for retry messages + # Should see "Database locked by another worker, retry..." 
messages + ``` + +2. **Container Testing**: + ```bash + # Build container + podman build -t starpunk:test -f Containerfile . + + # Run with fresh database + podman run --rm -p 8000:8000 -v ./test-data:/data starpunk:test + + # Should start cleanly without restarts + ``` + +3. **Log Verification**: + Look for these patterns: + - One worker: "Applied migration: XXX" + - Other workers: "Database locked by another worker, retry..." + - Final: "All migrations already applied by another worker" + +## Risk Assessment + +### Risk Level: LOW + +The fix is safe because: +1. Uses SQLite's native transaction mechanism +2. Preserves all existing migration logic +3. Only adds retry wrapper around existing code +4. Fails safely with clear error messages +5. No data loss possible (transactions ensure atomicity) + +### Rollback Plan + +If issues occur: +1. Revert to previous version +2. Start container with single worker temporarily: `--workers 1` +3. Once migrations apply, scale back to 4 workers + +## Release Strategy + +### Option 1: Hotfix (Recommended) +- Release as v1.0.0-rc.3.1 +- Immediate deployment to fix production issue +- Minimal testing required (focused fix) + +### Option 2: Include in rc.4 +- Bundle with other rc.4 changes +- More testing time +- Risk: Production remains broken until rc.4 + +**Recommendation**: Deploy as hotfix v1.0.0-rc.3.1 immediately. + +## Alternative Workarounds (If Needed Urgently) + +While the proper fix is implemented, these temporary workarounds can be used: + +### Workaround 1: Single Worker Startup +```bash +# In Containerfile, temporarily change: +CMD ["gunicorn", "--workers", "1", ...] + +# After first successful start, rebuild with 4 workers +``` + +### Workaround 2: Pre-migration Script +```bash +# Add entrypoint script that runs migrations before gunicorn +#!/bin/bash +python3 -c "from starpunk.database import init_db; init_db()" +exec gunicorn --workers 4 ... +``` + +### Workaround 3: Delayed Worker Startup +```bash +# Stagger worker startup with --preload +gunicorn --preload --workers 4 ... +``` + +## Summary + +- **Problem**: Race condition when multiple workers apply migrations +- **Solution**: Database-level locking with retry logic +- **Implementation**: ~150 lines of code changes in migrations.py +- **Testing**: Verify with multi-worker startup +- **Risk**: LOW - Safe, atomic changes +- **Urgency**: HIGH - Blocks production deployment +- **Recommendation**: Deploy as hotfix v1.0.0-rc.3.1 immediately + +## Developer Questions Answered + +All 23 architectural questions have been comprehensively answered in: +`/home/phil/Projects/starpunk/docs/architecture/migration-race-condition-answers.md` + +**Key Decisions:** +- NEW connection per retry (not reused) +- BEGIN IMMEDIATE is correct (not EXCLUSIVE) +- Separate transactions for each operation +- Both multiprocessing.Pool AND gunicorn testing needed +- 30s timeout per attempt, 120s total maximum +- Graduated logging levels based on retry count + +**Implementation Status: READY TO PROCEED** \ No newline at end of file
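+
+## Appendix: Self-Contained Concurrency Test Sketch
+
+The unit-test sketches above pass a closure and a `Barrier` into `multiprocessing.Pool`, which cannot be pickled across pool workers; a runnable version needs module-level functions and `Process` objects. The sketch below follows that shape and is illustrative only: the database path is an assumption, and it assumes the `starpunk` package (with its `run_migrations`) is importable from the test environment.
+
+```python
+#!/usr/bin/env python3
+"""Illustrative sketch: N simulated workers migrate one fresh database at once."""
+import multiprocessing
+import sys
+
+
+def _worker(barrier, db_path, results, index):
+    """Simulate one gunicorn worker running migrations at startup."""
+    barrier.wait()  # release every worker at the same moment
+    try:
+        from starpunk.migrations import run_migrations
+        run_migrations(db_path)
+        results[index] = True
+    except Exception as exc:
+        print(f"worker {index} failed: {exc}", file=sys.stderr)
+        results[index] = False
+
+
+def main(db_path="test-race.db", workers=4):
+    # Barrier and Array are passed explicitly so the sketch also works with
+    # the spawn start method (macOS/Windows), not only fork.
+    barrier = multiprocessing.Barrier(workers)
+    results = multiprocessing.Array("b", [0] * workers)  # shared success flags
+    procs = [
+        multiprocessing.Process(target=_worker, args=(barrier, db_path, results, i))
+        for i in range(workers)
+    ]
+    for p in procs:
+        p.start()
+    for p in procs:
+        p.join()
+    return all(results)
+
+
+if __name__ == "__main__":
+    sys.exit(0 if main() else 1)
+```
+
+If the fix works as intended, exactly one worker logs "Applied migration: ..." while the others log the retry and already-applied messages described in the verification section.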