feat(media): Make variant generation atomic with database

Per v1.5.0 Phase 4: - Generate variants to temp directory first - Perform database inserts in transaction - Move files to final location before commit - Clean up temp files on any failure - Add startup recovery for orphaned temp files - All media operations now fully atomic Changes: - Modified generate_all_variants() to return file moves - Modified save_media() to handle full atomic operation - Add cleanup_orphaned_temp_files() for startup recovery - Added 4 new tests for atomic behavior - Fixed HEIC variant format detection - Updated variant failure test for atomic behavior Fixes: - No orphaned files on database failures - No orphaned DB records on file failures - Startup recovery detects and cleans orphans 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-17 11:26:26 -07:00
parent b689e02e64
commit 21fa7acfbb
5 changed files with 951 additions and 87 deletions
--- a/starpunk/media.py
+++ b/starpunk/media.py
@@ -21,6 +21,7 @@ from pathlib import Path
 from datetime import datetime, timedelta
 import uuid
 import io
+import shutil
 from typing import Optional, List, Dict, Tuple
 from flask import current_app

@@ -316,7 +317,8 @@ def generate_variant(
    variant_type: str,
    base_path: Path,
    base_filename: str,
-    file_ext: str
+    file_ext: str,
+    relative_path: str = None
 ) -> Dict:
    """
    Generate a single image variant
@@ -327,6 +329,7 @@ def generate_variant(
        base_path: Directory to save to
        base_filename: Base filename (UUID without extension)
        file_ext: File extension (e.g., '.jpg')
+        relative_path: Relative path for metadata (if None, calculated from base_path)

    Returns:
        Dict with variant metadata (path, width, height, size_bytes)
@@ -359,19 +362,42 @@ def generate_variant(

    # Save with appropriate quality
    save_kwargs = {'optimize': True}
-    if work_img.format in ['JPEG', 'JPG', None]:
-        save_kwargs['quality'] = 85

-    # Determine format from extension
-    save_format = 'JPEG' if file_ext.lower() in ['.jpg', '.jpeg'] else file_ext[1:].upper()
+    # Determine format - prefer image's actual format over extension
+    # This handles cases like HEIC -> JPEG conversion where extension doesn't match format
+    if work_img.format and work_img.format in ['JPEG', 'PNG', 'GIF', 'WEBP']:
+        save_format = work_img.format
+        if save_format in ['JPEG', 'JPG']:
+            save_kwargs['quality'] = 85
+    else:
+        # Fallback to extension-based detection
+        if file_ext.lower() in ['.jpg', '.jpeg', '.heic']:
+            save_format = 'JPEG'
+            save_kwargs['quality'] = 85
+        elif file_ext.lower() == '.png':
+            save_format = 'PNG'
+        elif file_ext.lower() == '.gif':
+            save_format = 'GIF'
+        elif file_ext.lower() == '.webp':
+            save_format = 'WEBP'
+            save_kwargs['quality'] = 85
+        else:
+            save_format = 'JPEG'  # Default fallback
+            save_kwargs['quality'] = 85
+
    work_img.save(variant_path, format=save_format, **save_kwargs)

+    # Use provided relative path or calculate it
+    if relative_path is None:
+        relative_path = str(variant_path.relative_to(base_path.parent.parent))  # Relative to media root
+
    return {
        'variant_type': variant_type,
-        'path': str(variant_path.relative_to(base_path.parent.parent)),  # Relative to media root
+        'path': relative_path,
        'width': work_img.width,
        'height': work_img.height,
-        'size_bytes': variant_path.stat().st_size
+        'size_bytes': variant_path.stat().st_size,
+        'temp_file': variant_path  # Include temp file path for atomic operation
    }


@@ -383,32 +409,53 @@ def generate_all_variants(
    media_id: int,
    year: str,
    month: str,
-    optimized_bytes: bytes
-) -> List[Dict]:
+    optimized_bytes: bytes,
+    db = None
+) -> Tuple[List[Dict], List[Tuple[Path, Path]]]:
    """
-    Generate all variants for an image and store in database
+    Generate all variants for an image and prepare database records
+
+    Per v1.5.0 Phase 4: Atomic variant generation
+    - Generate variants to temp directory first
+    - Return database insert data and file move operations
+    - Caller handles transaction commit and file moves
+    - This ensures true atomicity

    Args:
        img: Source PIL Image (the optimized original)
-        base_path: Directory containing the original
+        base_path: Directory containing the original (final destination)
        base_filename: Base filename (UUID without extension)
        file_ext: File extension
        media_id: ID of parent media record
        year: Year string (e.g., '2025') for path calculation
        month: Month string (e.g., '01') for path calculation
        optimized_bytes: Bytes of optimized original (avoids re-reading file)
+        db: Database connection (optional, for transaction control)

    Returns:
-        List of variant metadata dicts
+        Tuple of (variant_metadata_list, file_moves_list)
+        - variant_metadata_list: List of dicts ready for database insert
+        - file_moves_list: List of (src_path, dst_path) tuples for file moves
    """
    from starpunk.database import get_db

+    if db is None:
+        db = get_db(current_app)
+
    variants = []
-    db = get_db(current_app)
-    created_files = []  # Track files for cleanup on failure
+    file_moves = []
+
+    # Create temp directory for atomic operation
+    media_dir = Path(current_app.config.get('DATA_PATH', 'data')) / 'media'
+    temp_dir = media_dir / '.tmp'
+    temp_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create unique temp subdirectory for this operation
+    temp_subdir = temp_dir / f"{base_filename}_{uuid.uuid4().hex[:8]}"
+    temp_subdir.mkdir(parents=True, exist_ok=True)

    try:
-        # Generate each variant type
+        # Step 1: Generate all variants to temp directory
        for variant_type in ['thumb', 'small', 'medium', 'large']:
            # Skip if image is smaller than target
            spec = VARIANT_SPECS[variant_type]
@@ -417,45 +464,59 @@ def generate_all_variants(
            if img.width < target_width and variant_type != 'thumb':
                continue  # Skip variants larger than original

-            variant = generate_variant(img, variant_type, base_path, base_filename, file_ext)
-            variants.append(variant)
-            created_files.append(base_path / f"{base_filename}_{variant_type}{file_ext}")
+            # Calculate final relative path (for database)
+            final_relative_path = f"{year}/{month}/{base_filename}_{variant_type}{file_ext}"

-            # Insert into database
-            db.execute(
-                """
-                INSERT INTO media_variants
-                (media_id, variant_type, path, width, height, size_bytes)
-                VALUES (?, ?, ?, ?, ?, ?)
-                """,
-                (media_id, variant['variant_type'], variant['path'],
-                 variant['width'], variant['height'], variant['size_bytes'])
+            # Generate variant to temp directory
+            variant = generate_variant(
+                img,
+                variant_type,
+                temp_subdir,  # Write to temp
+                base_filename,
+                file_ext,
+                final_relative_path  # Store final path in metadata
            )

-        # Also record the original as 'original' variant
-        # Use explicit year/month for path calculation (avoids fragile parent traversal)
-        original_path = f"{year}/{month}/{base_filename}{file_ext}"
-        db.execute(
-            """
-            INSERT INTO media_variants
-            (media_id, variant_type, path, width, height, size_bytes)
-            VALUES (?, ?, ?, ?, ?, ?)
-            """,
-            (media_id, 'original', original_path, img.width, img.height,
-             len(optimized_bytes))  # Use passed bytes instead of file I/O
-        )
+            # Prepare database metadata (without temp_file key)
+            variant_metadata = {
+                'variant_type': variant['variant_type'],
+                'path': variant['path'],
+                'width': variant['width'],
+                'height': variant['height'],
+                'size_bytes': variant['size_bytes']
+            }
+            variants.append(variant_metadata)

-        db.commit()
-        return variants
+            # Track file move operation
+            temp_file = variant['temp_file']
+            final_path = base_path / temp_file.name
+            file_moves.append((temp_file, final_path, temp_subdir))
+
+        # Also prepare original variant metadata
+        original_path = f"{year}/{month}/{base_filename}{file_ext}"
+        variants.append({
+            'variant_type': 'original',
+            'path': original_path,
+            'width': img.width,
+            'height': img.height,
+            'size_bytes': len(optimized_bytes)
+        })
+
+        return variants, file_moves

    except Exception as e:
-        # Clean up any created variant files on failure
-        for file_path in created_files:
-            try:
-                if file_path.exists():
-                    file_path.unlink()
-            except OSError:
-                pass  # Best effort cleanup
+        # Clean up temp files on failure
+        try:
+            if temp_subdir.exists():
+                for file in temp_subdir.glob('*'):
+                    try:
+                        file.unlink()
+                    except OSError:
+                        pass
+                temp_subdir.rmdir()
+        except OSError:
+            pass  # Best effort
+
        raise  # Re-raise the original exception


@@ -526,46 +587,147 @@ def save_media(file_data: bytes, filename: str) -> Dict:
        full_dir = media_dir / year / month
        full_dir.mkdir(parents=True, exist_ok=True)

-        # Save optimized image (using bytes from optimize_image to avoid re-encoding)
-        full_path = full_dir / stored_filename
-        full_path.write_bytes(optimized_bytes)
-
        # Get actual file size (from optimized bytes)
        actual_size = len(optimized_bytes)

-        # Insert into database
-        db = get_db(current_app)
-        cursor = db.execute(
-            """
-            INSERT INTO media (filename, stored_filename, path, mime_type, size, width, height)
-            VALUES (?, ?, ?, ?, ?, ?, ?)
-            """,
-            (filename, stored_filename, relative_path, mime_type, actual_size, width, height)
-        )
-        db.commit()
-        media_id = cursor.lastrowid
-
-        # Generate variants (synchronous) - v1.4.0 Phase 2
-        # Pass year, month, and optimized_bytes to avoid fragile path traversal and file I/O
+        # Per v1.5.0 Phase 4: Atomic operation for all file saves and database inserts
+        # Generate variants first (to temp directory)
        base_filename = stored_filename.rsplit('.', 1)[0]
-        variants = []
+
+        db = get_db(current_app)
+        variant_metadata = []
+        file_moves = []
+        temp_original_path = None
+        temp_subdir = None
+
        try:
-            variants = generate_all_variants(
+            # Step 1: Save original to temp directory
+            media_dir = Path(current_app.config.get('DATA_PATH', 'data')) / 'media'
+            temp_dir = media_dir / '.tmp'
+            temp_dir.mkdir(parents=True, exist_ok=True)
+            temp_subdir = temp_dir / f"{base_filename}_{uuid.uuid4().hex[:8]}"
+            temp_subdir.mkdir(parents=True, exist_ok=True)
+
+            temp_original_path = temp_subdir / stored_filename
+            temp_original_path.write_bytes(optimized_bytes)
+
+            # Step 2: Generate variants to temp directory
+            variant_metadata, file_moves = generate_all_variants(
                optimized_img,
                full_dir,
                base_filename,
                file_ext,
-                media_id,
+                0,  # media_id not yet known
                year,
                month,
-                optimized_bytes
+                optimized_bytes,
+                db
            )
+
+            # Step 3: Begin transaction
+            db.execute("BEGIN TRANSACTION")
+
+            # Step 4: Insert media record
+            cursor = db.execute(
+                """
+                INSERT INTO media (filename, stored_filename, path, mime_type, size, width, height)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """,
+                (filename, stored_filename, relative_path, mime_type, actual_size, width, height)
+            )
+            media_id = cursor.lastrowid
+
+            # Step 5: Insert variant records
+            for variant in variant_metadata:
+                db.execute(
+                    """
+                    INSERT INTO media_variants
+                    (media_id, variant_type, path, width, height, size_bytes)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                    """,
+                    (media_id, variant['variant_type'], variant['path'],
+                     variant['width'], variant['height'], variant['size_bytes'])
+                )
+
+            # Step 6: Move files to final location (before commit for true atomicity)
+            # If file moves fail, we can rollback the transaction
+            try:
+                # Move original file
+                full_path = full_dir / stored_filename
+                shutil.move(str(temp_original_path), str(full_path))
+
+                # Move variant files
+                for temp_file, final_path, _ in file_moves:
+                    shutil.move(str(temp_file), str(final_path))
+
+            except Exception as e:
+                # Rollback database if file move fails
+                db.rollback()
+                raise
+
+            # Step 7: Commit transaction (after files are moved successfully)
+            db.commit()
+
+            # Step 8: Clean up temp directory
+            try:
+                if temp_subdir and temp_subdir.exists():
+                    temp_subdir.rmdir()
+            except OSError:
+                pass  # Best effort
+
+            # Format variants for return value (same format as before)
+            variants = [v for v in variant_metadata if v['variant_type'] != 'original']
+
        except Exception as e:
+            # Rollback database on any failure (best effort)
+            try:
+                db.rollback()
+            except Exception:
+                pass  # May already be rolled back
+
+            # Clean up moved files if commit failed
+            # (This handles the case where files were moved but commit failed)
+            full_path = full_dir / stored_filename
+            if full_path.exists():
+                try:
+                    full_path.unlink()
+                except OSError:
+                    pass
+
+            for _, final_path, _ in file_moves:
+                try:
+                    if final_path.exists():
+                        final_path.unlink()
+                except OSError:
+                    pass
+
+            # Clean up temp files on any failure
+            if temp_original_path and temp_original_path.exists():
+                try:
+                    temp_original_path.unlink()
+                except OSError:
+                    pass
+
+            for temp_file, _, _ in file_moves:
+                try:
+                    if temp_file.exists():
+                        temp_file.unlink()
+                except OSError:
+                    pass
+
+            # Clean up temp subdirectory
+            if temp_subdir and temp_subdir.exists():
+                try:
+                    temp_subdir.rmdir()
+                except OSError:
+                    pass
+
+            # Log and re-raise
            current_app.logger.warning(
-                f'Media upload variant generation failed: filename="{filename}", '
-                f'media_id={media_id}, error="{e}"'
+                f'Media upload atomic operation failed: filename="{filename}", '
+                f'error="{e}"'
            )
-            # Continue - original image is still usable
+            raise

        # Log success
        was_optimized = len(optimized_bytes) < file_size
@@ -981,3 +1143,74 @@ def cleanup_old_debug_files(app) -> None:
            f"Debug file cleanup: deleted {deleted_count} file(s), "
            f"freed {deleted_size / 1024 / 1024:.2f} MB"
        )
+
+
+def cleanup_orphaned_temp_files(app) -> None:
+    """
+    Clean up orphaned temporary variant files on startup
+
+    Per v1.5.0 Phase 4:
+    - Detect temp files left from failed operations
+    - Log warnings for orphaned files
+    - Clean up temp directory
+    - Called on application startup
+
+    Args:
+        app: Flask application instance (for config and logger)
+    """
+    media_dir = Path(app.config.get('DATA_PATH', 'data')) / 'media'
+    temp_dir = media_dir / '.tmp'
+
+    # Check if temp directory exists
+    if not temp_dir.exists():
+        return
+
+    # Find all subdirectories and files in temp directory
+    orphaned_count = 0
+    cleaned_size = 0
+
+    # Iterate through temp subdirectories
+    for temp_subdir in temp_dir.iterdir():
+        if not temp_subdir.is_dir():
+            # Clean up any loose files (shouldn't normally exist)
+            try:
+                size = temp_subdir.stat().st_size
+                temp_subdir.unlink()
+                orphaned_count += 1
+                cleaned_size += size
+                app.logger.warning(f"Cleaned up orphaned temp file: {temp_subdir.name}")
+            except OSError as e:
+                app.logger.warning(f"Failed to delete orphaned temp file {temp_subdir.name}: {e}")
+            continue
+
+        # Process subdirectory
+        files_in_subdir = list(temp_subdir.glob('*'))
+        if files_in_subdir:
+            # Log orphaned operation
+            app.logger.warning(
+                f"Found orphaned temp directory from failed operation: {temp_subdir.name} "
+                f"({len(files_in_subdir)} file(s))"
+            )
+
+            # Clean up files
+            for file_path in files_in_subdir:
+                try:
+                    if file_path.is_file():
+                        size = file_path.stat().st_size
+                        file_path.unlink()
+                        orphaned_count += 1
+                        cleaned_size += size
+                except OSError as e:
+                    app.logger.warning(f"Failed to delete orphaned temp file {file_path}: {e}")
+
+        # Remove empty subdirectory
+        try:
+            temp_subdir.rmdir()
+        except OSError as e:
+            app.logger.warning(f"Failed to remove temp directory {temp_subdir.name}: {e}")
+
+    if orphaned_count > 0:
+        app.logger.info(
+            f"Temp file cleanup: removed {orphaned_count} orphaned file(s), "
+            f"freed {cleaned_size / 1024 / 1024:.2f} MB"
+        )