# StarPunk/starpunk/feeds/cache.py

"""
Feed caching layer with LRU eviction and TTL expiration.

Implements efficient feed caching to reduce database queries and feed generation
overhead. Uses SHA-256 checksums for cache keys and supports ETag generation
for HTTP conditional requests.

Philosophy: Simple, memory-efficient caching that reduces database load.
"""

import hashlib
import threading
import time
from collections import OrderedDict
from typing import Optional, Dict, Tuple


class FeedCache:
    """
    LRU cache with TTL (Time To Live) for feed content.

    Features:
    - LRU eviction when max_size is reached
    - TTL-based expiration (default 5 minutes)
    - SHA-256 checksums for ETags
    - Thread-safe operations (cache state and counters are guarded by a
      re-entrant lock)
    - Hit/miss statistics tracking

    Cache Key Format:
        feed:{format}:{checksum}

    Example:
        cache = FeedCache(max_size=50, ttl=300)

        # Store feed content; the generated ETag is returned
        etag = cache.set('rss', content, notes_checksum)

        # Retrieve feed content (None on a miss or after expiry)
        cached = cache.get('rss', notes_checksum)
        if cached is not None:
            cached_content, etag = cached

        # Track cache statistics
        stats = cache.get_stats()
    """

    def __init__(self, max_size: int = 50, ttl: int = 300):
        """
        Initialize feed cache.

        Args:
            max_size: Maximum number of cached feeds (default: 50).
                A value of zero or less disables storage: set() still
                returns an ETag but keeps nothing.
            ttl: Time to live in seconds (default: 300 = 5 minutes)
        """
        self.max_size = max_size
        self.ttl = ttl

        # OrderedDict for LRU behavior: first key is least recently used.
        # Structure: {cache_key: (content, etag, timestamp)}
        self._cache: OrderedDict[str, Tuple[str, str, float]] = OrderedDict()

        # Guards _cache and the counters below. Re-entrant so that a locked
        # public method can safely call another locked method.
        self._lock = threading.RLock()

        # Statistics tracking
        self._hits = 0
        self._misses = 0
        self._evictions = 0

    def _generate_cache_key(self, format_name: str, checksum: str) -> str:
        """
        Generate cache key from format and content checksum.

        Args:
            format_name: Feed format (rss, atom, json)
            checksum: SHA-256 checksum of note content

        Returns:
            Cache key string in the form ``feed:{format}:{checksum}``
        """
        return f"feed:{format_name}:{checksum}"

    def _generate_etag(self, content: str) -> str:
        """
        Generate weak ETag from feed content using SHA-256.

        Uses weak ETags (W/"...") since feed content can have semantic
        equivalence even with different representations (e.g., timestamp
        formatting, whitespace variations).

        Args:
            content: Feed content (XML or JSON)

        Returns:
            Weak ETag in format: W/"sha256_hash"
        """
        content_hash = hashlib.sha256(content.encode('utf-8')).hexdigest()
        return f'W/"{content_hash}"'

    def _is_expired(self, timestamp: float) -> bool:
        """
        Check if cached entry has expired based on TTL.

        Args:
            timestamp: Unix timestamp (time.time()) when entry was cached

        Returns:
            True if strictly more than ``ttl`` seconds have elapsed,
            False otherwise
        """
        return (time.time() - timestamp) > self.ttl

    def _evict_lru(self) -> None:
        """
        Evict least recently used entry from cache.

        Called when cache is full and new entry needs to be added.
        The first key of the OrderedDict is the least recently used one
        because get() and set() move touched keys to the end.

        Callers are expected to hold ``self._lock``.
        """
        if self._cache:
            # Remove first (oldest/least recently used) entry
            self._cache.popitem(last=False)
            self._evictions += 1

    def get(self, format_name: str, notes_checksum: str) -> Optional[Tuple[str, str]]:
        """
        Retrieve cached feed content if valid and not expired.

        Args:
            format_name: Feed format (rss, atom, json)
            notes_checksum: SHA-256 checksum of note list content

        Returns:
            Tuple of (content, etag) if cache hit and valid, None otherwise

        Side Effects:
            - Moves accessed entry to end of OrderedDict (LRU update)
            - Increments hit or miss counter
            - Removes the entry if it has expired
        """
        cache_key = self._generate_cache_key(format_name, notes_checksum)

        with self._lock:
            entry = self._cache.get(cache_key)
            if entry is None:
                self._misses += 1
                return None

            content, etag, timestamp = entry

            if self._is_expired(timestamp):
                # Expired entries count as misses and are dropped eagerly
                del self._cache[cache_key]
                self._misses += 1
                return None

            # Move to end (mark as recently used)
            self._cache.move_to_end(cache_key)
            self._hits += 1

            return (content, etag)

    def set(self, format_name: str, content: str, notes_checksum: str) -> str:
        """
        Store feed content in cache with generated ETag.

        Args:
            format_name: Feed format (rss, atom, json)
            content: Generated feed content (XML or JSON)
            notes_checksum: SHA-256 checksum of note list content

        Returns:
            Generated ETag for the content

        Side Effects:
            - Evicts LRU entries until there is room for a new key
            - Adds new entry or updates existing entry (refreshing its
              timestamp and marking it most recently used)
            - Stores nothing when max_size is zero or negative
        """
        cache_key = self._generate_cache_key(format_name, notes_checksum)
        # Hashing touches no shared state, so do it outside the lock.
        etag = self._generate_etag(content)

        with self._lock:
            if self.max_size <= 0:
                # Caching disabled: still hand back a usable ETag.
                return etag

            if cache_key not in self._cache:
                # Loop (rather than evict once) so the size bound is restored
                # even if max_size was lowered after entries were stored.
                while len(self._cache) >= self.max_size:
                    self._evict_lru()

            self._cache[cache_key] = (content, etag, time.time())
            # Re-assigning an existing key keeps its old position, so
            # explicitly mark the entry as most recently used.
            self._cache.move_to_end(cache_key)

        return etag

    def invalidate(self, format_name: Optional[str] = None) -> int:
        """
        Invalidate cache entries.

        Args:
            format_name: If specified, only invalidate this format.
                        If None, invalidate all entries.

        Returns:
            Number of entries invalidated
        """
        with self._lock:
            if format_name is None:
                # Clear entire cache
                count = len(self._cache)
                self._cache.clear()
                return count

            # Collect first, then delete: the dict must not change size
            # while it is being iterated.
            prefix = f"feed:{format_name}:"
            keys_to_remove = [key for key in self._cache if key.startswith(prefix)]

            for key in keys_to_remove:
                del self._cache[key]

            return len(keys_to_remove)

    def get_stats(self) -> Dict[str, float]:
        """
        Get cache statistics.

        Returns:
            Dictionary with:
            - hits: Number of cache hits
            - misses: Number of cache misses
            - entries: Current number of cached entries
            - evictions: Number of LRU evictions
            - hit_rate: Cache hit rate (0.0 to 1.0; 0.0 before any lookup)
        """
        # Snapshot under the lock so the figures are mutually consistent.
        with self._lock:
            hits = self._hits
            misses = self._misses
            entries = len(self._cache)
            evictions = self._evictions

        total_requests = hits + misses
        hit_rate = hits / total_requests if total_requests > 0 else 0.0

        return {
            'hits': hits,
            'misses': misses,
            'entries': entries,
            'evictions': evictions,
            'hit_rate': hit_rate,
        }

    def generate_notes_checksum(self, notes: list) -> str:
        """
        Generate SHA-256 checksum from note list.

        Creates a stable checksum based on note IDs and updated timestamps.
        This checksum changes when notes are added, removed, reordered,
        or modified.

        Args:
            notes: List of note objects exposing ``id`` and an
                ``updated_at`` value with an ``isoformat()`` method

        Returns:
            SHA-256 hex digest of the joined ``id:updated_at`` pairs
        """
        # ID plus updated timestamp identifies the state of each note;
        # entries are joined with "|" in list order before hashing.
        combined = "|".join(
            f"{note.id}:{note.updated_at.isoformat()}" for note in notes
        )
        return hashlib.sha256(combined.encode('utf-8')).hexdigest()


# Global cache instance (singleton pattern)
# Starts as None; created lazily with default settings on the first
# get_cache() call, or replaced explicitly by configure_cache().
_global_cache: Optional[FeedCache] = None


def get_cache() -> FeedCache:
    """
    Return the module-wide feed cache, creating it on demand.

    If no instance exists yet, one is built with FeedCache's default
    settings and remembered for subsequent calls. Use configure_cache()
    to install an instance with custom settings instead.

    Returns:
        The shared FeedCache instance
    """
    global _global_cache

    # Fast path: the singleton already exists.
    if _global_cache is not None:
        return _global_cache

    # First access: build the default instance and keep it.
    _global_cache = FeedCache()
    return _global_cache


def configure_cache(max_size: int, ttl: int) -> None:
    """
    Install a freshly configured global feed cache.

    Intended to be called during app initialization. Any previously
    created global instance is replaced by a new, empty FeedCache.

    Args:
        max_size: Maximum number of cached feeds
        ttl: Time to live in seconds
    """
    global _global_cache

    # Build the replacement first, then rebind the module-level singleton.
    fresh_cache = FeedCache(max_size=max_size, ttl=ttl)
    _global_cache = fresh_cache