# StarPunk/starpunk/slug_utils.py

"""
Slug validation and sanitization utilities for StarPunk

This module provides functions for validating, sanitizing, and ensuring uniqueness
of note slugs. Supports custom slugs via Micropub's mp-slug property.

Per developer Q&A Q8:
- Unicode normalization for slug generation
- Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
- Log warnings with original text
- Never fail Micropub request
"""

import logging
import re
import unicodedata
from datetime import datetime, timezone
from typing import Optional, Set

logger = logging.getLogger(__name__)

# Reserved slugs that cannot be used for notes
# These correspond to application routes and special pages
# Lookups against this set are case-insensitive (see is_reserved_slug).
RESERVED_SLUGS = frozenset([
    # Core routes
    'api',
    'admin',
    'auth',
    'feed',
    'static',
    'notes',

    # Auth/admin routes
    'login',
    'logout',
    'settings',
    'micropub',
    'callback',

    # Feed routes
    # NOTE: sanitize_slug() replaces '.' with '-', so 'feed.xml' can only
    # match when a raw (unsanitized) slug is checked.
    'feed.xml',
    'rss',
    'atom',

    # Special pages
    'index',
    'home',
    'about',
    'search',
])

# Slug validation regex
# Allows: lowercase letters, numbers, hyphens
# Must start with letter or number
# Must end with letter or number
# Cannot have consecutive hyphens: a hyphen is only accepted as a single
# separator between two alphanumeric runs.
# Anchored with \Z (not $) so that a trailing newline is rejected as well.
SLUG_PATTERN = re.compile(r'^[a-z0-9]+(?:-[a-z0-9]+)*\Z')

# Maximum slug length (in characters)
MAX_SLUG_LENGTH = 200


def is_reserved_slug(slug: str) -> bool:
    """
    Tell whether a slug collides with a reserved application name

    The comparison is case-insensitive: the slug is lowercased before it is
    looked up in RESERVED_SLUGS.

    Args:
        slug: Slug to check

    Returns:
        bool: True if the lowercased slug is in RESERVED_SLUGS
    """
    lowered = slug.lower()
    return lowered in RESERVED_SLUGS


def sanitize_slug(slug: str, allow_timestamp_fallback: bool = False) -> str:
    """
    Sanitize a custom slug with Unicode normalization

    Per developer Q&A Q8:
    - Unicode normalization (NFKD) for international characters
    - Timestamp-based fallback (YYYYMMDD-HHMMSS, UTC) when normalization
      leaves nothing usable
    - Log warnings with original text
    - Never fail (always returns a valid slug when the fallback is enabled)

    Converts to lowercase, replaces every run of characters outside
    a-z / 0-9 with a single hyphen, strips leading/trailing hyphens, and
    trims to MAX_SLUG_LENGTH.

    Args:
        slug: Raw slug input
        allow_timestamp_fallback: If True, use timestamp fallback for empty slugs

    Returns:
        Sanitized slug string. May be empty when allow_timestamp_fallback is
        False and no ASCII letters or digits survive normalization; never
        empty if allow_timestamp_fallback=True.

    Examples:
        >>> sanitize_slug("Hello World!")
        'hello-world'

        >>> sanitize_slug("My--Post___Title")
        'my-post-title'

        >>> sanitize_slug("  leading-spaces  ")
        'leading-spaces'

        >>> sanitize_slug("Café")
        'cafe'

        >>> sanitize_slug("日本語", allow_timestamp_fallback=True)
        # Returns timestamp-based slug like '20231125-143022'

        >>> sanitize_slug("😀🎉✨", allow_timestamp_fallback=True)
        # Returns timestamp-based slug
    """
    original_slug = slug

    # NFKD decomposes characters, e.g. "é" becomes "e" + combining accent.
    slug = unicodedata.normalize('NFKD', slug)

    # Dropping everything non-ASCII removes the combining marks (leaving the
    # base letter) as well as any character with no ASCII decomposition.
    slug = slug.encode('ascii', 'ignore').decode('ascii')

    slug = slug.lower()

    # One pass does both jobs: any run of characters outside a-z/0-9
    # (hyphens included) collapses into a single hyphen, so invalid
    # characters are replaced and consecutive hyphens cannot remain.
    slug = re.sub(r'[^a-z0-9]+', '-', slug)

    # Trim leading/trailing hyphens
    slug = slug.strip('-')

    # Check if normalization resulted in empty slug
    if not slug and allow_timestamp_fallback:
        # Per Q8: Use timestamp-based fallback.
        # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
        # datetime formats to the same string.
        slug = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')
        # %r escapes control characters in the (untrusted) original input so
        # it cannot forge extra log lines; formatting is deferred to logging.
        logger.warning(
            "Slug normalization failed for input %r "
            "(all characters removed during normalization). "
            "Using timestamp fallback: %s",
            original_slug,
            slug,
        )

    # Trim to max length; the cut may expose a hyphen at the end, so strip it.
    if len(slug) > MAX_SLUG_LENGTH:
        slug = slug[:MAX_SLUG_LENGTH].rstrip('-')

    return slug


def validate_slug(slug: str) -> bool:
    """
    Validate slug format

    Checks if slug matches required pattern:
    - Only lowercase letters, numbers, hyphens
    - Starts with letter or number
    - Ends with letter or number
    - No consecutive hyphens
    - Not empty
    - Not too long (at most MAX_SLUG_LENGTH characters)

    Args:
        slug: Slug to validate

    Returns:
        bool: True if valid, False otherwise

    Examples:
        >>> validate_slug("my-post")
        True

        >>> validate_slug("my--post")  # consecutive hyphens
        False

        >>> validate_slug("-my-post")  # starts with hyphen
        False

        >>> validate_slug("My-Post")  # uppercase
        False
    """
    if not slug or len(slug) > MAX_SLUG_LENGTH:
        return False

    # Enforce the "no consecutive hyphens" rule explicitly so the documented
    # contract holds regardless of how permissive SLUG_PATTERN is.
    if '--' in slug:
        return False

    # fullmatch (rather than match) requires the whole string to conform, so
    # input such as "my-post\n" is rejected even if the pattern ends in "$".
    return SLUG_PATTERN.fullmatch(slug) is not None


def make_slug_unique_with_suffix(base_slug: str, existing_slugs: Set[str], max_attempts: int = 99) -> str:
    """
    Return a slug that is not already taken, appending "-N" when required

    When base_slug is free it is returned untouched. Otherwise the suffixes
    -2, -3, ... are probed in ascending order (deterministic, not random)
    and the first unused candidate wins.

    Args:
        base_slug: Base slug to make unique
        existing_slugs: Set of existing slugs to check against
        max_attempts: How many suffixed candidates to probe (default: 99),
            i.e. suffixes 2 through max_attempts + 1

    Returns:
        base_slug itself, or base_slug with the lowest free numeric suffix

    Raises:
        ValueError: If every probed candidate is already taken

    Examples:
        >>> make_slug_unique_with_suffix("my-post", {"my-post"})
        'my-post-2'

        >>> make_slug_unique_with_suffix("my-post", {"my-post", "my-post-2"})
        'my-post-3'

        >>> make_slug_unique_with_suffix("my-post", set())
        'my-post'
    """
    if base_slug in existing_slugs:
        # Lazily generate "<base>-2", "<base>-3", ... and take the first free one.
        suffixed = (f"{base_slug}-{n}" for n in range(2, max_attempts + 2))
        free_slug = next(
            (option for option in suffixed if option not in existing_slugs),
            None,
        )
        if free_slug is None:
            # Every suffix in the allowed range is already in use.
            raise ValueError(
                f"Could not create unique slug after {max_attempts} attempts. "
                f"Base slug: {base_slug}"
            )
        return free_slug

    # Nothing to do: the base slug is not taken.
    return base_slug


def _timestamp_slug() -> str:
    """Return the current UTC time formatted as a YYYYMMDD-HHMMSS slug."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # datetime formats to the same string.
    return datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')


def validate_and_sanitize_custom_slug(custom_slug: str, existing_slugs: Set[str]) -> tuple[bool, Optional[str], Optional[str]]:
    """
    Validate and sanitize a custom slug from Micropub

    Per developer Q&A Q8:
    - Never fail Micropub request due to slug issues
    - Use timestamp fallback if normalization fails
    - Log warnings for debugging

    Performs full validation pipeline:
    1. Reject hierarchical paths (the only case that reports failure)
    2. Sanitize the input (with timestamp fallback)
    3. If the result is reserved, append "-note" instead of failing
    4. Validate format (timestamp fallback if invalid)
    5. Make unique if needed (unique timestamp fallback if that fails)

    Args:
        custom_slug: Raw custom slug from mp-slug property
        existing_slugs: Set of existing slugs

    Returns:
        Tuple of (success, slug_or_none, error_message_or_none).
        On success the error message is None; on failure the slug is None.

    Examples:
        >>> validate_and_sanitize_custom_slug("My Post", set())
        (True, 'my-post', None)

        >>> validate_and_sanitize_custom_slug("api", set())
        (True, 'api-note', None)

        >>> validate_and_sanitize_custom_slug("/invalid/slug", set())
        (False, None, 'Slug "/invalid/slug" contains hierarchical paths which are not supported')

        >>> validate_and_sanitize_custom_slug("😀🎉", set())
        # Returns (True, '20231125-143022', None) - timestamp fallback
    """
    # Check for hierarchical paths (not supported in v1.1.0)
    if '/' in custom_slug:
        return (
            False,
            None,
            f'Slug "{custom_slug}" contains hierarchical paths which are not supported'
        )

    # Sanitize with timestamp fallback enabled
    # Per Q8: Never fail Micropub request
    # An empty result is not expected here; if it ever occurs, the format
    # validation below rejects it and substitutes a timestamp.
    sanitized = sanitize_slug(custom_slug, allow_timestamp_fallback=True)

    # Check if reserved
    if is_reserved_slug(sanitized):
        # Per Q8: Never fail - make the slug non-reserved instead.
        # %r escapes control characters in the untrusted input.
        logger.warning(
            "Slug %r (from %r) is reserved. Appending '-note' suffix.",
            sanitized,
            custom_slug,
        )
        sanitized = f"{sanitized}-note"

    # Validate format
    if not validate_slug(sanitized):
        # This should rarely happen after sanitization
        # but if it does, use timestamp fallback
        fallback = _timestamp_slug()
        logger.warning(
            "Slug %r (from %r) failed validation. "
            "Using timestamp fallback: %s",
            sanitized,
            custom_slug,
            fallback,
        )
        sanitized = fallback

    # Make unique if needed
    try:
        return (True, make_slug_unique_with_suffix(sanitized, existing_slugs), None)
    except ValueError as exc:
        # Per Q8: Never fail Micropub request - fall back to a timestamp.
        # The timestamp itself may already be taken (e.g. two fallbacks in
        # the same second), so keep suffixing until it is free. This always
        # terminates because existing_slugs is finite.
        fallback = _timestamp_slug()
        unique_slug = fallback
        suffix = 2
        while unique_slug in existing_slugs:
            unique_slug = f"{fallback}-{suffix}"
            suffix += 1
        logger.error(
            "Could not create unique slug from %r. "
            "Using timestamp: %s. Error: %s",
            custom_slug,
            unique_slug,
            exc,
        )
        return (True, unique_slug, None)