"""
Slug validation and sanitization utilities for StarPunk

This module provides functions for validating, sanitizing, and ensuring
uniqueness of note slugs. Supports custom slugs via Micropub's mp-slug
property.

Per developer Q&A Q8:
- Unicode normalization for slug generation
- Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
- Log warnings with original text
- Never fail Micropub request
"""

import re
import unicodedata
import logging
from datetime import datetime, timezone
from typing import Optional, Set

logger = logging.getLogger(__name__)

# Reserved slugs that cannot be used for notes
# These correspond to application routes and special pages
RESERVED_SLUGS = frozenset([
    # Core routes
    'api', 'admin', 'auth', 'feed', 'static', 'notes',
    # Auth/admin routes
    'login', 'logout', 'settings', 'micropub', 'callback',
    # Feed routes
    'feed.xml', 'rss', 'atom',
    # Special pages
    'index', 'home', 'about', 'search',
])

# Slug validation regex
# Allows: lowercase letters, numbers, hyphens
# Must start with letter or number
# Must end with letter or number
# Cannot have consecutive hyphens: the pattern is "alphanumeric runs joined
# by single hyphens". \Z (not $) so a trailing newline is rejected too.
SLUG_PATTERN = re.compile(r'^[a-z0-9]+(?:-[a-z0-9]+)*\Z')

# Any run of characters that may not appear in a slug (hyphens included, so
# that mixed runs such as "-_-" collapse to a single hyphen in one pass)
_NON_SLUG_CHARS = re.compile(r'[^a-z0-9]+')

# Maximum slug length
MAX_SLUG_LENGTH = 200

# Format of the timestamp fallback slug (YYYYMMDD-HHMMSS, UTC)
_TIMESTAMP_FORMAT = '%Y%m%d-%H%M%S'


def _timestamp_slug() -> str:
    """Return the current UTC time formatted as a YYYYMMDD-HHMMSS slug."""
    # datetime.utcnow() is deprecated since Python 3.12; the aware
    # equivalent formats to exactly the same string.
    return datetime.now(timezone.utc).strftime(_TIMESTAMP_FORMAT)


def is_reserved_slug(slug: str) -> bool:
    """
    Check if slug is reserved

    The comparison is case-insensitive.

    Args:
        slug: Slug to check

    Returns:
        bool: True if slug is reserved
    """
    return slug.lower() in RESERVED_SLUGS


def sanitize_slug(slug: str, allow_timestamp_fallback: bool = False) -> str:
    """
    Sanitize a custom slug with Unicode normalization

    Per developer Q&A Q8:
    - Unicode normalization (NFKD) for international characters
    - Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
    - Log warnings with original text
    - Never fail (always returns a valid slug)

    Converts to lowercase, replaces invalid characters with hyphens,
    removes consecutive hyphens, and trims to max length.

    Args:
        slug: Raw slug input
        allow_timestamp_fallback: If True, use timestamp fallback for
            empty slugs

    Returns:
        Sanitized slug string (never empty if allow_timestamp_fallback=True;
        may be empty otherwise)

    Examples:
        >>> sanitize_slug("Hello World!")
        'hello-world'
        >>> sanitize_slug("My--Post___Title")
        'my-post-title'
        >>> sanitize_slug(" leading-spaces ")
        'leading-spaces'
        >>> sanitize_slug("Café")
        'cafe'
        >>> sanitize_slug("日本語", allow_timestamp_fallback=True)  # doctest: +SKIP
        '20231125-143022'
        >>> sanitize_slug("😀🎉✨", allow_timestamp_fallback=True)  # doctest: +SKIP
        '20231125-143022'
    """
    # Unicode normalization (NFKD) - decomposes characters
    # e.g., "é" becomes "e" + combining accent
    decomposed = unicodedata.normalize('NFKD', slug)

    # Drop everything that is not ASCII (combining accents, CJK, emoji, ...)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii').lower()

    # Replace each run of disallowed characters (and hyphens) with a single
    # hyphen, then trim leading/trailing hyphens
    cleaned = _NON_SLUG_CHARS.sub('-', ascii_text).strip('-')

    # Check if normalization resulted in empty slug
    if not cleaned and allow_timestamp_fallback:
        # Per Q8: Use timestamp-based fallback
        cleaned = _timestamp_slug()
        logger.warning(
            "Slug normalization failed for input '%s' "
            "(all characters removed during normalization). "
            "Using timestamp fallback: %s",
            slug,
            cleaned,
        )

    # Trim to max length; the cut may expose a hyphen at the end
    if len(cleaned) > MAX_SLUG_LENGTH:
        cleaned = cleaned[:MAX_SLUG_LENGTH].rstrip('-')

    return cleaned


def validate_slug(slug: str) -> bool:
    """
    Validate slug format

    Checks if slug matches required pattern:
    - Only lowercase letters, numbers, hyphens
    - Starts with letter or number
    - Ends with letter or number
    - No consecutive hyphens
    - Not empty
    - Not too long

    Args:
        slug: Slug to validate

    Returns:
        bool: True if valid, False otherwise

    Examples:
        >>> validate_slug("my-post")
        True
        >>> validate_slug("my--post")  # consecutive hyphens
        False
        >>> validate_slug("-my-post")  # starts with hyphen
        False
        >>> validate_slug("My-Post")  # uppercase
        False
    """
    if not slug or len(slug) > MAX_SLUG_LENGTH:
        return False

    # fullmatch: the whole string must be a slug, no trailing newline etc.
    return SLUG_PATTERN.fullmatch(slug) is not None


def make_slug_unique_with_suffix(base_slug: str, existing_slugs: Set[str], max_attempts: int = 99) -> str:
    """
    Make slug unique by adding sequential numeric suffix

    If base_slug exists, tries base_slug-2, base_slug-3, etc.
    Uses sequential numbers (not random) for predictability.

    If appending the suffix would push the result past MAX_SLUG_LENGTH,
    the base is shortened just enough for the suffixed slug to fit.

    Args:
        base_slug: Base slug to make unique
        existing_slugs: Set of existing slugs to check against
        max_attempts: Maximum number of attempts (default: 99)

    Returns:
        Unique slug with suffix if needed

    Raises:
        ValueError: If unique slug cannot be generated after max_attempts

    Examples:
        >>> make_slug_unique_with_suffix("my-post", {"my-post"})
        'my-post-2'
        >>> make_slug_unique_with_suffix("my-post", {"my-post", "my-post-2"})
        'my-post-3'
        >>> make_slug_unique_with_suffix("my-post", set())
        'my-post'
    """
    # If base slug is available, use it
    if base_slug not in existing_slugs:
        return base_slug

    # Try sequential suffixes
    for number in range(2, max_attempts + 2):
        suffix = f"-{number}"
        room = MAX_SLUG_LENGTH - len(suffix)
        if len(base_slug) <= room:
            stem = base_slug
        else:
            # Shorten the base so the result stays a valid-length slug
            stem = base_slug[:room].rstrip('-')

        candidate = f"{stem}{suffix}"
        if candidate not in existing_slugs:
            return candidate

    # Exhausted all attempts
    raise ValueError(
        f"Could not create unique slug after {max_attempts} attempts. "
        f"Base slug: {base_slug}"
    )


def validate_and_sanitize_custom_slug(custom_slug: str, existing_slugs: Set[str]) -> tuple[bool, Optional[str], Optional[str]]:
    """
    Validate and sanitize a custom slug from Micropub

    Per developer Q&A Q8:
    - Never fail Micropub request due to slug issues
    - Use timestamp fallback if normalization fails
    - Log warnings for debugging

    Performs full validation pipeline:
    1. Reject hierarchical paths (the only failure case)
    2. Sanitize the input (with timestamp fallback)
    3. If it's reserved, append "-note"
    4. Validate format (timestamp fallback on failure)
    5. Make unique if needed

    Args:
        custom_slug: Raw custom slug from mp-slug property
        existing_slugs: Set of existing slugs

    Returns:
        Tuple of (success, slug_or_none, error_message_or_none)

    Examples:
        >>> validate_and_sanitize_custom_slug("My Post", set())
        (True, 'my-post', None)
        >>> validate_and_sanitize_custom_slug("api", set())
        (True, 'api-note', None)
        >>> validate_and_sanitize_custom_slug("/invalid/slug", set())
        (False, None, 'Slug "/invalid/slug" contains hierarchical paths which are not supported')
        >>> validate_and_sanitize_custom_slug("😀🎉", set())  # doctest: +SKIP
        (True, '20231125-143022', None)
    """
    # Check for hierarchical paths (not supported in v1.1.0)
    if '/' in custom_slug:
        return (
            False,
            None,
            f'Slug "{custom_slug}" contains hierarchical paths which are not supported'
        )

    # Sanitize with timestamp fallback enabled
    # Per Q8: Never fail Micropub request
    sanitized = sanitize_slug(custom_slug, allow_timestamp_fallback=True)

    # After timestamp fallback, slug should never be empty
    # But check anyway for safety
    if not sanitized:
        sanitized = _timestamp_slug()
        logger.error(
            "Unexpected empty slug after sanitization with fallback. "
            "Original: '%s'. Using timestamp: %s",
            custom_slug,
            sanitized,
        )

    # Check if reserved
    if is_reserved_slug(sanitized):
        # Per Q8: Never fail - add suffix to reserved slug
        logger.warning(
            "Slug '%s' (from '%s') is reserved. "
            "Adding '-note' suffix.",
            sanitized,
            custom_slug,
        )
        sanitized = f"{sanitized}-note"

    # Validate format
    if not validate_slug(sanitized):
        # This should rarely happen after sanitization
        # but if it does, use timestamp fallback
        timestamp = _timestamp_slug()
        logger.warning(
            "Slug '%s' (from '%s') failed validation. "
            "Using timestamp fallback: %s",
            sanitized,
            custom_slug,
            timestamp,
        )
        sanitized = timestamp

    # Make unique if needed
    try:
        unique_slug = make_slug_unique_with_suffix(sanitized, existing_slugs)
    except ValueError as e:
        # This should rarely happen, but if it does, use timestamp
        # Per Q8: Never fail Micropub request
        timestamp = _timestamp_slug()
        logger.error(
            "Could not create unique slug from '%s'. "
            "Using timestamp: %s. Error: %s",
            custom_slug,
            timestamp,
            e,
        )
        return (True, timestamp, None)

    return (True, unique_slug, None)