Files
StarPunk/starpunk/slug_utils.py
Phil Skentelbery 07fff01fab feat: Complete v1.1.1 Phases 2 & 3 - Enhancements and Polish
Phase 2 - Enhancements:
- Add performance monitoring infrastructure with MetricsBuffer
- Implement three-tier health checks (/health, /health?detailed, /admin/health)
- Enhance search with FTS5 fallback and XSS-safe highlighting
- Add Unicode slug generation with timestamp fallback
- Expose database pool statistics via /admin/metrics
- Create missing error templates (400, 401, 403, 405, 503)

Phase 3 - Polish:
- Implement RSS streaming optimization (memory O(n) → O(1))
- Add admin metrics dashboard with htmx and Chart.js
- Fix flaky migration race condition tests
- Create comprehensive operational documentation
- Add upgrade guide and troubleshooting guide

Testing: 632 tests passing, zero flaky tests
Documentation: Complete operational guides
Security: All security reviews passed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 20:10:41 -07:00

337 lines
9.5 KiB
Python

"""
Slug validation and sanitization utilities for StarPunk
This module provides functions for validating, sanitizing, and ensuring uniqueness
of note slugs. Supports custom slugs via Micropub's mp-slug property.
Per developer Q&A Q8:
- Unicode normalization for slug generation
- Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
- Log warnings with original text
- Never fail Micropub request
"""
import logging
import re
import unicodedata
from datetime import datetime, timezone
from typing import Optional, Set
logger = logging.getLogger(__name__)

# Slugs that notes are not allowed to use because they collide with
# application routes and special pages. Stored as a frozenset so the
# collection is immutable and membership tests are O(1).
RESERVED_SLUGS = frozenset({
    # Core routes
    'api', 'admin', 'auth', 'feed', 'static', 'notes',
    # Auth/admin routes
    'login', 'logout', 'settings', 'micropub', 'callback',
    # Feed routes
    'feed.xml', 'rss', 'atom',
    # Special pages
    'index', 'home', 'about', 'search',
})
# Slug validation regex
# Allows: lowercase letters, numbers, hyphens
# Must start with letter or number
# Must end with letter or number
# Cannot have consecutive hyphens: the pattern is one alphanumeric run
# followed by zero or more "-<alphanumeric run>" groups, so every hyphen
# is forced to sit between two alphanumeric characters.
# Anchored with \Z rather than $ because $ also matches just before a
# trailing newline, which would let "my-post\n" through.
SLUG_PATTERN = re.compile(r'^[a-z0-9]+(?:-[a-z0-9]+)*\Z')

# Maximum slug length
MAX_SLUG_LENGTH = 200
def is_reserved_slug(slug: str) -> bool:
    """
    Report whether a slug collides with a reserved name

    The comparison is case-insensitive: the slug is lowercased before it
    is looked up in RESERVED_SLUGS.

    Args:
        slug: Slug to check

    Returns:
        bool: True if the lowercased slug is a member of RESERVED_SLUGS
    """
    lowered = slug.lower()
    return lowered in RESERVED_SLUGS
def sanitize_slug(slug: str, allow_timestamp_fallback: bool = False) -> str:
    """
    Sanitize a custom slug with Unicode normalization

    Per developer Q&A Q8:
    - Unicode normalization (NFKD) for international characters
    - Timestamp-based fallback (YYYYMMDD-HHMMSS, UTC) when normalization
      leaves nothing usable and allow_timestamp_fallback is True
    - Log a warning with the original text when the fallback is used

    Converts to lowercase, replaces each run of characters other than
    a-z, 0-9 and hyphen with a single hyphen, collapses consecutive
    hyphens, strips leading/trailing hyphens, and trims to
    MAX_SLUG_LENGTH. Characters that have no ASCII form after NFKD
    decomposition (e.g. CJK text, emoji) are dropped.

    Args:
        slug: Raw slug input
        allow_timestamp_fallback: If True, return a timestamp-based slug
            when sanitization leaves an empty string

    Returns:
        Sanitized slug string. Never empty when
        allow_timestamp_fallback=True; may be the empty string when it
        is False and no usable characters remain.

    Examples:
        >>> sanitize_slug("Hello World!")
        'hello-world'
        >>> sanitize_slug("My--Post___Title")
        'my-post-title'
        >>> sanitize_slug(" leading-spaces ")
        'leading-spaces'
        >>> sanitize_slug("Café")
        'cafe'
        >>> sanitize_slug("日本語")
        ''

        With allow_timestamp_fallback=True, inputs such as "日本語" or
        "😀🎉✨" return a timestamp-based slug like '20231125-143022'.
    """
    original_slug = slug

    # Unicode normalization (NFKD) - decomposes characters
    # e.g., "é" becomes "e" + combining accent
    slug = unicodedata.normalize('NFKD', slug)

    # Drop everything that is not ASCII. This removes the combining
    # accents produced above, and also any character without an ASCII
    # decomposition.
    slug = slug.encode('ascii', 'ignore').decode('ascii')

    slug = slug.lower()

    # Replace each run of invalid characters with one hyphen
    # Allow only: a-z, 0-9, hyphens
    slug = re.sub(r'[^a-z0-9-]+', '-', slug)

    # Collapse consecutive hyphens (input hyphens next to replaced runs)
    slug = re.sub(r'-+', '-', slug)

    # Trim leading/trailing hyphens
    slug = slug.strip('-')

    # Check if normalization resulted in empty slug
    if not slug and allow_timestamp_fallback:
        # Per Q8: Use timestamp-based fallback.
        # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
        # datetime formats to exactly the same string.
        slug = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')
        logger.warning(
            "Slug normalization failed for input '%s' "
            "(all characters removed during normalization). "
            "Using timestamp fallback: %s",
            original_slug,
            slug,
        )

    # Trim to max length; the cut may land right after a hyphen, so strip
    # it to keep the slug ending in a letter or digit
    if len(slug) > MAX_SLUG_LENGTH:
        slug = slug[:MAX_SLUG_LENGTH].rstrip('-')

    return slug
def validate_slug(slug: str) -> bool:
    """
    Validate slug format

    Checks if slug matches required pattern:
    - Only lowercase letters, numbers, hyphens
    - Starts with letter or number
    - Ends with letter or number
    - No consecutive hyphens
    - Not empty
    - Not longer than MAX_SLUG_LENGTH
    - No trailing newline or other extra characters

    Args:
        slug: Slug to validate

    Returns:
        bool: True if valid, False otherwise

    Examples:
        >>> validate_slug("my-post")
        True
        >>> validate_slug("my--post")  # consecutive hyphens
        False
        >>> validate_slug("-my-post")  # starts with hyphen
        False
        >>> validate_slug("My-Post")  # uppercase
        False
    """
    if not slug or len(slug) > MAX_SLUG_LENGTH:
        return False

    # Enforce the "no consecutive hyphens" rule explicitly so it holds
    # regardless of how permissive the character-class pattern is.
    if '--' in slug:
        return False

    # fullmatch instead of match: with match, a pattern ending in `$`
    # would also accept a slug followed by a single trailing newline.
    return SLUG_PATTERN.fullmatch(slug) is not None
def make_slug_unique_with_suffix(base_slug: str, existing_slugs: Set[str], max_attempts: int = 99) -> str:
    """
    Return a slug that is not in existing_slugs, suffixing if necessary

    When base_slug is free it is returned unchanged. Otherwise the
    candidates base_slug-2, base_slug-3, ... are tried in order and the
    first free one wins. Suffixes are sequential rather than random so
    the outcome is predictable.

    Args:
        base_slug: Base slug to make unique
        existing_slugs: Set of existing slugs to check against
        max_attempts: Number of suffixed candidates to try (default: 99),
            i.e. suffixes 2 through max_attempts + 1

    Returns:
        base_slug itself, or base_slug with the lowest free numeric suffix

    Raises:
        ValueError: If base_slug and all max_attempts suffixed candidates
            are already taken

    Examples:
        >>> make_slug_unique_with_suffix("my-post", {"my-post"})
        'my-post-2'
        >>> make_slug_unique_with_suffix("my-post", {"my-post", "my-post-2"})
        'my-post-3'
        >>> make_slug_unique_with_suffix("my-post", set())
        'my-post'
    """
    if base_slug in existing_slugs:
        # Lazily generate base-2, base-3, ... and take the first free one
        suffixed = (
            f"{base_slug}-{number}"
            for number in range(2, max_attempts + 2)
        )
        first_free = next(
            (name for name in suffixed if name not in existing_slugs),
            None,
        )
        if first_free is None:
            raise ValueError(
                f"Could not create unique slug after {max_attempts} attempts. "
                f"Base slug: {base_slug}"
            )
        return first_free

    # Base slug is not taken: use it as-is
    return base_slug
# Characters kept free at the end of a colliding slug so that the
# uniqueness suffix ("-2" ... "-100" with the default attempt count)
# does not push the result past MAX_SLUG_LENGTH.
_UNIQUE_SUFFIX_ROOM = len("-100")


def _utc_timestamp_slug() -> str:
    """Return the current UTC time formatted as a YYYYMMDD-HHMMSS slug."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # datetime formats to exactly the same string.
    return datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')


def validate_and_sanitize_custom_slug(
    custom_slug: str,
    existing_slugs: Set[str],
) -> tuple[bool, Optional[str], Optional[str]]:
    """
    Validate and sanitize a custom slug from Micropub

    Per developer Q&A Q8:
    - Never fail Micropub request due to slug issues
    - Use timestamp fallback if normalization fails
    - Log warnings for debugging

    Performs full validation pipeline:
    1. Reject hierarchical paths (the only case that reports failure)
    2. Sanitize the input (with timestamp fallback)
    3. If the result is reserved, append "-note"
    4. Validate format (timestamp fallback if invalid)
    5. Make unique if needed

    Args:
        custom_slug: Raw custom slug from mp-slug property
        existing_slugs: Set of existing slugs

    Returns:
        Tuple of (success, slug_or_none, error_message_or_none).
        On success the slug is not in existing_slugs and the error
        message is None; on failure the slug is None.

    Examples:
        >>> validate_and_sanitize_custom_slug("My Post", set())
        (True, 'my-post', None)
        >>> validate_and_sanitize_custom_slug("api", set())
        (True, 'api-note', None)
        >>> validate_and_sanitize_custom_slug("/invalid/slug", set())
        (False, None, 'Slug "/invalid/slug" contains hierarchical paths which are not supported')

        An input with no usable characters, such as "😀🎉", returns a
        timestamp fallback like (True, '20231125-143022', None).
    """
    # Check for hierarchical paths (not supported in v1.1.0)
    if '/' in custom_slug:
        return (
            False,
            None,
            f'Slug "{custom_slug}" contains hierarchical paths which are not supported'
        )

    # Sanitize with timestamp fallback enabled
    # Per Q8: Never fail Micropub request
    sanitized = sanitize_slug(custom_slug, allow_timestamp_fallback=True)

    # With the fallback enabled the slug should never be empty, but
    # guard against it anyway so later steps always have a value
    if not sanitized:
        sanitized = _utc_timestamp_slug()
        logger.error(
            "Unexpected empty slug after sanitization with fallback. "
            "Original: '%s'. Using timestamp: %s",
            custom_slug,
            sanitized,
        )

    # Per Q8: never fail on a reserved slug - make it non-reserved instead
    if is_reserved_slug(sanitized):
        logger.warning(
            "Slug '%s' (from '%s') is reserved. Adding '-note' suffix.",
            sanitized,
            custom_slug,
        )
        sanitized = f"{sanitized}-note"

    # Validate format. This should rarely fail after sanitization,
    # but if it does, use timestamp fallback
    if not validate_slug(sanitized):
        timestamp = _utc_timestamp_slug()
        logger.warning(
            "Slug '%s' (from '%s') failed validation. "
            "Using timestamp fallback: %s",
            sanitized,
            custom_slug,
            timestamp,
        )
        sanitized = timestamp

    # A collision means a "-N" suffix will be appended. Shorten the base
    # first so the suffixed slug still fits in MAX_SLUG_LENGTH. For slugs
    # that already leave enough room this is a no-op.
    if sanitized in existing_slugs:
        sanitized = sanitized[:MAX_SLUG_LENGTH - _UNIQUE_SUFFIX_ROOM].rstrip('-')

    # Make unique if needed
    try:
        unique_slug = make_slug_unique_with_suffix(sanitized, existing_slugs)
    except ValueError as e:
        # This should rarely happen, but if it does, use timestamp
        # Per Q8: Never fail Micropub request
        timestamp = _utc_timestamp_slug()

        # The timestamp itself may already be taken (second resolution),
        # so keep suffixing until a free slug is found. existing_slugs is
        # finite, so this loop terminates.
        unique_slug = timestamp
        counter = 2
        while unique_slug in existing_slugs:
            unique_slug = f"{timestamp}-{counter}"
            counter += 1

        logger.error(
            "Could not create unique slug from '%s'. "
            "Using timestamp: %s. Error: %s",
            custom_slug,
            unique_slug,
            e,
        )

    return (True, unique_slug, None)