Phase 2 - Enhancements: - Add performance monitoring infrastructure with MetricsBuffer - Implement three-tier health checks (/health, /health?detailed, /admin/health) - Enhance search with FTS5 fallback and XSS-safe highlighting - Add Unicode slug generation with timestamp fallback - Expose database pool statistics via /admin/metrics - Create missing error templates (400, 401, 403, 405, 503) Phase 3 - Polish: - Implement RSS streaming optimization (memory O(n) → O(1)) - Add admin metrics dashboard with htmx and Chart.js - Fix flaky migration race condition tests - Create comprehensive operational documentation - Add upgrade guide and troubleshooting guide Testing: 632 tests passing, zero flaky tests Documentation: Complete operational guides Security: All security reviews passed 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
337 lines
9.5 KiB
Python
337 lines
9.5 KiB
Python
"""
|
|
Slug validation and sanitization utilities for StarPunk
|
|
|
|
This module provides functions for validating, sanitizing, and ensuring uniqueness
|
|
of note slugs. Supports custom slugs via Micropub's mp-slug property.
|
|
|
|
Per developer Q&A Q8:
|
|
- Unicode normalization for slug generation
|
|
- Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
|
|
- Log warnings with original text
|
|
- Never fail Micropub request
|
|
"""
|
|
|
|
import logging
import re
import unicodedata
from datetime import datetime, timezone
from typing import Optional, Set
|
|
|
|
logger = logging.getLogger(__name__)

# Slugs that notes may never use: each one corresponds to an application
# route or a special page.
RESERVED_SLUGS = frozenset({
    # Core routes
    'api', 'admin', 'auth', 'feed', 'static', 'notes',
    # Auth/admin routes
    'login', 'logout', 'settings', 'micropub', 'callback',
    # Feed routes
    'feed.xml', 'rss', 'atom',
    # Special pages
    'index', 'home', 'about', 'search',
})
|
|
|
|
# Slug validation regex.
# A valid slug is one or more runs of lowercase letters/digits joined by
# single hyphens. That structure guarantees the slug:
#   - contains only lowercase letters, numbers, and hyphens
#   - starts with a letter or number
#   - ends with a letter or number
#   - never contains consecutive hyphens
# \Z (rather than $) is used so a trailing newline is not silently accepted.
SLUG_PATTERN = re.compile(r'^[a-z0-9]+(?:-[a-z0-9]+)*\Z')

# Maximum slug length
MAX_SLUG_LENGTH = 200
|
|
|
|
|
|
def is_reserved_slug(slug: str) -> bool:
    """
    Report whether a slug collides with a reserved name.

    The comparison is case-insensitive: the slug is lowercased before it is
    looked up in RESERVED_SLUGS.

    Args:
        slug: Slug to check

    Returns:
        bool: True if the lowercased slug is in RESERVED_SLUGS
    """
    lowered = slug.lower()
    return lowered in RESERVED_SLUGS
|
|
|
|
|
|
def sanitize_slug(slug: str, allow_timestamp_fallback: bool = False) -> str:
    """
    Sanitize a custom slug with Unicode normalization

    Per developer Q&A Q8:
    - Unicode normalization (NFKD) for international characters
    - Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
    - Log warnings with original text
    - Never fail (always returns a valid slug)

    Converts to lowercase, replaces invalid characters with hyphens,
    removes consecutive hyphens, and trims to max length.

    Args:
        slug: Raw slug input
        allow_timestamp_fallback: If True, use a UTC timestamp as the slug
            when sanitization leaves nothing behind

    Returns:
        Sanitized slug string. It is never empty when
        allow_timestamp_fallback=True; with the default (False) an input
        made only of unsupported characters yields ''.

    Examples:
        >>> sanitize_slug("Hello World!")
        'hello-world'

        >>> sanitize_slug("My--Post___Title")
        'my-post-title'

        >>> sanitize_slug(" leading-spaces ")
        'leading-spaces'

        >>> sanitize_slug("Café")
        'cafe'

        With allow_timestamp_fallback=True, inputs that contain no
        ASCII-representable characters (for example "日本語" or "😀🎉✨")
        return a timestamp-based slug such as '20231125-143022'.
    """
    original_slug = slug

    # NFKD decomposes characters, e.g. "é" becomes "e" + combining accent.
    slug = unicodedata.normalize('NFKD', slug)

    # Dropping everything non-ASCII removes the combining marks left by the
    # decomposition, along with any character that has no ASCII form.
    slug = slug.encode('ascii', 'ignore').decode('ascii')

    slug = slug.lower()

    # Every run of characters outside a-z, 0-9, '-' becomes a single hyphen.
    slug = re.sub(r'[^a-z0-9-]+', '-', slug)

    # Collapse hyphen runs (including hyphens that were already in the input).
    slug = re.sub(r'-+', '-', slug)

    slug = slug.strip('-')

    if not slug and allow_timestamp_fallback:
        # Per Q8: fall back to a UTC timestamp. datetime.utcnow() is
        # deprecated, so an aware "now" is used; the formatted text is the same.
        slug = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')
        # %r escapes control characters in the untrusted original input so it
        # cannot break up or forge log lines.
        logger.warning(
            "Slug normalization failed for input %r "
            "(all characters removed during normalization). "
            "Using timestamp fallback: %s",
            original_slug,
            slug,
        )

    if len(slug) > MAX_SLUG_LENGTH:
        # Truncation may expose a hyphen at the cut point; strip it again.
        slug = slug[:MAX_SLUG_LENGTH].rstrip('-')

    return slug
|
|
|
|
|
|
def validate_slug(slug: str) -> bool:
    """
    Validate slug format

    Checks if slug matches required pattern:
    - Only lowercase letters, numbers, hyphens
    - Starts with letter or number
    - Ends with letter or number
    - No consecutive hyphens
    - Not empty
    - Not too long (at most MAX_SLUG_LENGTH characters)

    Args:
        slug: Slug to validate

    Returns:
        bool: True if valid, False otherwise

    Examples:
        >>> validate_slug("my-post")
        True

        >>> validate_slug("my--post")  # consecutive hyphens
        False

        >>> validate_slug("-my-post")  # starts with hyphen
        False

        >>> validate_slug("My-Post")  # uppercase
        False
    """
    if not slug:
        return False

    # Checked explicitly so the documented "no consecutive hyphens" rule
    # holds regardless of how permissive SLUG_PATTERN is.
    if '--' in slug:
        return False

    if len(slug) > MAX_SLUG_LENGTH:
        return False

    return SLUG_PATTERN.match(slug) is not None
|
|
|
|
|
|
def make_slug_unique_with_suffix(base_slug: str, existing_slugs: Set[str], max_attempts: int = 99) -> str:
    """
    Return a slug that is not already taken, numbering it if necessary

    The base slug is returned unchanged when it is free. Otherwise the
    candidates base_slug-2, base_slug-3, ... are tried in order and the
    first free one is returned. Numbering is sequential rather than random
    so results are predictable.

    Args:
        base_slug: Base slug to make unique
        existing_slugs: Set of existing slugs to check against
        max_attempts: Maximum number of suffixed candidates to try
            (default: 99)

    Returns:
        Unique slug with suffix if needed

    Raises:
        ValueError: If unique slug cannot be generated after max_attempts

    Examples:
        >>> make_slug_unique_with_suffix("my-post", {"my-post"})
        'my-post-2'

        >>> make_slug_unique_with_suffix("my-post", {"my-post", "my-post-2"})
        'my-post-3'

        >>> make_slug_unique_with_suffix("my-post", set())
        'my-post'
    """
    taken = existing_slugs

    if base_slug not in taken:
        return base_slug

    # Suffixes start at 2 and run for exactly max_attempts candidates.
    numbered = (f"{base_slug}-{n}" for n in range(2, max_attempts + 2))
    free = next((name for name in numbered if name not in taken), None)
    if free is not None:
        return free

    raise ValueError(
        f"Could not create unique slug after {max_attempts} attempts. "
        f"Base slug: {base_slug}"
    )
|
|
|
|
|
|
def validate_and_sanitize_custom_slug(custom_slug: str, existing_slugs: Set[str]) -> tuple[bool, Optional[str], Optional[str]]:
|
|
"""
|
|
Validate and sanitize a custom slug from Micropub
|
|
|
|
Per developer Q&A Q8:
|
|
- Never fail Micropub request due to slug issues
|
|
- Use timestamp fallback if normalization fails
|
|
- Log warnings for debugging
|
|
|
|
Performs full validation pipeline:
|
|
1. Sanitize the input (with timestamp fallback)
|
|
2. Check if it's reserved
|
|
3. Validate format
|
|
4. Make unique if needed
|
|
|
|
Args:
|
|
custom_slug: Raw custom slug from mp-slug property
|
|
existing_slugs: Set of existing slugs
|
|
|
|
Returns:
|
|
Tuple of (success, slug_or_none, error_message_or_none)
|
|
|
|
Examples:
|
|
>>> validate_and_sanitize_custom_slug("My Post", set())
|
|
(True, 'my-post', None)
|
|
|
|
>>> validate_and_sanitize_custom_slug("api", set())
|
|
(False, None, 'Slug "api" is reserved')
|
|
|
|
>>> validate_and_sanitize_custom_slug("/invalid/slug", set())
|
|
(False, None, 'Slug "/invalid/slug" contains hierarchical paths which are not supported')
|
|
|
|
>>> validate_and_sanitize_custom_slug("😀🎉", set())
|
|
# Returns (True, '20231125-143022', None) - timestamp fallback
|
|
"""
|
|
# Check for hierarchical paths (not supported in v1.1.0)
|
|
if '/' in custom_slug:
|
|
return (
|
|
False,
|
|
None,
|
|
f'Slug "{custom_slug}" contains hierarchical paths which are not supported'
|
|
)
|
|
|
|
# Sanitize with timestamp fallback enabled
|
|
# Per Q8: Never fail Micropub request
|
|
sanitized = sanitize_slug(custom_slug, allow_timestamp_fallback=True)
|
|
|
|
# After timestamp fallback, slug should never be empty
|
|
# But check anyway for safety
|
|
if not sanitized:
|
|
# This should never happen with allow_timestamp_fallback=True
|
|
# but handle it just in case
|
|
timestamp = datetime.utcnow().strftime('%Y%m%d-%H%M%S')
|
|
sanitized = timestamp
|
|
logger.error(
|
|
f"Unexpected empty slug after sanitization with fallback. "
|
|
f"Original: '{custom_slug}'. Using timestamp: {sanitized}"
|
|
)
|
|
|
|
# Check if reserved
|
|
if is_reserved_slug(sanitized):
|
|
# Per Q8: Never fail - add suffix to reserved slug
|
|
logger.warning(
|
|
f"Slug '{sanitized}' (from '{custom_slug}') is reserved. "
|
|
f"Adding numeric suffix."
|
|
)
|
|
# Add a suffix to make it non-reserved
|
|
sanitized = f"{sanitized}-note"
|
|
|
|
# Validate format
|
|
if not validate_slug(sanitized):
|
|
# This should rarely happen after sanitization
|
|
# but if it does, use timestamp fallback
|
|
timestamp = datetime.utcnow().strftime('%Y%m%d-%H%M%S')
|
|
logger.warning(
|
|
f"Slug '{sanitized}' (from '{custom_slug}') failed validation. "
|
|
f"Using timestamp fallback: {timestamp}"
|
|
)
|
|
sanitized = timestamp
|
|
|
|
# Make unique if needed
|
|
try:
|
|
unique_slug = make_slug_unique_with_suffix(sanitized, existing_slugs)
|
|
return (True, unique_slug, None)
|
|
except ValueError as e:
|
|
# This should rarely happen, but if it does, use timestamp
|
|
# Per Q8: Never fail Micropub request
|
|
timestamp = datetime.utcnow().strftime('%Y%m%d-%H%M%S')
|
|
logger.error(
|
|
f"Could not create unique slug from '{custom_slug}'. "
|
|
f"Using timestamp: {timestamp}. Error: {e}"
|
|
)
|
|
return (True, timestamp, None)
|