Files
StarPunk/starpunk/slug_utils.py
Phil Skentelbery 3f1f82a749 feat(slugs): Implement timestamp-based slugs per ADR-062
Replaces content-based slug generation with timestamp format YYYYMMDDHHMMSS.
Simplifies slug generation and improves privacy by not exposing note content in URLs.

Changes:
- Add generate_timestamp_slug() to slug_utils.py
- Update notes.py to use timestamp slugs for default generation
- Sequential collision suffix (-1, -2) instead of random
- Custom slugs via mp-slug continue to work unchanged
- 892 tests passing (+18 new timestamp slug tests)

Per ADR-062 and v1.5.0 Phase 1 specification.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-17 09:49:30 -07:00

384 lines
11 KiB
Python

"""
Slug validation and sanitization utilities for StarPunk
This module provides functions for validating, sanitizing, and ensuring uniqueness
of note slugs. Supports custom slugs via Micropub's mp-slug property.
Per developer Q&A Q8:
- Unicode normalization for slug generation
- Timestamp-based fallback (YYYYMMDD-HHMMSS) when normalization fails
- Log warnings with original text
- Never fail Micropub request
"""
import logging
import re
import unicodedata
from datetime import datetime, timezone
from typing import Optional, Set
logger = logging.getLogger(__name__)
# Slugs that notes are not allowed to use.
# Each entry matches an application route or a special page.
RESERVED_SLUGS = frozenset({
    # Core routes
    'api', 'admin', 'auth', 'feed', 'static', 'notes',
    # Auth/admin routes
    'login', 'logout', 'settings', 'micropub', 'callback',
    # Feed routes
    'feed.xml', 'rss', 'atom',
    # Special pages
    'index', 'home', 'about', 'search',
})
# Slug validation regex
# Allows: lowercase letters, numbers, hyphens
# Must start with letter or number
# Must end with letter or number
# Cannot have consecutive hyphens (enforced by the (?!.*--) lookahead)
# Anchored with \Z rather than $, because $ also matches just before a
# trailing newline and would let "slug\n" through.
# Capture group 1 (everything after the first character) is kept as before.
SLUG_PATTERN = re.compile(r'^(?!.*--)[a-z0-9]([a-z0-9-]*[a-z0-9])?\Z')
# Maximum slug length
MAX_SLUG_LENGTH = 200
def is_reserved_slug(slug: str) -> bool:
    """
    Report whether a slug collides with a reserved name

    The check is case-insensitive: the slug is lowercased before it is
    looked up in RESERVED_SLUGS.

    Args:
        slug: Candidate slug

    Returns:
        bool: True when the lowercased slug is a member of RESERVED_SLUGS
    """
    lowered = slug.lower()
    return lowered in RESERVED_SLUGS
def sanitize_slug(slug: str, allow_timestamp_fallback: bool = False) -> str:
    """
    Sanitize a custom slug with Unicode normalization

    Per developer Q&A Q8:
    - Unicode normalization (NFKD) for international characters
    - Timestamp-based fallback (YYYYMMDD-HHMMSS, UTC) when normalization
      removes every character
    - Log warnings with original text
    - Never fail when allow_timestamp_fallback is True (a non-empty slug is
      always returned in that mode)

    Converts to lowercase, replaces invalid characters with hyphens,
    removes consecutive hyphens, and trims to max length.

    Args:
        slug: Raw slug input
        allow_timestamp_fallback: If True, use timestamp fallback for empty slugs

    Returns:
        Sanitized slug string. It is never empty if
        allow_timestamp_fallback=True; with the default (False) it is the
        empty string when no usable characters survive sanitization.

    Examples:
        >>> sanitize_slug("Hello World!")
        'hello-world'
        >>> sanitize_slug("My--Post___Title")
        'my-post-title'
        >>> sanitize_slug(" leading-spaces ")
        'leading-spaces'
        >>> sanitize_slug("Café")
        'cafe'
        >>> sanitize_slug("日本語", allow_timestamp_fallback=True)
        # Returns timestamp-based slug like '20231125-143022'
        >>> sanitize_slug("😀🎉✨", allow_timestamp_fallback=True)
        # Returns timestamp-based slug
    """
    original_slug = slug

    # Unicode normalization (NFKD) - decomposes characters
    # e.g., "é" becomes "e" + combining accent
    slug = unicodedata.normalize('NFKD', slug)

    # Drop everything that is not ASCII. After NFKD this removes the
    # combining accents (leaving the base letter) and also removes any
    # character with no ASCII decomposition (CJK, emoji, ...).
    slug = slug.encode('ascii', 'ignore').decode('ascii')

    # Convert to lowercase
    slug = slug.lower()

    # Replace invalid characters with hyphens
    # Allow only: a-z, 0-9, hyphens
    slug = re.sub(r'[^a-z0-9-]+', '-', slug)

    # Remove consecutive hyphens
    slug = re.sub(r'-+', '-', slug)

    # Trim leading/trailing hyphens
    slug = slug.strip('-')

    # Check if normalization resulted in empty slug
    if not slug and allow_timestamp_fallback:
        # Per Q8: Use timestamp-based fallback.
        # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
        # datetime formats to exactly the same string.
        slug = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')
        logger.warning(
            f"Slug normalization failed for input '{original_slug}' "
            f"(all characters removed during normalization). "
            f"Using timestamp fallback: {slug}"
        )

    # Trim to max length; the cut may land right after a hyphen, so strip
    # it to keep the slug ending in a letter or digit
    if len(slug) > MAX_SLUG_LENGTH:
        slug = slug[:MAX_SLUG_LENGTH].rstrip('-')

    return slug
def validate_slug(slug: str) -> bool:
    """
    Check that a slug is well-formed

    A slug is accepted only when all of the following hold:
    - It is not empty
    - It is no longer than MAX_SLUG_LENGTH characters
    - It matches SLUG_PATTERN, which defines the allowed characters and
      where hyphens may appear

    Args:
        slug: Slug to validate

    Returns:
        bool: True if valid, False otherwise

    Examples:
        >>> validate_slug("my-post")
        True
        >>> validate_slug("-my-post")  # starts with hyphen
        False
        >>> validate_slug("My-Post")  # uppercase
        False
        >>> validate_slug("")  # empty
        False
    """
    # Cheap rejections first: empty input and over-long input
    if not slug or len(slug) > MAX_SLUG_LENGTH:
        return False

    return SLUG_PATTERN.match(slug) is not None
def make_slug_unique_with_suffix(base_slug: str, existing_slugs: Set[str], max_attempts: int = 99) -> str:
    """
    Return a slug that is not already taken, appending a counter if required

    When base_slug is free it is returned untouched. Otherwise the
    candidates base_slug-2, base_slug-3, ... are checked in order and the
    first free one is returned. The counter is sequential (not random) so
    the result is predictable.

    Args:
        base_slug: Preferred slug
        existing_slugs: Slugs that are already in use
        max_attempts: How many suffixed candidates to try (default: 99)

    Returns:
        base_slug itself, or base_slug with the lowest free numeric suffix

    Raises:
        ValueError: If all max_attempts suffixed candidates are taken

    Examples:
        >>> make_slug_unique_with_suffix("my-post", {"my-post"})
        'my-post-2'
        >>> make_slug_unique_with_suffix("my-post", {"my-post", "my-post-2"})
        'my-post-3'
        >>> make_slug_unique_with_suffix("my-post", set())
        'my-post'
    """
    if base_slug not in existing_slugs:
        return base_slug

    # Lazily enumerate base-2 .. base-(max_attempts + 1) and take the first
    # one that is not in use
    suffixed = (f"{base_slug}-{n}" for n in range(2, max_attempts + 2))
    free = next((name for name in suffixed if name not in existing_slugs), None)

    if free is None:
        raise ValueError(
            f"Could not create unique slug after {max_attempts} attempts. "
            f"Base slug: {base_slug}"
        )
    return free
def generate_timestamp_slug(
    created_at: Optional[datetime] = None,
    existing_slugs: Optional[Set[str]] = None
) -> str:
    """Generate a timestamp-based slug with collision handling.

    Per ADR-062: Default format is YYYYMMDDHHMMSS with sequential
    suffix (-1, -2, etc.) for collisions.

    Args:
        created_at: Note creation timestamp. Defaults to the current UTC
            time. The value is formatted as given; no timezone conversion
            is applied.
        existing_slugs: Set of existing slugs to check for collisions.
            None is treated as "no existing slugs".

    Returns:
        Unique timestamp-based slug

    Examples:
        >>> generate_timestamp_slug(datetime(2025, 12, 16, 14, 30, 52), set())
        '20251216143052'
        >>> generate_timestamp_slug(datetime(2025, 12, 16, 14, 30, 52), {'20251216143052'})
        '20251216143052-1'
        >>> generate_timestamp_slug(datetime(2025, 12, 16, 14, 30, 52), {'20251216143052', '20251216143052-1'})
        '20251216143052-2'
    """
    if created_at is None:
        # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
        # datetime yields the same formatted digits.
        created_at = datetime.now(timezone.utc)

    # Generate base timestamp slug (YYYYMMDDHHMMSS per ADR-062)
    base_slug = created_at.strftime("%Y%m%d%H%M%S")

    # No known slugs, or no collision: the bare timestamp is the slug
    if not existing_slugs or base_slug not in existing_slugs:
        return base_slug

    # Sequential suffix for collisions (starts at -1 per ADR-062)
    suffix = 1
    while f"{base_slug}-{suffix}" in existing_slugs:
        suffix += 1
    return f"{base_slug}-{suffix}"
def validate_and_sanitize_custom_slug(custom_slug: str, existing_slugs: Set[str]) -> tuple[bool, Optional[str], Optional[str]]:
    """
    Validate and sanitize a custom slug from Micropub

    Per developer Q&A Q8:
    - Never fail Micropub request due to slug issues
    - Use timestamp fallback if normalization fails
    - Log warnings for debugging

    The only input rejected outright is a slug containing '/'
    (hierarchical paths are not supported).

    Performs full validation pipeline:
    1. Reject slugs containing '/'
    2. Sanitize the input (with timestamp fallback)
    3. If the result is reserved, append "-note" so it no longer collides
    4. Validate format; replace with a timestamp if it is still invalid
    5. Make unique if needed (sequential numeric suffix)

    Args:
        custom_slug: Raw custom slug from mp-slug property
        existing_slugs: Set of existing slugs

    Returns:
        Tuple of (success, slug_or_none, error_message_or_none).
        On success the error message is None; on failure the slug is None.

    Examples:
        >>> validate_and_sanitize_custom_slug("My Post", set())
        (True, 'my-post', None)
        >>> validate_and_sanitize_custom_slug("api", set())
        (True, 'api-note', None)
        >>> validate_and_sanitize_custom_slug("/invalid/slug", set())
        (False, None, 'Slug "/invalid/slug" contains hierarchical paths which are not supported')
        >>> validate_and_sanitize_custom_slug("😀🎉", set())
        # Returns (True, '20231125-143022', None) - timestamp fallback
    """
    def _timestamp_slug() -> str:
        # Current UTC time in the Q8 fallback format (YYYYMMDD-HHMMSS).
        # datetime.utcnow() is deprecated since Python 3.12.
        return datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')

    # Check for hierarchical paths (not supported in v1.1.0)
    if '/' in custom_slug:
        return (
            False,
            None,
            f'Slug "{custom_slug}" contains hierarchical paths which are not supported'
        )

    # Sanitize with timestamp fallback enabled
    # Per Q8: Never fail Micropub request
    sanitized = sanitize_slug(custom_slug, allow_timestamp_fallback=True)

    # After timestamp fallback, slug should never be empty.
    # Defensive check only: kept in case sanitize_slug changes.
    if not sanitized:
        sanitized = _timestamp_slug()
        logger.error(
            f"Unexpected empty slug after sanitization with fallback. "
            f"Original: '{custom_slug}'. Using timestamp: {sanitized}"
        )

    # Check if reserved
    if is_reserved_slug(sanitized):
        # Per Q8: Never fail - add suffix to reserved slug
        logger.warning(
            f"Slug '{sanitized}' (from '{custom_slug}') is reserved. "
            f"Adding '-note' suffix."
        )
        sanitized = f"{sanitized}-note"

    # Validate format
    if not validate_slug(sanitized):
        # This should rarely happen after sanitization
        # but if it does, use timestamp fallback
        timestamp = _timestamp_slug()
        logger.warning(
            f"Slug '{sanitized}' (from '{custom_slug}') failed validation. "
            f"Using timestamp fallback: {timestamp}"
        )
        sanitized = timestamp

    # Make unique if needed
    try:
        unique_slug = make_slug_unique_with_suffix(sanitized, existing_slugs)
    except ValueError as e:
        # This should rarely happen, but if it does, use timestamp
        # Per Q8: Never fail Micropub request
        timestamp = _timestamp_slug()

        # The timestamp itself may already be taken (e.g. by an earlier
        # fallback in the same second), so keep suffixing until it is free.
        unique_slug = timestamp
        suffix = 2
        while unique_slug in existing_slugs:
            unique_slug = f"{timestamp}-{suffix}"
            suffix += 1

        logger.error(
            f"Could not create unique slug from '{custom_slug}'. "
            f"Using timestamp: {unique_slug}. Error: {e}"
        )

    return (True, unique_slug, None)