""" Core utility functions for StarPunk This module provides essential utilities for slug generation, file operations, hashing, and date/time handling. These utilities are used throughout the application and have no external dependencies beyond standard library. """ # Standard library imports import hashlib import re import secrets import shutil from datetime import datetime from pathlib import Path from typing import Optional # Constants - Slug configuration MAX_SLUG_LENGTH = 100 MIN_SLUG_LENGTH = 1 SLUG_WORDS_COUNT = 5 RANDOM_SUFFIX_LENGTH = 4 # Reserved slugs (system routes) RESERVED_SLUGS = {"admin", "api", "static", "auth", "feed", "login", "logout"} # File operations TEMP_FILE_SUFFIX = ".tmp" TRASH_DIR_NAME = ".trash" # Hashing CONTENT_HASH_ALGORITHM = "sha256" # Regex patterns SLUG_PATTERN = re.compile(r"^[a-z0-9]+(?:-[a-z0-9]+)*$") SAFE_SLUG_PATTERN = re.compile(r"[^a-z0-9-]") MULTIPLE_HYPHENS_PATTERN = re.compile(r"-+") URL_PATTERN = re.compile( r"^https?://" # http:// or https:// r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|" # domain... r"localhost|" # localhost... r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip r"(?::\d+)?" # optional port r"(?:/?|[/?]\S+)$", re.IGNORECASE, ) # Character set for random suffix generation RANDOM_CHARS = "abcdefghijklmnopqrstuvwxyz0123456789" # Helper Functions def is_valid_url(url: str) -> bool: """ Validate URL format Checks if a string is a valid HTTP or HTTPS URL. Args: url: URL string to validate Returns: True if valid URL, False otherwise Examples: >>> is_valid_url("https://example.com") True >>> is_valid_url("http://localhost:5000") True >>> is_valid_url("not-a-url") False >>> is_valid_url("ftp://example.com") False """ if not url or not isinstance(url, str): return False return bool(URL_PATTERN.match(url)) def extract_first_words(text: str, max_words: int = 5) -> str: """ Extract first N words from text Helper function for slug generation. Splits text on whitespace and returns first N non-empty words. Args: text: Text to extract words from max_words: Maximum number of words to extract (default: 5) Returns: Space-separated string of first N words Examples: >>> extract_first_words("Hello world this is a test", 3) 'Hello world this' >>> extract_first_words(" Multiple spaces ", 2) 'Multiple spaces' """ words = text.strip().split() return " ".join(words[:max_words]) def normalize_slug_text(text: str) -> str: """ Normalize text for use in slug Converts to lowercase, replaces spaces with hyphens, removes special characters, and collapses multiple hyphens. Args: text: Text to normalize Returns: Normalized slug-safe text Examples: >>> normalize_slug_text("Hello World!") 'hello-world' >>> normalize_slug_text("Testing... with -- special chars!") 'testing-with-special-chars' """ # Convert to lowercase text = text.lower() # Replace spaces with hyphens text = text.replace(" ", "-") # Remove all non-alphanumeric characters except hyphens text = SAFE_SLUG_PATTERN.sub("", text) # Collapse multiple hyphens to single hyphen text = MULTIPLE_HYPHENS_PATTERN.sub("-", text) # Strip leading/trailing hyphens text = text.strip("-") return text def generate_random_suffix(length: int = 4) -> str: """ Generate random alphanumeric suffix Creates a secure random string for making slugs unique. Uses lowercase letters and numbers only. Args: length: Length of suffix (default: 4) Returns: Random alphanumeric string Examples: >>> suffix = generate_random_suffix() >>> len(suffix) 4 >>> suffix.isalnum() True """ return "".join(secrets.choice(RANDOM_CHARS) for _ in range(length)) # Slug Functions def generate_slug(content: str, created_at: Optional[datetime] = None) -> str: """ Generate URL-safe slug from note content Creates a slug by extracting the first few words from the content and normalizing them to lowercase with hyphens. If content is insufficient, falls back to timestamp-based slug. Args: content: The note content (markdown text) created_at: Optional timestamp for fallback slug (defaults to now) Returns: URL-safe slug string (lowercase, alphanumeric + hyphens only) Raises: ValueError: If content is empty or contains only whitespace Examples: >>> generate_slug("Hello World! This is my first note.") 'hello-world-this-is-my' >>> generate_slug("Testing... with special chars!@#") 'testing-with-special-chars' >>> generate_slug("A") # Too short, uses timestamp '20241118-143022' Notes: - This function does NOT check for uniqueness - Caller must verify slug doesn't exist in database - Use make_slug_unique() to add random suffix if needed """ # Validate input if not content or not content.strip(): raise ValueError("Content cannot be empty or whitespace-only") # Extract first N words from content first_words = extract_first_words(content, SLUG_WORDS_COUNT) # Normalize to slug format slug = normalize_slug_text(first_words) # If slug is empty or too short, use timestamp fallback if len(slug) < MIN_SLUG_LENGTH: if created_at is None: created_at = datetime.utcnow() slug = created_at.strftime("%Y%m%d-%H%M%S") # Truncate to maximum length slug = slug[:MAX_SLUG_LENGTH] return slug def make_slug_unique(base_slug: str, existing_slugs: set[str]) -> str: """ Make a slug unique by adding random suffix if needed If the base_slug already exists in the provided set, appends a random alphanumeric suffix until a unique slug is found. Args: base_slug: The base slug to make unique existing_slugs: Set of existing slugs to check against Returns: Unique slug (base_slug or base_slug-{random}) Examples: >>> make_slug_unique("test-note", set()) 'test-note' >>> make_slug_unique("test-note", {"test-note"}) 'test-note-a7c9' # Random suffix >>> make_slug_unique("test-note", {"test-note", "test-note-a7c9"}) 'test-note-x3k2' # Different random suffix Notes: - Random suffix is 4 lowercase alphanumeric characters - Extremely low collision probability (36^4 = 1.6M combinations) - Will retry up to 100 times if collision occurs (should never happen) """ # If base slug doesn't exist, return it unchanged if base_slug not in existing_slugs: return base_slug # Generate unique slug with random suffix max_attempts = 100 for _ in range(max_attempts): suffix = generate_random_suffix(RANDOM_SUFFIX_LENGTH) unique_slug = f"{base_slug}-{suffix}" if unique_slug not in existing_slugs: return unique_slug # This should never happen with 36^4 combinations raise RuntimeError( f"Failed to generate unique slug after {max_attempts} attempts. " f"This is extremely unlikely and may indicate a problem." ) def validate_slug(slug: str) -> bool: """ Validate that a slug meets all requirements Checks that slug contains only allowed characters and is within length limits. Also checks against reserved slugs. Args: slug: The slug to validate Returns: True if slug is valid, False otherwise Rules: - Must contain only: a-z, 0-9, hyphen (-) - Must be between 1 and 100 characters - Cannot start or end with hyphen - Cannot contain consecutive hyphens - Cannot be a reserved slug Examples: >>> validate_slug("hello-world") True >>> validate_slug("Hello-World") # Uppercase False >>> validate_slug("-hello") # Leading hyphen False >>> validate_slug("hello--world") # Double hyphen False >>> validate_slug("admin") # Reserved slug False """ # Check basic constraints if not slug: return False if len(slug) < MIN_SLUG_LENGTH or len(slug) > MAX_SLUG_LENGTH: return False # Check against reserved slugs if slug in RESERVED_SLUGS: return False # Check pattern (lowercase alphanumeric with single hyphens) return bool(SLUG_PATTERN.match(slug)) # Content Hashing def calculate_content_hash(content: str) -> str: """ Calculate SHA-256 hash of content Generates a cryptographic hash of the content for change detection and cache invalidation. Uses UTF-8 encoding. Args: content: The content to hash (markdown text) Returns: Hexadecimal hash string (64 characters) Examples: >>> calculate_content_hash("Hello World") 'a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e' >>> calculate_content_hash("") 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' Notes: - Same content always produces same hash - Hash is deterministic across systems - Useful for detecting external file modifications - SHA-256 chosen for security and wide support """ content_bytes = content.encode("utf-8") hash_obj = hashlib.sha256(content_bytes) return hash_obj.hexdigest() # File Path Operations def generate_note_path(slug: str, created_at: datetime, data_dir: Path) -> Path: """ Generate file path for a note Creates path following pattern: data/notes/YYYY/MM/slug.md Args: slug: URL-safe slug for the note created_at: Creation timestamp (determines YYYY/MM) data_dir: Base data directory path Returns: Full Path object for the note file Raises: ValueError: If slug is invalid Examples: >>> from datetime import datetime >>> from pathlib import Path >>> dt = datetime(2024, 11, 18, 14, 30) >>> generate_note_path("test-note", dt, Path("data")) PosixPath('data/notes/2024/11/test-note.md') Notes: - Does NOT create directories (use ensure_note_directory) - Does NOT check if file exists - Validates slug before generating path """ # Validate slug before generating path if not validate_slug(slug): raise ValueError(f"Invalid slug: {slug}") # Extract year and month from created_at year = created_at.strftime("%Y") month = created_at.strftime("%m") # Build path: data_dir/notes/YYYY/MM/slug.md note_path = data_dir / "notes" / year / month / f"{slug}.md" return note_path def ensure_note_directory(note_path: Path) -> Path: """ Ensure directory exists for note file Creates parent directories if they don't exist. Safe to call even if directories already exist. Args: note_path: Full path to note file Returns: Parent directory path Raises: OSError: If directory cannot be created (permissions, etc.) Examples: >>> note_path = Path("data/notes/2024/11/test-note.md") >>> ensure_note_directory(note_path) PosixPath('data/notes/2024/11') """ # Create parent directories if they don't exist parent_dir = note_path.parent parent_dir.mkdir(parents=True, exist_ok=True) return parent_dir def validate_note_path(file_path: Path, data_dir: Path) -> bool: """ Validate that file path is within data directory Security check to prevent path traversal attacks. Ensures the resolved path is within the allowed data directory. Args: file_path: Path to validate data_dir: Base data directory that must contain file_path Returns: True if path is safe, False otherwise Examples: >>> validate_note_path( ... Path("data/notes/2024/11/note.md"), ... Path("data") ... ) True >>> validate_note_path( ... Path("data/notes/../../etc/passwd"), ... Path("data") ... ) False Security: - Resolves symlinks and relative paths - Checks if resolved path is child of data_dir - Prevents directory traversal attacks """ # Resolve both paths to absolute try: resolved_file = file_path.resolve() resolved_data_dir = data_dir.resolve() # Check if file_path is relative to data_dir return resolved_file.is_relative_to(resolved_data_dir) except (ValueError, OSError): # If resolve() fails or is_relative_to() raises an error return False # Atomic File Operations def write_note_file(file_path: Path, content: str) -> None: """ Write note content to file atomically Writes to temporary file first, then atomically renames to final path. This prevents corruption if write is interrupted. Args: file_path: Destination file path content: Content to write (markdown text) Raises: OSError: If file cannot be written ValueError: If file_path is invalid Examples: >>> write_note_file(Path("data/notes/2024/11/test.md"), "# Test") Implementation: 1. Create temp file: {file_path}.tmp 2. Write content to temp file 3. Atomically rename temp to final path 4. If any step fails, clean up temp file Notes: - Atomic rename is guaranteed on POSIX systems - Temp file created in same directory as target - UTF-8 encoding used for all text """ # Create temp file path temp_path = file_path.with_suffix(file_path.suffix + TEMP_FILE_SUFFIX) try: # Write to temp file temp_path.write_text(content, encoding="utf-8") # Atomically rename temp to final path temp_path.replace(file_path) except Exception: # Clean up temp file if it exists if temp_path.exists(): temp_path.unlink() # Re-raise the exception raise def read_note_file(file_path: Path) -> str: """ Read note content from file Args: file_path: Path to note file Returns: File content as string Raises: FileNotFoundError: If file doesn't exist OSError: If file cannot be read Examples: >>> content = read_note_file(Path("data/notes/2024/11/test.md")) >>> print(content) # Test Note """ return file_path.read_text(encoding="utf-8") def delete_note_file( file_path: Path, soft: bool = False, data_dir: Optional[Path] = None ) -> None: """ Delete note file from filesystem Supports soft delete (move to trash) or hard delete (permanent removal). Args: file_path: Path to note file soft: If True, move to .trash/ directory; if False, delete permanently data_dir: Required if soft=True, base data directory Raises: FileNotFoundError: If file doesn't exist ValueError: If soft=True but data_dir not provided OSError: If file cannot be deleted or moved Examples: >>> # Hard delete >>> delete_note_file(Path("data/notes/2024/11/test.md")) >>> # Soft delete (move to trash) >>> delete_note_file( ... Path("data/notes/2024/11/test.md"), ... soft=True, ... data_dir=Path("data") ... ) """ if soft: # Soft delete: move to trash if data_dir is None: raise ValueError("data_dir is required for soft delete") # Extract year/month from file path # Assuming path structure: data_dir/notes/YYYY/MM/slug.md parts = file_path.parts try: # Find the year and month in the path notes_idx = parts.index("notes") year = parts[notes_idx + 1] month = parts[notes_idx + 2] except (ValueError, IndexError): # If path doesn't follow expected structure, use current date now = datetime.utcnow() year = now.strftime("%Y") month = now.strftime("%m") # Create trash directory path trash_dir = data_dir / TRASH_DIR_NAME / year / month trash_dir.mkdir(parents=True, exist_ok=True) # Move file to trash trash_path = trash_dir / file_path.name shutil.move(str(file_path), str(trash_path)) else: # Hard delete: permanent removal file_path.unlink() # Date/Time Utilities def format_rfc822(dt: datetime) -> str: """ Format datetime as RFC-822 string Converts datetime to RFC-822 format required by RSS 2.0 specification. Assumes UTC timezone. Args: dt: Datetime to format (assumed UTC) Returns: RFC-822 formatted string Examples: >>> from datetime import datetime >>> dt = datetime(2024, 11, 18, 14, 30, 45) >>> format_rfc822(dt) 'Mon, 18 Nov 2024 14:30:45 +0000' References: - RSS 2.0 spec: https://www.rssboard.org/rss-specification - RFC-822 date format """ return dt.strftime("%a, %d %b %Y %H:%M:%S +0000") def format_iso8601(dt: datetime) -> str: """ Format datetime as ISO 8601 string Converts datetime to ISO 8601 format for timestamps and APIs. Args: dt: Datetime to format Returns: ISO 8601 formatted string Examples: >>> from datetime import datetime >>> dt = datetime(2024, 11, 18, 14, 30, 45) >>> format_iso8601(dt) '2024-11-18T14:30:45Z' """ return dt.strftime("%Y-%m-%dT%H:%M:%SZ") def parse_iso8601(date_string: str) -> datetime: """ Parse ISO 8601 string to datetime Args: date_string: ISO 8601 formatted string Returns: Datetime object (UTC) Raises: ValueError: If string is not valid ISO 8601 format Examples: >>> parse_iso8601("2024-11-18T14:30:45Z") datetime.datetime(2024, 11, 18, 14, 30, 45) """ # Remove 'Z' suffix if present if date_string.endswith("Z"): date_string = date_string[:-1] # Parse using fromisoformat return datetime.fromisoformat(date_string)