Files
Gondulf/src/gondulf/services/relme_parser.py
Phil Skentelbery 074f74002c feat(phase-2): implement domain verification system
Implements complete domain verification flow with:
- rel=me link verification service
- HTML fetching with security controls
- Rate limiting to prevent abuse
- Email validation utilities
- Authorization and verification API endpoints
- User-facing templates for authorization and verification flows

This completes Phase 2: Domain Verification as designed.

Tests:
- All Phase 2 unit tests passing
- Coverage: 85% overall
- Migration tests updated

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 13:44:33 -07:00

77 lines
2.1 KiB
Python

"""rel=me parser service for extracting email addresses from HTML."""
from bs4 import BeautifulSoup
class RelMeParser:
"""Service for parsing rel=me links from HTML."""
def parse_relme_links(self, html: str) -> list[str]:
"""
Parse HTML for rel=me links.
Args:
html: HTML content to parse
Returns:
List of rel=me link URLs
"""
try:
soup = BeautifulSoup(html, 'html.parser')
links = []
# Find all <a> tags with rel="me" attribute
for link in soup.find_all('a', rel='me'):
href = link.get('href')
if href:
links.append(href)
# Also check for <link> tags with rel="me"
for link in soup.find_all('link', rel='me'):
href = link.get('href')
if href:
links.append(href)
return links
except Exception:
return []
def extract_mailto_email(self, relme_links: list[str]) -> str | None:
"""
Extract email address from mailto: links.
Args:
relme_links: List of rel=me link URLs
Returns:
Email address if found, None otherwise
"""
for link in relme_links:
if link.startswith('mailto:'):
# Extract email address from mailto: link
email = link[7:] # Remove 'mailto:' prefix
# Strip any query parameters (e.g., ?subject=...)
if '?' in email:
email = email.split('?')[0]
# Basic validation
if '@' in email and '.' in email:
return email.strip()
return None
def find_email(self, html: str) -> str | None:
"""
Find email address from HTML by parsing rel=me links.
Args:
html: HTML content to parse
Returns:
Email address if found, None otherwise
"""
relme_links = self.parse_relme_links(html)
return self.extract_mailto_email(relme_links)