# Gondulf/src/gondulf/services/relme_parser.py

"""rel=me parser service for extracting email addresses from HTML."""

from bs4 import BeautifulSoup


class RelMeParser:
    """Service for parsing rel=me links from HTML."""

    def parse_relme_links(self, html: str) -> list[str]:
        """
        Parse HTML for rel=me links.

        Args:
            html: HTML content to parse

        Returns:
            List of rel=me link URLs
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            links = []

            # Find all <a> tags with rel="me" attribute
            for link in soup.find_all('a', rel='me'):
                href = link.get('href')
                if href:
                    links.append(href)

            # Also check for <link> tags with rel="me"
            for link in soup.find_all('link', rel='me'):
                href = link.get('href')
                if href:
                    links.append(href)

            return links

        except Exception:
            return []

    def extract_mailto_email(self, relme_links: list[str]) -> str | None:
        """
        Extract email address from mailto: links.

        Args:
            relme_links: List of rel=me link URLs

        Returns:
            Email address if found, None otherwise
        """
        for link in relme_links:
            if link.startswith('mailto:'):
                # Extract email address from mailto: link
                email = link[7:]  # Remove 'mailto:' prefix

                # Strip any query parameters (e.g., ?subject=...)
                if '?' in email:
                    email = email.split('?')[0]

                # Basic validation
                if '@' in email and '.' in email:
                    return email.strip()

        return None

    def find_email(self, html: str) -> str | None:
        """
        Find email address from HTML by parsing rel=me links.

        Args:
            html: HTML content to parse

        Returns:
            Email address if found, None otherwise
        """
        relme_links = self.parse_relme_links(html)
        return self.extract_mailto_email(relme_links)