Big-Link-Man/src/generation/url_generator.py

"""
URL generation logic for generated content
"""

import re
import logging
from typing import List, Dict
from src.database.models import GeneratedContent, SiteDeployment, SitePage
from src.database.repositories import SiteDeploymentRepository

logger = logging.getLogger(__name__)


def get_site_hostname(site: SiteDeployment) -> str:
    """
    Resolve the hostname under which a site's files are served.

    Resolution order:

    Non-S3 providers (storage_provider not 's3' / 's3_compatible'):
    1. custom_hostname
    2. pull_zone_bcdn_hostname

    S3 providers ('s3' or 's3_compatible'):
    1. s3_custom_domain
    2. A hostname built from s3_bucket_name and s3_bucket_region:
       - "<bucket>.s3.<region>.amazonaws.com" when the provider is
         's3_compatible' or the site has a truthy s3_endpoint_url
       - "<bucket>.s3-website-<region>.amazonaws.com" otherwise
    3. Last resort (bucket name or region missing): custom_hostname or
       pull_zone_bcdn_hostname, with a warning logged

    Args:
        site: SiteDeployment record

    Returns:
        Hostname string for the site. May be None when none of the
        hostname fields consulted above is set.
    """
    provider = site.storage_provider

    # Everything that is not S3-backed uses the Bunny-style hostname fields.
    if provider not in ('s3', 's3_compatible'):
        return site.custom_hostname or site.pull_zone_bcdn_hostname

    # An explicit custom domain always wins for S3 sites.
    if site.s3_custom_domain:
        return site.s3_custom_domain

    # Without both bucket name and region no S3 hostname can be built.
    if not (site.s3_bucket_name and site.s3_bucket_region):
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        logger.warning(f"S3 site {site.id} missing s3_custom_domain and bucket info, using fallback: {hostname}")
        return hostname

    # getattr: s3_endpoint_url is treated as optional on the record.
    use_standard_endpoint = (
        provider == 's3_compatible' or getattr(site, 's3_endpoint_url', None)
    )
    if use_standard_endpoint:
        # NOTE(review): this still yields an amazonaws.com host even for
        # S3-compatible services; the value of s3_endpoint_url itself is not
        # used - confirm this is intended.
        return f"{site.s3_bucket_name}.s3.{site.s3_bucket_region}.amazonaws.com"

    # Website endpoint: serves index documents, so the root URL works.
    # NOTE(review): AWS website endpoints are HTTP-only and some regions use
    # "s3-website.<region>" (dot) instead of "s3-website-<region>" - verify
    # against the callers that prepend "https://".
    return f"{site.s3_bucket_name}.s3-website-{site.s3_bucket_region}.amazonaws.com"


def generate_slug(title: str, max_length: int = 100) -> str:
    """
    Generate URL-safe slug from article title

    The title is lower-cased, every character that is not a word
    character, whitespace or a hyphen is removed, and runs of
    whitespace/hyphens are collapsed into a single hyphen. The result is
    truncated to max_length and never starts or ends with a hyphen.

    Args:
        title: Article title. None or an empty string yields "article".
        max_length: Maximum slug length (default: 100)

    Returns:
        URL-safe slug, or "article" when nothing usable remains

    Note:
        The pattern uses \\w, so underscores and non-ASCII letters/digits
        are kept in the slug.

    Examples:
        "How to Fix Your Engine" -> "how-to-fix-your-engine"
        "10 Best SEO Tips for 2024!" -> "10-best-seo-tips-for-2024"
        "C++ Programming Guide" -> "c-programming-guide"
    """
    # A missing title has nothing to slugify; use the same fallback as an
    # all-punctuation title instead of failing on None.lower().
    if not title:
        return "article"

    slug = title.lower()
    # Drop punctuation and symbols, keeping word chars, whitespace, hyphens.
    slug = re.sub(r'[^\w\s-]', '', slug)
    # Collapse each run of separators into one hyphen.
    slug = re.sub(r'[-\s]+', '-', slug)
    # Truncation can cut right after a separator, so trailing hyphens are
    # trimmed again once the slug has been shortened.
    slug = slug.strip('-')[:max_length].rstrip('-')

    return slug or "article"


def generate_urls_for_batch(
    content_records: List[GeneratedContent],
    site_repo: SiteDeploymentRepository
) -> List[Dict]:
    """
    Generate final public URLs for a batch of articles

    Each URL has the form "https://<hostname>/<slug>.html", where the
    hostname comes from get_site_hostname() and the slug from
    generate_slug(). When the slug is empty or the generic "article"
    placeholder, "article-<content id>" is used instead and a warning is
    logged.

    Args:
        content_records: List of GeneratedContent records (all should have site_deployment_id set)
        site_repo: SiteDeploymentRepository for looking up site details

    Returns:
        List of URL mappings, one per record and in input order:
        [{content_id, title, url, tier, slug, hostname}, ...]

    Raises:
        ValueError: If any article is missing site_deployment_id, the site
            lookup fails, or no hostname can be resolved for the site
    """
    url_mappings = []
    # Several articles may point at the same site; remember each site so the
    # repository is queried once per distinct ID within this batch.
    sites_by_id = {}

    for content in content_records:
        site_id = content.site_deployment_id
        if not site_id:
            raise ValueError(
                f"Content ID {content.id} is missing site_deployment_id. "
                "All articles must be assigned to a site before URL generation."
            )

        site = sites_by_id.get(site_id)
        if site is None:
            site = site_repo.get_by_id(site_id)
            if not site:
                raise ValueError(
                    f"Site deployment ID {site_id} not found for content ID {content.id}"
                )
            sites_by_id[site_id] = site

        hostname = get_site_hostname(site)
        if not hostname:
            # Without this guard the URL would be built as "https://None/...".
            raise ValueError(
                f"Site deployment ID {site_id} has no hostname configured; "
                f"cannot generate URL for content ID {content.id}"
            )

        slug = generate_slug(content.title)

        # "article" is the placeholder generate_slug returns for unusable
        # titles; make it unique per record by appending the content ID.
        if not slug or slug == "article":
            slug = f"article-{content.id}"
            logger.warning(
                f"Empty slug generated for content ID {content.id}, using fallback: {slug}"
            )

        url = f"https://{hostname}/{slug}.html"

        url_mappings.append({
            "content_id": content.id,
            "title": content.title,
            "url": url,
            "tier": content.tier,
            "slug": slug,
            "hostname": hostname
        })

        logger.info(f"Generated URL for content_id={content.id}: {url}")

    return url_mappings


def generate_public_url(site: SiteDeployment, file_path: str) -> str:
    """
    Build the full public URL of a file stored on a site

    The host is whatever get_site_hostname(site) resolves to; file_path is
    appended after a single "/" exactly as given (no escaping or
    normalisation is applied). The scheme is always https.

    Args:
        site: SiteDeployment record with hostname information
        file_path: File path within storage zone (e.g., 'my-article.html')

    Returns:
        Full HTTPS URL (e.g., 'https://example.com/my-article.html')

    Examples:
        custom_hostname='www.example.com', file_path='about.html'
        -> 'https://www.example.com/about.html'

        pull_zone_bcdn_hostname='mysite.b-cdn.net' (no custom hostname),
        file_path='article.html'
        -> 'https://mysite.b-cdn.net/article.html'

        S3 site, s3_custom_domain='cdn.example.com', file_path='article.html'
        -> 'https://cdn.example.com/article.html'

        S3 site without custom domain, file_path='article.html'
        -> 'https://bucket-name.s3-website-region.amazonaws.com/article.html'
    """
    return f"https://{get_site_hostname(site)}/{file_path}"


def generate_file_path(content: GeneratedContent) -> str:
    """
    Derive the storage file name for a piece of generated content

    Args:
        content: GeneratedContent record (its title and id are read)

    Returns:
        '<slug>.html', where the slug is built from the title
        (e.g., 'my-article-slug.html')

    Note:
        When the title produces no slug, or only the generic "article"
        placeholder, 'article-<content id>.html' is returned instead and a
        warning is logged.
    """
    title_slug = generate_slug(content.title)

    # Normal case: the title produced a meaningful slug.
    if title_slug and title_slug != "article":
        return f"{title_slug}.html"

    # Otherwise fall back to a name made unique by the content ID.
    slug = f"article-{content.id}"
    logger.warning(f"Empty slug for content {content.id}, using fallback: {slug}")
    return f"{slug}.html"


def generate_page_file_path(page: SitePage) -> str:
    """
    Derive the storage file name for a boilerplate page

    The name is the page's page_type followed by '.html'.

    Args:
        page: SitePage record (only page_type is read)

    Returns:
        File name with .html extension (e.g., 'about.html', 'contact.html')
    """
    page_type = page.page_type
    return f"{page_type}.html"