"""
URL generation logic for generated content
"""

import logging
import re
from typing import Dict, List, Tuple

from src.database.models import GeneratedContent, SiteDeployment, SitePage
from src.database.repositories import SiteDeploymentRepository

logger = logging.getLogger(__name__)

# Storage providers that are served from S3-style buckets.
_S3_PROVIDERS = ("s3", "s3_compatible")

# Slug returned by generate_slug() when a title has no usable characters.
_DEFAULT_SLUG = "article"

# Compiled once at import time; generate_slug() runs for every article.
_SLUG_DISALLOWED_RE = re.compile(r"[^\w\s-]")
_SLUG_SEPARATOR_RE = re.compile(r"[-\s]+")


def get_site_hostname(site: SiteDeployment) -> str:
    """
    Get the proper hostname for a site, handling S3 sites correctly.

    For S3 sites ('s3' / 's3_compatible'):
    - Uses s3_custom_domain if set
    - Otherwise constructs an S3 endpoint hostname from bucket name and region
    - Only when bucket name or region is missing does it fall back to
      custom_hostname / pull_zone_bcdn_hostname, and it logs a warning

    For other providers (Bunny, etc.):
    - Uses custom_hostname if set
    - Falls back to pull_zone_bcdn_hostname

    Args:
        site: SiteDeployment record

    Returns:
        Hostname string for the site
    """
    if site.storage_provider not in _S3_PROVIDERS:
        return site.custom_hostname or site.pull_zone_bcdn_hostname

    if site.s3_custom_domain:
        return site.s3_custom_domain

    if site.s3_bucket_name and site.s3_bucket_region:
        # S3-compatible services (or any site with an explicit endpoint URL)
        # get the standard endpoint; plain S3 gets the static-website
        # endpoint, which enables root URL access.
        # NOTE(review): for S3-compatible providers this still yields an
        # amazonaws.com host rather than one derived from s3_endpoint_url --
        # confirm this is intended.
        uses_standard_endpoint = (
            site.storage_provider == "s3_compatible"
            or getattr(site, "s3_endpoint_url", None)
        )
        if uses_standard_endpoint:
            return f"{site.s3_bucket_name}.s3.{site.s3_bucket_region}.amazonaws.com"
        # NOTE(review): AWS documents some regions as using the
        # "s3-website.<region>" (dot) form, and website endpoints as
        # HTTP-only -- verify against the regions actually deployed to.
        return f"{site.s3_bucket_name}.s3-website-{site.s3_bucket_region}.amazonaws.com"

    hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
    logger.warning(
        "S3 site %s missing s3_custom_domain and bucket info, using fallback: %s",
        site.id,
        hostname,
    )
    return hostname


def generate_slug(title: str, max_length: int = 100) -> str:
    """
    Generate URL-safe slug from article title

    The title is lower-cased, characters other than word characters,
    whitespace and hyphens are removed, and runs of whitespace/hyphens are
    collapsed into a single hyphen. The result is truncated to max_length
    and never starts or ends with a hyphen.

    Args:
        title: Article title (None or empty yields the default slug)
        max_length: Maximum slug length (default: 100)

    Returns:
        URL-safe slug, or "article" when nothing usable remains

    Examples:
        "How to Fix Your Engine" -> "how-to-fix-your-engine"
        "10 Best SEO Tips for 2024!" -> "10-best-seo-tips-for-2024"
        "C++ Programming Guide" -> "c-programming-guide"
        "hello world" with max_length=6 -> "hello"
    """
    slug = (title or "").lower()
    slug = _SLUG_DISALLOWED_RE.sub("", slug)
    slug = _SLUG_SEPARATOR_RE.sub("-", slug)
    # Truncation can cut right after a separator, so strip trailing hyphens
    # again once the slug has been shortened.
    slug = slug.strip("-")[:max_length].rstrip("-")
    return slug or _DEFAULT_SLUG


def _slug_with_fallback(content: GeneratedContent) -> Tuple[str, bool]:
    """Return (slug, used_fallback); falls back to 'article-<id>' when the title gives no usable slug."""
    slug = generate_slug(content.title)
    if not slug or slug == _DEFAULT_SLUG:
        return f"{_DEFAULT_SLUG}-{content.id}", True
    return slug, False


def generate_urls_for_batch(
    content_records: List[GeneratedContent],
    site_repo: SiteDeploymentRepository
) -> List[Dict]:
    """
    Generate final public URLs for a batch of articles

    Args:
        content_records: List of GeneratedContent records (all should have site_deployment_id set)
        site_repo: SiteDeploymentRepository for looking up site details

    Returns:
        List of URL mappings: [{content_id, title, url, tier, slug, hostname}, ...]

    Raises:
        ValueError: If any article is missing site_deployment_id or site lookup fails
    """
    url_mappings = []
    # Articles in a batch commonly share a site; look each site up only once.
    sites_by_id: Dict = {}

    for content in content_records:
        site_id = content.site_deployment_id
        if not site_id:
            raise ValueError(
                f"Content ID {content.id} is missing site_deployment_id. "
                "All articles must be assigned to a site before URL generation."
            )

        site = sites_by_id.get(site_id)
        if site is None:
            site = site_repo.get_by_id(site_id)
            if not site:
                raise ValueError(
                    f"Site deployment ID {site_id} not found for content ID {content.id}"
                )
            sites_by_id[site_id] = site

        hostname = get_site_hostname(site)

        slug, used_fallback = _slug_with_fallback(content)
        if used_fallback:
            logger.warning(
                "Empty slug generated for content ID %s, using fallback: %s",
                content.id,
                slug,
            )

        url = f"https://{hostname}/{slug}.html"
        url_mappings.append({
            "content_id": content.id,
            "title": content.title,
            "url": url,
            "tier": content.tier,
            "slug": slug,
            "hostname": hostname,
        })
        logger.info("Generated URL for content_id=%s: %s", content.id, url)

    return url_mappings


def generate_public_url(site: SiteDeployment, file_path: str) -> str:
    """
    Generate full public URL for a file path on a site

    Args:
        site: SiteDeployment record with hostname information
        file_path: File path within storage zone (e.g., 'my-article.html').
            Leading slashes are ignored so the URL never contains '//'
            after the hostname.

    Returns:
        Full HTTPS URL (e.g., 'https://example.com/my-article.html')

    Examples:
        site with custom_hostname='www.example.com', file_path='about.html'
        -> 'https://www.example.com/about.html'

        site with pull_zone_bcdn_hostname='mysite.b-cdn.net', file_path='article.html'
        -> 'https://mysite.b-cdn.net/article.html'

        S3 site with s3_custom_domain='cdn.example.com', file_path='article.html'
        -> 'https://cdn.example.com/article.html'

        S3 site without custom domain, file_path='article.html'
        -> 'https://bucket-name.s3-website-region.amazonaws.com/article.html'
    """
    hostname = get_site_hostname(site)
    return f"https://{hostname}/{file_path.lstrip('/')}"


def generate_file_path(content: GeneratedContent) -> str:
    """
    Generate storage file path for content

    Args:
        content: GeneratedContent record

    Returns:
        File path with .html extension (e.g., 'my-article-slug.html')

    Note:
        Uses title-based slug generation with fallback to content_id
    """
    slug, used_fallback = _slug_with_fallback(content)
    if used_fallback:
        logger.warning(
            "Empty slug for content %s, using fallback: %s", content.id, slug
        )
    return f"{slug}.html"


def generate_page_file_path(page: SitePage) -> str:
    """
    Generate storage file path for boilerplate page

    Args:
        page: SitePage record

    Returns:
        File path with .html extension (e.g., 'about.html', 'contact.html')
    """
    return f"{page.page_type}.html"