Big-Link-Man/src/generation/url_generator.py

194 lines
6.1 KiB
Python

"""
URL generation logic for generated content
"""
import re
import logging
from typing import List, Dict
from src.database.models import GeneratedContent, SiteDeployment, SitePage
from src.database.repositories import SiteDeploymentRepository
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def get_site_hostname(site: SiteDeployment) -> str:
    """
    Resolve the public hostname for a site deployment.

    Resolution order for S3 sites (storage_provider 's3' or 's3_compatible'):
        1. s3_custom_domain, when set
        2. An amazonaws.com endpoint built from s3_bucket_name and
           s3_bucket_region: the standard ".s3." endpoint when the provider is
           's3_compatible' or the site has an s3_endpoint_url, otherwise the
           ".s3-website-" static-website endpoint
        3. As a last resort (logged as a warning), custom_hostname or
           pull_zone_bcdn_hostname

    Resolution order for every other provider:
        1. custom_hostname
        2. pull_zone_bcdn_hostname

    Args:
        site: SiteDeployment record

    Returns:
        Hostname string for the site (whatever the fallback attributes hold
        when nothing else is configured)
    """
    # Non-S3 providers: custom hostname wins, pull-zone hostname is the fallback.
    if site.storage_provider not in ('s3', 's3_compatible'):
        return site.custom_hostname or site.pull_zone_bcdn_hostname

    # An explicit custom domain always takes precedence for S3 sites.
    if site.s3_custom_domain:
        return site.s3_custom_domain

    has_bucket_info = site.s3_bucket_name and site.s3_bucket_region
    if not has_bucket_info:
        # Not enough data to build an S3 endpoint; fall back to the generic
        # hostname fields and make the misconfiguration visible in the logs.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        logger.warning(f"S3 site {site.id} missing s3_custom_domain and bucket info, using fallback: {hostname}")
        return hostname

    # The website endpoint (enables root URL access) is only used for plain S3
    # without a custom endpoint URL.
    # NOTE(review): S3-compatible providers still get an amazonaws.com host
    # here - confirm this is intended.
    uses_standard_endpoint = (
        site.storage_provider == 's3_compatible'
        or getattr(site, 's3_endpoint_url', None)
    )
    if uses_standard_endpoint:
        return f"{site.s3_bucket_name}.s3.{site.s3_bucket_region}.amazonaws.com"
    return f"{site.s3_bucket_name}.s3-website-{site.s3_bucket_region}.amazonaws.com"
def generate_slug(title: str, max_length: int = 100) -> str:
    """
    Generate URL-safe slug from article title

    The title is lower-cased, every character that is not a word character,
    whitespace or hyphen is removed, and runs of whitespace/hyphens collapse
    into a single hyphen. Word characters follow Python's ``\\w`` for str
    patterns, so underscores and non-ASCII letters/digits are kept.

    The slug is truncated to ``max_length`` characters and never starts or
    ends with a hyphen, even when the cut lands right after a word boundary.

    Args:
        title: Article title (an empty or None title yields the fallback)
        max_length: Maximum slug length (default: 100); values below zero
            are treated as zero

    Returns:
        URL-safe slug, or "article" when nothing usable remains

    Examples:
        "How to Fix Your Engine" -> "how-to-fix-your-engine"
        "10 Best SEO Tips for 2024!" -> "10-best-seo-tips-for-2024"
        "C++ Programming Guide" -> "c-programming-guide"
        "hello world" with max_length=6 -> "hello"
    """
    if not title:
        return "article"

    slug = title.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[-\s]+', '-', slug)

    # Clamp so a negative limit cannot turn the slice into "drop from the end".
    limit = max(max_length, 0)
    # Truncation can cut right after a separator ("hello-|world"), so trailing
    # hyphens are stripped again after slicing.
    slug = slug.strip('-')[:limit].rstrip('-')
    return slug or "article"
def generate_urls_for_batch(
    content_records: List[GeneratedContent],
    site_repo: SiteDeploymentRepository
) -> List[Dict]:
    """
    Generate final public URLs for a batch of articles

    For every record the assigned site is resolved through ``site_repo``
    (queried once per distinct site_deployment_id within the batch), the
    hostname is obtained from get_site_hostname(), and the URL is built as
    ``https://{hostname}/{slug}.html``. When the title produces no usable
    slug (empty, or the generic "article" placeholder), ``article-{id}`` is
    used instead and a warning is logged.

    Args:
        content_records: List of GeneratedContent records (all should have site_deployment_id set)
        site_repo: SiteDeploymentRepository for looking up site details

    Returns:
        List of URL mappings in input order:
        [{content_id, title, url, tier, slug, hostname}, ...]

    Raises:
        ValueError: If any article is missing site_deployment_id, its site
            lookup fails, or the site resolves to no hostname
    """
    url_mappings = []
    # Several records may point at the same site; keep resolved sites so the
    # repository is hit only once per id.
    sites_by_id = {}

    for content in content_records:
        site_id = content.site_deployment_id
        if not site_id:
            raise ValueError(
                f"Content ID {content.id} is missing site_deployment_id. "
                "All articles must be assigned to a site before URL generation."
            )

        site = sites_by_id.get(site_id)
        if site is None:
            site = site_repo.get_by_id(site_id)
            if not site:
                raise ValueError(
                    f"Site deployment ID {site_id} not found for content ID {content.id}"
                )
            sites_by_id[site_id] = site

        hostname = get_site_hostname(site)
        if not hostname:
            # Without this check the URL would be formatted as
            # "https://None/..." and handed to callers as if it were valid.
            raise ValueError(
                f"Site deployment ID {site_id} has no hostname configured; "
                f"cannot generate URL for content ID {content.id}"
            )

        slug = generate_slug(content.title)
        if not slug or slug == "article":
            # "article" is generate_slug's placeholder for an unusable title;
            # add the id so the file name stays unique per record.
            slug = f"article-{content.id}"
            logger.warning(
                "Empty slug generated for content ID %s, using fallback: %s",
                content.id,
                slug,
            )

        url = f"https://{hostname}/{slug}.html"
        url_mappings.append({
            "content_id": content.id,
            "title": content.title,
            "url": url,
            "tier": content.tier,
            "slug": slug,
            "hostname": hostname,
        })
        logger.info("Generated URL for content_id=%s: %s", content.id, url)

    return url_mappings
def generate_public_url(site: SiteDeployment, file_path: str) -> str:
    """
    Generate full public URL for a file path on a site

    The hostname comes from get_site_hostname() and the scheme is always
    https. Leading slashes on ``file_path`` are dropped so that
    '/about.html' and 'about.html' produce the same URL.

    Args:
        site: SiteDeployment record with hostname information
        file_path: File path within storage zone (e.g., 'my-article.html')

    Returns:
        Full HTTPS URL (e.g., 'https://example.com/my-article.html')

    Examples:
        site with custom_hostname='www.example.com', file_path='about.html'
        -> 'https://www.example.com/about.html'
        site with pull_zone_bcdn_hostname='mysite.b-cdn.net', file_path='article.html'
        -> 'https://mysite.b-cdn.net/article.html'
        S3 site with s3_custom_domain='cdn.example.com', file_path='article.html'
        -> 'https://cdn.example.com/article.html'
        S3 site without custom domain, file_path='article.html'
        -> 'https://bucket-name.s3-website-region.amazonaws.com/article.html'
        site with custom_hostname='www.example.com', file_path='/about.html'
        -> 'https://www.example.com/about.html'
    """
    hostname = get_site_hostname(site)
    # A leading "/" would otherwise yield "https://host//path", which is a
    # different (usually nonexistent) object path on the storage backend.
    relative_path = file_path.lstrip("/")
    return f"https://{hostname}/{relative_path}"
def generate_file_path(content: GeneratedContent) -> str:
    """
    Build the storage file name for a piece of generated content.

    The name is the slug derived from the content title plus ".html". If the
    title gives no usable slug (empty, or the generic "article" placeholder),
    "article-<content id>" is used instead and a warning is logged.

    Args:
        content: GeneratedContent record

    Returns:
        File path with .html extension (e.g., 'my-article-slug.html')
    """
    title_slug = generate_slug(content.title)

    # Common case: the title produced a real slug.
    if title_slug and title_slug != "article":
        return f"{title_slug}.html"

    # Otherwise fall back to an id-based name.
    slug = f"article-{content.id}"
    logger.warning(f"Empty slug for content {content.id}, using fallback: {slug}")
    return f"{slug}.html"
def generate_page_file_path(page: SitePage) -> str:
    """
    Build the storage file name for a boilerplate site page.

    The name is the page's ``page_type`` followed by ".html".

    Args:
        page: SitePage record

    Returns:
        File path with .html extension (e.g., 'about.html', 'contact.html')
    """
    page_type = page.page_type
    return f"{page_type}.html"