194 lines
6.1 KiB
Python
194 lines
6.1 KiB
Python
"""
|
|
URL generation logic for generated content
|
|
"""
|
|
|
|
import re
|
|
import logging
|
|
from typing import List, Dict
|
|
from src.database.models import GeneratedContent, SiteDeployment, SitePage
|
|
from src.database.repositories import SiteDeploymentRepository
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_site_hostname(site: SiteDeployment) -> str:
    """
    Resolve the public hostname for a site deployment.

    S3 sites (storage_provider 's3' or 's3_compatible') are resolved in
    this order:
        1. s3_custom_domain, when set
        2. A hostname built from s3_bucket_name and s3_bucket_region:
           - '<bucket>.s3.<region>.amazonaws.com' when the provider is
             's3_compatible' or the site has a truthy s3_endpoint_url
           - '<bucket>.s3-website-<region>.amazonaws.com' otherwise
        3. custom_hostname or pull_zone_bcdn_hostname (with a warning
           logged) when the bucket name or region is missing

    Every other provider uses custom_hostname and falls back to
    pull_zone_bcdn_hostname.

    Args:
        site: SiteDeployment record

    Returns:
        Hostname string for the site
    """
    # Non-S3 providers: custom hostname wins, pull zone hostname is the fallback.
    if site.storage_provider not in ('s3', 's3_compatible'):
        return site.custom_hostname or site.pull_zone_bcdn_hostname

    if site.s3_custom_domain:
        return site.s3_custom_domain

    # Without both bucket name and region no S3 hostname can be built, so
    # fall back to the generic hostname fields and make that visible in logs.
    if not (site.s3_bucket_name and site.s3_bucket_region):
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        logger.warning(f"S3 site {site.id} missing s3_custom_domain and bucket info, using fallback: {hostname}")
        return hostname

    # The website endpoint format is used for static website hosting (root
    # URL access); S3-compatible services get the standard endpoint format.
    # NOTE(review): the standard-endpoint branch still produces an
    # amazonaws.com host even when s3_endpoint_url is set - confirm intended.
    # NOTE(review): some AWS regions use 's3-website.<region>' (dot) rather
    # than 's3-website-<region>' (dash) - verify for the regions in use.
    use_standard_endpoint = (
        site.storage_provider == 's3_compatible'
        or getattr(site, 's3_endpoint_url', None)
    )
    endpoint_infix = "s3." if use_standard_endpoint else "s3-website-"
    return f"{site.s3_bucket_name}.{endpoint_infix}{site.s3_bucket_region}.amazonaws.com"
|
|
|
|
|
|
def generate_slug(title: str, max_length: int = 100) -> str:
    """
    Generate URL-safe slug from article title

    The title is lowercased, every character that is not a word character,
    whitespace or hyphen is removed, runs of whitespace/hyphens collapse to
    a single hyphen, and the result is truncated to max_length. The slug
    never starts or ends with a hyphen, including after truncation.

    Args:
        title: Article title (None is treated like an empty title)
        max_length: Maximum slug length (default: 100)

    Returns:
        URL-safe slug, or "article" when nothing usable remains

    Examples:
        "How to Fix Your Engine" -> "how-to-fix-your-engine"
        "10 Best SEO Tips for 2024!" -> "10-best-seo-tips-for-2024"
        "C++ Programming Guide" -> "c-programming-guide"
    """
    # A missing title yields the "article" placeholder instead of raising
    # AttributeError on None.lower().
    slug = (title or "").lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[-\s]+', '-', slug)

    # Strip edge hyphens, truncate, then strip the right edge again: the cut
    # at max_length can land directly after a word and leave a dangling '-'.
    slug = slug.strip('-')[:max_length].rstrip('-')

    return slug or "article"
|
|
|
|
|
|
def generate_urls_for_batch(
    content_records: List[GeneratedContent],
    site_repo: SiteDeploymentRepository
) -> List[Dict]:
    """
    Generate final public URLs for a batch of articles

    Each distinct site is looked up and resolved to a hostname once per
    call; later articles assigned to the same site reuse that result.

    Args:
        content_records: List of GeneratedContent records (all should have site_deployment_id set)
        site_repo: SiteDeploymentRepository for looking up site details

    Returns:
        List of URL mappings: [{content_id, title, url, tier, slug, hostname}, ...]

    Raises:
        ValueError: If any article is missing site_deployment_id, the site
            lookup fails, or the site resolves to no hostname
    """
    url_mappings = []
    # site_deployment_id -> resolved hostname, so the repository is queried
    # once per distinct site instead of once per article.
    hostname_by_site_id: Dict = {}

    for content in content_records:
        site_id = content.site_deployment_id
        if not site_id:
            raise ValueError(
                f"Content ID {content.id} is missing site_deployment_id. "
                "All articles must be assigned to a site before URL generation."
            )

        if site_id not in hostname_by_site_id:
            site = site_repo.get_by_id(site_id)
            if not site:
                raise ValueError(
                    f"Site deployment ID {site_id} not found for content ID {content.id}"
                )

            resolved = get_site_hostname(site)
            # Refuse to build 'https://None/...' style URLs for a site that
            # has no hostname configured at all.
            if not resolved:
                raise ValueError(
                    f"Site deployment ID {site_id} has no hostname configured "
                    f"(content ID {content.id})"
                )
            hostname_by_site_id[site_id] = resolved

        hostname = hostname_by_site_id[site_id]
        slug = generate_slug(content.title)

        # generate_slug returns the "article" placeholder when the title has
        # no usable characters; make the slug unique per record instead.
        if not slug or slug == "article":
            slug = f"article-{content.id}"
            logger.warning(
                f"Empty slug generated for content ID {content.id}, using fallback: {slug}"
            )

        url = f"https://{hostname}/{slug}.html"

        url_mappings.append({
            "content_id": content.id,
            "title": content.title,
            "url": url,
            "tier": content.tier,
            "slug": slug,
            "hostname": hostname
        })

        logger.info(f"Generated URL for content_id={content.id}: {url}")

    return url_mappings
|
|
|
|
|
|
def generate_public_url(site: SiteDeployment, file_path: str) -> str:
    """
    Generate full public URL for a file path on a site

    The hostname comes from get_site_hostname(site). Leading slashes on
    file_path are ignored, so 'about.html' and '/about.html' produce the
    same URL.

    Args:
        site: SiteDeployment record with hostname information
        file_path: File path within storage zone (e.g., 'my-article.html')

    Returns:
        Full HTTPS URL (e.g., 'https://example.com/my-article.html')

    Examples:
        site with custom_hostname='www.example.com', file_path='about.html'
        -> 'https://www.example.com/about.html'

        site with pull_zone_bcdn_hostname='mysite.b-cdn.net', file_path='article.html'
        -> 'https://mysite.b-cdn.net/article.html'

        S3 site with s3_custom_domain='cdn.example.com', file_path='article.html'
        -> 'https://cdn.example.com/article.html'

        S3 site without custom domain, file_path='article.html'
        -> 'https://bucket-name.s3-website-region.amazonaws.com/article.html'
    """
    hostname = get_site_hostname(site)
    # The separator slash is added here; strip any the caller supplied so the
    # result never contains 'https://host//path'.
    relative_path = file_path.lstrip('/')
    return f"https://{hostname}/{relative_path}"
|
|
|
|
|
|
def generate_file_path(content: GeneratedContent) -> str:
    """
    Build the storage file name for a piece of generated content

    The name is the slug of the content title plus '.html'. When the slug
    is empty or equals the "article" placeholder, 'article-<content.id>' is
    used instead and a warning is logged.

    Args:
        content: GeneratedContent record

    Returns:
        File path with .html extension (e.g., 'my-article-slug.html')
    """
    title_slug = generate_slug(content.title)

    # Normal case: the title produced a usable, non-placeholder slug.
    if title_slug and title_slug != "article":
        return f"{title_slug}.html"

    slug = f"article-{content.id}"
    logger.warning(f"Empty slug for content {content.id}, using fallback: {slug}")
    return f"{slug}.html"
|
|
|
|
|
|
def generate_page_file_path(page: SitePage) -> str:
    """
    Build the storage file name for a boilerplate page

    Args:
        page: SitePage record; its page_type is used as the file stem

    Returns:
        File path with .html extension (e.g., 'about.html', 'contact.html')
    """
    page_type = page.page_type
    return "{}.html".format(page_type)
|
|
|