Big-Link-Man/src/generation/url_generator.py

94 lines
2.7 KiB
Python

"""
URL generation logic for generated content
"""
import re
import logging
from typing import List, Dict
from src.database.models import GeneratedContent
from src.database.repositories import SiteDeploymentRepository
logger = logging.getLogger(__name__)
def generate_slug(title: str, max_length: int = 100) -> str:
    """
    Generate URL-safe slug from article title

    The title is lowercased, every character that is not a regex word
    character, whitespace, or a hyphen is removed, and each run of
    whitespace and/or hyphens is collapsed into a single hyphen. The result
    is trimmed of leading/trailing hyphens and cut to ``max_length``.

    Args:
        title: Article title. ``None`` or an empty string is treated as a
            title with no usable characters.
        max_length: Maximum slug length (default: 100)

    Returns:
        URL-safe slug that never starts or ends with a hyphen, or the
        literal "article" when nothing usable remains.

    Examples:
        "How to Fix Your Engine" -> "how-to-fix-your-engine"
        "10 Best SEO Tips for 2024!" -> "10-best-seo-tips-for-2024"
        "C++ Programming Guide" -> "c-programming-guide"
    """
    # Tolerate a missing title so the caller gets the fallback slug instead
    # of an AttributeError from None.lower().
    slug = (title or "").lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[-\s]+', '-', slug)
    # Truncation can cut right after a separator, so strip hyphens again
    # once the slug has been shortened.
    slug = slug.strip('-')[:max_length].rstrip('-')
    return slug or "article"
def generate_urls_for_batch(
    content_records: List[GeneratedContent],
    site_repo: SiteDeploymentRepository
) -> List[Dict]:
    """
    Generate final public URLs for a batch of articles

    Each URL has the form ``https://<hostname>/<slug>.html`` where the
    hostname is the site's ``custom_hostname`` if set, otherwise its
    ``pull_zone_bcdn_hostname``, and the slug is derived from the title.

    Args:
        content_records: List of GeneratedContent records (all should have site_deployment_id set)
        site_repo: SiteDeploymentRepository for looking up site details

    Returns:
        List of URL mappings, one per record and in input order:
        [{content_id, title, url, tier, slug, hostname}, ...]

    Raises:
        ValueError: If any article is missing site_deployment_id, the site
            lookup fails, or the site has no hostname to build a URL from
    """
    url_mappings = []
    # Records in a batch may point at the same site; remember each lookup so
    # the repository is queried at most once per site id.
    sites_by_id = {}

    for content in content_records:
        site_id = content.site_deployment_id
        if not site_id:
            raise ValueError(
                f"Content ID {content.id} is missing site_deployment_id. "
                "All articles must be assigned to a site before URL generation."
            )

        site = sites_by_id.get(site_id)
        if site is None:
            site = site_repo.get_by_id(site_id)
            if not site:
                raise ValueError(
                    f"Site deployment ID {site_id} not found for content ID {content.id}"
                )
            sites_by_id[site_id] = site

        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        if not hostname:
            # Without this guard the URL would be built as "https://None/...".
            raise ValueError(
                f"Site deployment ID {site_id} has no hostname configured "
                f"for content ID {content.id}"
            )

        slug = generate_slug(content.title)
        # "article" is the value generate_slug returns when the title yields
        # no usable characters; suffix the content id so such fallback slugs
        # do not all share one URL.
        if not slug or slug == "article":
            slug = f"article-{content.id}"
            logger.warning(
                "Empty slug generated for content ID %s, using fallback: %s",
                content.id,
                slug,
            )

        url = f"https://{hostname}/{slug}.html"
        url_mappings.append({
            "content_id": content.id,
            "title": content.title,
            "url": url,
            "tier": content.tier,
            "slug": slug,
            "hostname": hostname,
        })
        logger.info("Generated URL for content_id=%s: %s", content.id, url)

    return url_mappings