# Big-Link-Man/src/generation/service.py
"""
Content generation service with three-stage pipeline
"""
import re
import json
from html import unescape
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple
from src.generation.ai_client import AIClient, PromptManager
from src.database.repositories import ProjectRepository, GeneratedContentRepository, SiteDeploymentRepository
from src.templating.service import TemplateService
class ContentGenerator:
"""Main service for generating content through AI pipeline"""
def __init__(
self,
ai_client: AIClient,
prompt_manager: PromptManager,
project_repo: ProjectRepository,
content_repo: GeneratedContentRepository,
template_service: Optional[TemplateService] = None,
site_deployment_repo: Optional[SiteDeploymentRepository] = None
):
self.ai_client = ai_client
self.prompt_manager = prompt_manager
self.project_repo = project_repo
self.content_repo = content_repo
self.template_service = template_service or TemplateService(content_repo)
self.site_deployment_repo = site_deployment_repo
def generate_title(self, project_id: int, debug: bool = False, model: Optional[str] = None) -> str:
"""
Generate SEO-optimized title
Args:
project_id: Project ID to generate title for
debug: If True, save response to debug_output/
model: Optional model override for this generation stage
Returns:
Generated title string
"""
project = self.project_repo.get_by_id(project_id)
if not project:
raise ValueError(f"Project {project_id} not found")
entities_str = ", ".join(project.entities or [])
related_str = ", ".join(project.related_searches or [])
system_msg, user_prompt = self.prompt_manager.format_prompt(
"title_generation",
keyword=project.main_keyword,
entities=entities_str,
related_searches=related_str
)
title = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=100,
temperature=0.7,
override_model=model
)
title = title.strip().strip('"').strip("'")
if debug:
self._save_debug_output(
project_id, "title", title, "txt"
)
return title
def generate_outline(
self,
project_id: int,
title: str,
min_h2: int,
max_h2: int,
min_h3: int,
max_h3: int,
debug: bool = False,
model: Optional[str] = None
) -> dict:
"""
Generate article outline in JSON format
Args:
project_id: Project ID
title: Article title
min_h2: Minimum H2 headings
max_h2: Maximum H2 headings
min_h3: Minimum H3 subheadings total
max_h3: Maximum H3 subheadings total
debug: If True, save response to debug_output/
model: Optional model override for this generation stage
Returns:
Outline dictionary: {"outline": [{"h2": "...", "h3": ["...", "..."]}]}
Raises:
ValueError: If outline doesn't meet minimum requirements
"""
project = self.project_repo.get_by_id(project_id)
if not project:
raise ValueError(f"Project {project_id} not found")
entities_str = ", ".join(project.entities or [])
related_str = ", ".join(project.related_searches or [])
system_msg, user_prompt = self.prompt_manager.format_prompt(
"outline_generation",
title=title,
keyword=project.main_keyword,
min_h2=min_h2,
max_h2=max_h2,
min_h3=min_h3,
max_h3=max_h3,
entities=entities_str,
related_searches=related_str
)
outline_json = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=2000,
temperature=0.7,
json_mode=True,
override_model=model
)
print(f"[DEBUG] Raw outline response: {outline_json}")
# Save raw response immediately
if debug:
self._save_debug_output(project_id, "outline_raw", outline_json, "txt")
print(f"[DEBUG] Raw outline response: {outline_json}")
try:
outline = json.loads(outline_json)
except json.JSONDecodeError as e:
if debug:
self._save_debug_output(project_id, "outline_error", outline_json, "txt")
raise ValueError(f"Failed to parse outline JSON: {e}\nResponse: {outline_json[:500]}")
if "outline" not in outline:
if debug:
self._save_debug_output(project_id, "outline_invalid", json.dumps(outline, indent=2), "json")
raise ValueError(f"Outline missing 'outline' key. Got keys: {list(outline.keys())}\nContent: {outline}")
h2_count = len(outline["outline"])
h3_count = sum(len(section.get("h3", [])) for section in outline["outline"])
if h2_count < min_h2:
raise ValueError(f"Outline has {h2_count} H2s, minimum is {min_h2}")
if h3_count < min_h3:
raise ValueError(f"Outline has {h3_count} H3s, minimum is {min_h3}")
if debug:
self._save_debug_output(
project_id, "outline", json.dumps(outline, indent=2), "json"
)
return outline
def generate_content(
self,
project_id: int,
title: str,
outline: dict,
min_word_count: int,
max_word_count: int,
debug: bool = False,
model: Optional[str] = None
) -> str:
"""
Generate full article HTML fragment
Args:
project_id: Project ID
title: Article title
outline: Article outline dict
min_word_count: Minimum word count for guidance
max_word_count: Maximum word count for guidance
debug: If True, save response to debug_output/
model: Optional model override for this generation stage
Returns:
HTML string with <h2>, <h3>, <p> tags
"""
project = self.project_repo.get_by_id(project_id)
if not project:
raise ValueError(f"Project {project_id} not found")
entities_str = ", ".join(project.entities or [])
related_str = ", ".join(project.related_searches or [])
outline_str = json.dumps(outline, indent=2)
system_msg, user_prompt = self.prompt_manager.format_prompt(
"content_generation",
title=title,
outline=outline_str,
keyword=project.main_keyword,
entities=entities_str,
related_searches=related_str,
min_word_count=min_word_count,
max_word_count=max_word_count
)
content = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=8000,
temperature=0.7,
override_model=model
)
content = content.strip()
if debug:
self._save_debug_output(
project_id, "content", content, "html"
)
return content
def validate_word_count(self, content: str, min_words: int, max_words: int) -> Tuple[bool, int]:
"""
Validate content word count
Args:
content: HTML content string
min_words: Minimum word count
max_words: Maximum word count
Returns:
Tuple of (is_valid, actual_count)
"""
word_count = self.count_words(content)
is_valid = min_words <= word_count <= max_words
return is_valid, word_count
def count_words(self, html_content: str) -> int:
"""
Count words in HTML content
Args:
html_content: HTML string
Returns:
Number of words
"""
text = re.sub(r'<[^>]+>', '', html_content)
text = unescape(text)
words = text.split()
return len(words)
def augment_content(
self,
content: str,
target_word_count: int,
debug: bool = False,
project_id: Optional[int] = None,
model: Optional[str] = None
) -> str:
"""
Expand article content to meet minimum word count
Args:
content: Current HTML content
target_word_count: Target word count
debug: If True, save response to debug_output/
project_id: Optional project ID for debug output
model: Optional model override for this generation stage
Returns:
Expanded HTML content
"""
system_msg, user_prompt = self.prompt_manager.format_prompt(
"content_augmentation",
content=content,
target_word_count=target_word_count
)
augmented = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=8000,
temperature=0.7,
override_model=model
)
augmented = augmented.strip()
if debug and project_id:
self._save_debug_output(
project_id, "augmented", augmented, "html"
)
return augmented
def apply_template(
self,
content_id: int,
meta_description: Optional[str] = None
) -> bool:
"""
Apply HTML template to generated content and save to database
Args:
content_id: GeneratedContent ID to format
meta_description: Optional meta description (defaults to truncated content)
Returns:
True if successful, False otherwise
"""
try:
content_record = self.content_repo.get_by_id(content_id)
if not content_record:
print(f"Warning: Content {content_id} not found")
return False
if not meta_description:
text = re.sub(r'<[^>]+>', '', content_record.content)
text = unescape(text)
words = text.split()[:25]
meta_description = ' '.join(words) + '...'
template_name = self.template_service.select_template_for_content(
site_deployment_id=content_record.site_deployment_id,
site_deployment_repo=self.site_deployment_repo
)
formatted_html = self.template_service.format_content(
content=content_record.content,
title=content_record.title,
meta_description=meta_description,
template_name=template_name
)
content_record.formatted_html = formatted_html
content_record.template_used = template_name
self.content_repo.update(content_record)
print(f"Applied template '{template_name}' to content {content_id}")
return True
except Exception as e:
print(f"Error applying template to content {content_id}: {e}")
return False
def _save_debug_output(
self,
project_id: int,
stage: str,
content: str,
extension: str,
tier: Optional[str] = None,
article_num: Optional[int] = None
):
"""Save debug output to file"""
debug_dir = Path("debug_output")
debug_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
tier_part = f"_tier{tier}" if tier else ""
article_part = f"_article{article_num}" if article_num else ""
filename = f"{stage}_project{project_id}{tier_part}{article_part}_{timestamp}.{extension}"
filepath = debug_dir / filename
with open(filepath, 'w', encoding='utf-8') as f:
f.write(content)