# Big-Link-Man/src/generation/service.py
"""
Content generation service with three-stage pipeline
"""
import re
import json
from html import unescape
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple
from src.generation.ai_client import AIClient, PromptManager
from src.database.repositories import ProjectRepository, GeneratedContentRepository, SiteDeploymentRepository
from src.templating.service import TemplateService
class ContentGenerator:
"""Main service for generating content through AI pipeline"""
def __init__(
self,
ai_client: AIClient,
prompt_manager: PromptManager,
project_repo: ProjectRepository,
content_repo: GeneratedContentRepository,
template_service: Optional[TemplateService] = None,
site_deployment_repo: Optional[SiteDeploymentRepository] = None
):
self.ai_client = ai_client
self.prompt_manager = prompt_manager
self.project_repo = project_repo
self.content_repo = content_repo
self.template_service = template_service or TemplateService(content_repo)
self.site_deployment_repo = site_deployment_repo
def generate_title(self, project_id: int, debug: bool = False, model: Optional[str] = None) -> str:
"""
Generate SEO-optimized title
Args:
project_id: Project ID to generate title for
debug: If True, save response to debug_output/
model: Optional model override for this generation stage
Returns:
Generated title string
"""
project = self.project_repo.get_by_id(project_id)
if not project:
raise ValueError(f"Project {project_id} not found")
entities_str = ", ".join(project.entities or [])
related_str = ", ".join(project.related_searches or [])
system_msg, user_prompt = self.prompt_manager.format_prompt(
"title_generation",
keyword=project.main_keyword,
entities=entities_str,
related_searches=related_str
)
title = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=100,
temperature=0.7,
override_model=model
)
title = title.strip().strip('"').strip("'")
if debug:
self._save_debug_output(
project_id, "title", title, "txt"
)
return title
def generate_outline(
self,
project_id: int,
title: str,
min_h2: int,
max_h2: int,
min_h3: int,
max_h3: int,
debug: bool = False,
model: Optional[str] = None
) -> dict:
"""
Generate article outline in JSON format
Args:
project_id: Project ID
title: Article title
min_h2: Minimum H2 headings
max_h2: Maximum H2 headings
min_h3: Minimum H3 subheadings total
max_h3: Maximum H3 subheadings total
debug: If True, save response to debug_output/
model: Optional model override for this generation stage
Returns:
Outline dictionary: {"outline": [{"h2": "...", "h3": ["...", "..."]}]}
Raises:
ValueError: If outline doesn't meet minimum requirements
"""
project = self.project_repo.get_by_id(project_id)
if not project:
raise ValueError(f"Project {project_id} not found")
entities_str = ", ".join(project.entities or [])
related_str = ", ".join(project.related_searches or [])
system_msg, user_prompt = self.prompt_manager.format_prompt(
"outline_generation",
title=title,
keyword=project.main_keyword,
min_h2=min_h2,
max_h2=max_h2,
min_h3=min_h3,
max_h3=max_h3,
entities=entities_str,
related_searches=related_str
)
outline_json = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=2000,
temperature=0.7,
json_mode=True,
override_model=model
)
print(f"[DEBUG] Raw outline response: {outline_json}")
# Save raw response immediately
if debug:
self._save_debug_output(project_id, "outline_raw", outline_json, "txt")
print(f"[DEBUG] Raw outline response: {outline_json}")
try:
outline = json.loads(outline_json)
except json.JSONDecodeError as e:
if debug:
self._save_debug_output(project_id, "outline_error", outline_json, "txt")
raise ValueError(f"Failed to parse outline JSON: {e}\nResponse: {outline_json[:500]}")
if "outline" not in outline:
if debug:
self._save_debug_output(project_id, "outline_invalid", json.dumps(outline, indent=2), "json")
raise ValueError(f"Outline missing 'outline' key. Got keys: {list(outline.keys())}\nContent: {outline}")
h2_count = len(outline["outline"])
h3_count = sum(len(section.get("h3", [])) for section in outline["outline"])
if h2_count < min_h2:
raise ValueError(f"Outline has {h2_count} H2s, minimum is {min_h2}")
if h3_count < min_h3:
raise ValueError(f"Outline has {h3_count} H3s, minimum is {min_h3}")
if debug:
self._save_debug_output(
project_id, "outline", json.dumps(outline, indent=2), "json"
)
return outline
def generate_content(
self,
project_id: int,
title: str,
outline: dict,
min_word_count: int,
max_word_count: int,
debug: bool = False,
model: Optional[str] = None
) -> str:
"""
Generate full article HTML fragment
Args:
project_id: Project ID
title: Article title
outline: Article outline dict
min_word_count: Minimum word count for guidance
max_word_count: Maximum word count for guidance
debug: If True, save response to debug_output/
model: Optional model override for this generation stage
Returns:
HTML string with <h2>, <h3>, <p> tags
"""
project = self.project_repo.get_by_id(project_id)
if not project:
raise ValueError(f"Project {project_id} not found")
entities_str = ", ".join(project.entities or [])
related_str = ", ".join(project.related_searches or [])
outline_str = json.dumps(outline, indent=2)
system_msg, user_prompt = self.prompt_manager.format_prompt(
"content_generation",
title=title,
outline=outline_str,
keyword=project.main_keyword,
entities=entities_str,
related_searches=related_str,
min_word_count=min_word_count,
max_word_count=max_word_count
)
content = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=8000,
temperature=0.7,
override_model=model
)
content = content.strip()
if debug:
self._save_debug_output(
project_id, "content", content, "html"
)
return content
def validate_word_count(self, content: str, min_words: int, max_words: int) -> Tuple[bool, int]:
"""
Validate content word count
Args:
content: HTML content string
min_words: Minimum word count
max_words: Maximum word count
Returns:
Tuple of (is_valid, actual_count)
"""
word_count = self.count_words(content)
is_valid = min_words <= word_count <= max_words
return is_valid, word_count
def count_words(self, html_content: str) -> int:
"""
Count words in HTML content
Args:
html_content: HTML string
Returns:
Number of words
"""
text = re.sub(r'<[^>]+>', '', html_content)
text = unescape(text)
words = text.split()
return len(words)
def augment_content(
self,
content: str,
target_word_count: int,
debug: bool = False,
project_id: Optional[int] = None,
model: Optional[str] = None
) -> str:
"""
Expand article content to meet minimum word count
Args:
content: Current HTML content
target_word_count: Target word count
debug: If True, save response to debug_output/
project_id: Optional project ID for debug output
model: Optional model override for this generation stage
Returns:
Expanded HTML content
"""
system_msg, user_prompt = self.prompt_manager.format_prompt(
"content_augmentation",
content=content,
target_word_count=target_word_count
)
augmented = self.ai_client.generate_completion(
prompt=user_prompt,
system_message=system_msg,
max_tokens=8000,
temperature=0.7,
override_model=model
)
augmented = augmented.strip()
if debug and project_id:
self._save_debug_output(
project_id, "augmented", augmented, "html"
)
return augmented
def apply_template(
self,
content_id: int,
meta_description: Optional[str] = None
) -> bool:
"""
Apply HTML template to generated content and save to database
Args:
content_id: GeneratedContent ID to format
meta_description: Optional meta description (defaults to truncated content)
Returns:
True if successful, False otherwise
"""
try:
content_record = self.content_repo.get_by_id(content_id)
if not content_record:
print(f"Warning: Content {content_id} not found")
return False
if not meta_description:
text = re.sub(r'<[^>]+>', '', content_record.content)
text = unescape(text)
words = text.split()[:25]
meta_description = ' '.join(words) + '...'
template_name = self.template_service.select_template_for_content(
site_deployment_id=content_record.site_deployment_id,
site_deployment_repo=self.site_deployment_repo
)
formatted_html = self.template_service.format_content(
content=content_record.content,
title=content_record.title,
meta_description=meta_description,
template_name=template_name
)
content_record.formatted_html = formatted_html
content_record.template_used = template_name
self.content_repo.update(content_record)
print(f"Applied template '{template_name}' to content {content_id}")
return True
except Exception as e:
print(f"Error applying template to content {content_id}: {e}")
return False
def _save_debug_output(
self,
project_id: int,
stage: str,
content: str,
extension: str,
tier: Optional[str] = None,
article_num: Optional[int] = None
):
"""Save debug output to file"""
debug_dir = Path("debug_output")
debug_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
tier_part = f"_tier{tier}" if tier else ""
article_part = f"_article{article_num}" if article_num else ""
filename = f"{stage}_project{project_id}{tier_part}{article_part}_{timestamp}.{extension}"
filepath = debug_dir / filename
with open(filepath, 'w', encoding='utf-8') as f:
f.write(content)