"""Content generation service with a three-stage pipeline (title, outline, content)."""
|
|
|
|
import re
|
|
import json
|
|
from html import unescape
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional, Tuple, List
|
|
from src.generation.ai_client import AIClient, PromptManager
|
|
from src.database.repositories import ProjectRepository, GeneratedContentRepository, SiteDeploymentRepository
|
|
from src.templating.service import TemplateService
|
|
|
|
|
|
class ContentGenerator:
|
|
"""Main service for generating content through AI pipeline"""
|
|
|
|
    def __init__(
        self,
        ai_client: AIClient,
        prompt_manager: PromptManager,
        project_repo: ProjectRepository,
        content_repo: GeneratedContentRepository,
        template_service: Optional[TemplateService] = None,
        site_deployment_repo: Optional[SiteDeploymentRepository] = None
    ):
        """
        Initialize the generator with its injected collaborators.

        Args:
            ai_client: Client used for all AI completion calls
            prompt_manager: Loads and formats named prompt templates
            project_repo: Repository for Project records
            content_repo: Repository for GeneratedContent records
            template_service: Optional HTML templating service; a default
                TemplateService wrapping content_repo is created when omitted
            site_deployment_repo: Optional repository used for per-site
                template selection (may be None)
        """
        self.ai_client = ai_client
        self.prompt_manager = prompt_manager
        self.project_repo = project_repo
        self.content_repo = content_repo
        # Build a default template service if the caller did not supply one.
        self.template_service = template_service or TemplateService(content_repo)
        self.site_deployment_repo = site_deployment_repo
|
|
|
|
def generate_title(self, project_id: int, debug: bool = False, model: Optional[str] = None) -> str:
|
|
"""
|
|
Generate SEO-optimized title
|
|
|
|
Args:
|
|
project_id: Project ID to generate title for
|
|
debug: If True, save response to debug_output/
|
|
model: Optional model override for this generation stage
|
|
|
|
Returns:
|
|
Generated title string
|
|
"""
|
|
project = self.project_repo.get_by_id(project_id)
|
|
if not project:
|
|
raise ValueError(f"Project {project_id} not found")
|
|
|
|
entities_str = ", ".join(project.entities or [])
|
|
related_str = ", ".join(project.related_searches or [])
|
|
|
|
system_msg, user_prompt = self.prompt_manager.format_prompt(
|
|
"title_generation",
|
|
keyword=project.main_keyword,
|
|
entities=entities_str,
|
|
related_searches=related_str
|
|
)
|
|
|
|
title, _ = self.ai_client.generate_completion(
|
|
prompt=user_prompt,
|
|
system_message=system_msg,
|
|
max_tokens=100,
|
|
temperature=0.7,
|
|
override_model=model
|
|
)
|
|
|
|
title = title.strip().strip('"').strip("'")
|
|
|
|
if debug:
|
|
self._save_debug_output(
|
|
project_id, "title", title, "txt"
|
|
)
|
|
|
|
return title
|
|
|
|
def generate_titles_batch(
|
|
self,
|
|
project_id: int,
|
|
count: int,
|
|
batch_size: int = 25,
|
|
debug: bool = False,
|
|
model: Optional[str] = None
|
|
) -> List[str]:
|
|
"""
|
|
Generate multiple titles in batches
|
|
|
|
Args:
|
|
project_id: Project ID to generate titles for
|
|
count: Total number of titles needed
|
|
batch_size: Number of titles per AI call (default: 25)
|
|
debug: If True, save responses to debug_output/
|
|
model: Optional model override for this generation stage
|
|
|
|
Returns:
|
|
List of generated title strings
|
|
"""
|
|
project = self.project_repo.get_by_id(project_id)
|
|
if not project:
|
|
raise ValueError(f"Project {project_id} not found")
|
|
|
|
entities_str = ", ".join(project.entities or [])
|
|
related_str = ", ".join(project.related_searches or [])
|
|
|
|
all_titles = []
|
|
titles_remaining = count
|
|
|
|
while titles_remaining > 0:
|
|
current_batch_size = min(batch_size, titles_remaining)
|
|
|
|
system_msg, user_prompt = self.prompt_manager.format_prompt(
|
|
"batch_title_generation",
|
|
keyword=project.main_keyword,
|
|
entities=entities_str,
|
|
related_searches=related_str,
|
|
count=current_batch_size
|
|
)
|
|
|
|
batch_titles = None
|
|
for attempt in range(3):
|
|
try:
|
|
response, _ = self.ai_client.generate_completion(
|
|
prompt=user_prompt,
|
|
system_message=system_msg,
|
|
max_tokens=100 * current_batch_size,
|
|
temperature=0.7,
|
|
override_model=model
|
|
)
|
|
|
|
lines = response.strip().split('\n')
|
|
batch_titles = []
|
|
|
|
for line in lines:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
line = re.sub(r'^\d+[\.\)]\s*', '', line)
|
|
line = line.strip('"').strip("'")
|
|
if line:
|
|
batch_titles.append(line)
|
|
|
|
if len(batch_titles) < current_batch_size:
|
|
print(f"Warning: Requested {current_batch_size} titles but received {len(batch_titles)}. Continuing with partial batch.")
|
|
|
|
if len(batch_titles) > current_batch_size:
|
|
batch_titles = batch_titles[:current_batch_size]
|
|
|
|
break
|
|
|
|
except Exception as e:
|
|
if attempt == 2:
|
|
raise ValueError(f"Failed to generate batch after 3 attempts: {e}")
|
|
print(f"Batch generation attempt {attempt + 1} failed: {e}, retrying...")
|
|
|
|
if batch_titles:
|
|
all_titles.extend(batch_titles)
|
|
titles_remaining -= len(batch_titles)
|
|
else:
|
|
raise ValueError("Failed to generate any titles in batch")
|
|
|
|
if debug:
|
|
for i, title in enumerate(all_titles, 1):
|
|
self._save_debug_output(
|
|
project_id, f"batch_title_{i}", title, "txt"
|
|
)
|
|
|
|
return all_titles
|
|
|
|
def generate_outline(
|
|
self,
|
|
project_id: int,
|
|
title: str,
|
|
min_h2: int,
|
|
max_h2: int,
|
|
min_h3: int,
|
|
max_h3: int,
|
|
debug: bool = False,
|
|
model: Optional[str] = None
|
|
) -> dict:
|
|
"""
|
|
Generate article outline in JSON format
|
|
|
|
Args:
|
|
project_id: Project ID
|
|
title: Article title
|
|
min_h2: Minimum H2 headings
|
|
max_h2: Maximum H2 headings
|
|
min_h3: Minimum H3 subheadings total
|
|
max_h3: Maximum H3 subheadings total
|
|
debug: If True, save response to debug_output/
|
|
model: Optional model override for this generation stage
|
|
|
|
Returns:
|
|
Outline dictionary: {"outline": [{"h2": "...", "h3": ["...", "..."]}]}
|
|
|
|
Raises:
|
|
ValueError: If outline doesn't meet minimum requirements
|
|
"""
|
|
project = self.project_repo.get_by_id(project_id)
|
|
if not project:
|
|
raise ValueError(f"Project {project_id} not found")
|
|
|
|
entities_str = ", ".join(project.entities or [])
|
|
related_str = ", ".join(project.related_searches or [])
|
|
|
|
system_msg, user_prompt = self.prompt_manager.format_prompt(
|
|
"outline_generation",
|
|
title=title,
|
|
keyword=project.main_keyword,
|
|
min_h2=min_h2,
|
|
max_h2=max_h2,
|
|
min_h3=min_h3,
|
|
max_h3=max_h3,
|
|
entities=entities_str,
|
|
related_searches=related_str
|
|
)
|
|
|
|
outline_json, _ = self.ai_client.generate_completion(
|
|
prompt=user_prompt,
|
|
system_message=system_msg,
|
|
max_tokens=2000,
|
|
temperature=0.7,
|
|
json_mode=True,
|
|
override_model=model
|
|
)
|
|
print(f"[DEBUG] Raw outline response: {outline_json}")
|
|
# Save raw response immediately
|
|
if debug:
|
|
self._save_debug_output(project_id, "outline_raw", outline_json, "txt")
|
|
print(f"[DEBUG] Raw outline response: {outline_json}")
|
|
|
|
try:
|
|
outline = json.loads(outline_json)
|
|
except json.JSONDecodeError as e:
|
|
if debug:
|
|
self._save_debug_output(project_id, "outline_error", outline_json, "txt")
|
|
raise ValueError(f"Failed to parse outline JSON: {e}\nResponse: {outline_json[:500]}")
|
|
|
|
if "outline" not in outline:
|
|
if debug:
|
|
self._save_debug_output(project_id, "outline_invalid", json.dumps(outline, indent=2), "json")
|
|
raise ValueError(f"Outline missing 'outline' key. Got keys: {list(outline.keys())}\nContent: {outline}")
|
|
|
|
h2_count = len(outline["outline"])
|
|
h3_count = sum(len(section.get("h3", [])) for section in outline["outline"])
|
|
|
|
if h2_count < min_h2:
|
|
raise ValueError(f"Outline has {h2_count} H2s, minimum is {min_h2}")
|
|
|
|
if h3_count < min_h3:
|
|
raise ValueError(f"Outline has {h3_count} H3s, minimum is {min_h3}")
|
|
|
|
if debug:
|
|
self._save_debug_output(
|
|
project_id, "outline", json.dumps(outline, indent=2), "json"
|
|
)
|
|
|
|
return outline
|
|
|
|
def generate_content(
|
|
self,
|
|
project_id: int,
|
|
title: str,
|
|
outline: dict,
|
|
min_word_count: int,
|
|
max_word_count: int,
|
|
debug: bool = False,
|
|
model: Optional[str] = None
|
|
) -> str:
|
|
"""
|
|
Generate full article HTML fragment
|
|
|
|
Compensates for AI undershoot by adding 200 words to targets.
|
|
|
|
Args:
|
|
project_id: Project ID
|
|
title: Article title
|
|
outline: Article outline dict
|
|
min_word_count: Minimum word count for guidance
|
|
max_word_count: Maximum word count for guidance
|
|
debug: If True, save response to debug_output/
|
|
model: Optional model override for this generation stage
|
|
|
|
Returns:
|
|
Tuple of (HTML string with <h2>, <h3>, <p> tags, finish_reason)
|
|
"""
|
|
project = self.project_repo.get_by_id(project_id)
|
|
if not project:
|
|
raise ValueError(f"Project {project_id} not found")
|
|
|
|
entities_str = ", ".join(project.entities or [])
|
|
related_str = ", ".join(project.related_searches or [])
|
|
outline_str = json.dumps(outline, indent=2)
|
|
|
|
compensated_min = min_word_count + 100
|
|
compensated_max = max_word_count + 100
|
|
|
|
h3_count = sum(len(section.get("h3", [])) for section in outline.get("outline", []))
|
|
words_per_section = int(compensated_min / h3_count) if h3_count > 0 else 100
|
|
|
|
system_msg, user_prompt = self.prompt_manager.format_prompt(
|
|
"content_generation",
|
|
title=title,
|
|
outline=outline_str,
|
|
keyword=project.main_keyword,
|
|
entities=entities_str,
|
|
related_searches=related_str,
|
|
min_word_count=compensated_min,
|
|
max_word_count=compensated_max,
|
|
words_per_section=words_per_section
|
|
)
|
|
|
|
content, finish_reason = self.ai_client.generate_completion(
|
|
prompt=user_prompt,
|
|
system_message=system_msg,
|
|
max_tokens=12000,
|
|
temperature=0.7,
|
|
override_model=model,
|
|
title=title
|
|
)
|
|
|
|
content = content.strip()
|
|
content = self._clean_markdown_fences(content)
|
|
|
|
if debug:
|
|
self._save_debug_output(
|
|
project_id, "content", content, "html"
|
|
)
|
|
|
|
return content, finish_reason
|
|
|
|
def validate_word_count(self, content: str, min_words: int, max_words: int) -> Tuple[bool, int]:
|
|
"""
|
|
Validate content word count
|
|
|
|
Args:
|
|
content: HTML content string
|
|
min_words: Minimum word count
|
|
max_words: Maximum word count
|
|
|
|
Returns:
|
|
Tuple of (is_valid, actual_count)
|
|
"""
|
|
word_count = self.count_words(content)
|
|
is_valid = min_words <= word_count <= max_words
|
|
return is_valid, word_count
|
|
|
|
def count_words(self, html_content: str) -> int:
|
|
"""
|
|
Count words in HTML content
|
|
|
|
Args:
|
|
html_content: HTML string
|
|
|
|
Returns:
|
|
Number of words
|
|
"""
|
|
text = re.sub(r'<[^>]+>', '', html_content)
|
|
text = unescape(text)
|
|
words = text.split()
|
|
return len(words)
|
|
|
|
def augment_content(
|
|
self,
|
|
content: str,
|
|
target_word_count: int,
|
|
debug: bool = False,
|
|
project_id: Optional[int] = None,
|
|
model: Optional[str] = None
|
|
) -> str:
|
|
"""
|
|
Expand article content to meet minimum word count
|
|
|
|
Args:
|
|
content: Current HTML content
|
|
target_word_count: Target word count
|
|
debug: If True, save response to debug_output/
|
|
project_id: Optional project ID for debug output
|
|
model: Optional model override for this generation stage
|
|
|
|
Returns:
|
|
Expanded HTML content
|
|
"""
|
|
system_msg, user_prompt = self.prompt_manager.format_prompt(
|
|
"content_augmentation",
|
|
content=content,
|
|
target_word_count=target_word_count
|
|
)
|
|
|
|
augmented, _ = self.ai_client.generate_completion(
|
|
prompt=user_prompt,
|
|
system_message=system_msg,
|
|
max_tokens=8000,
|
|
temperature=0.7,
|
|
override_model=model
|
|
)
|
|
|
|
augmented = augmented.strip()
|
|
augmented = self._clean_markdown_fences(augmented)
|
|
|
|
if debug and project_id:
|
|
self._save_debug_output(
|
|
project_id, "augmented", augmented, "html"
|
|
)
|
|
|
|
return augmented
|
|
|
|
    def apply_template(
        self,
        content_id: int,
        meta_description: Optional[str] = None,
        canonical_url: Optional[str] = None
    ) -> bool:
        """
        Apply HTML template to generated content and save to database.

        Best-effort: any failure is logged and reported as False rather than
        raised to the caller.

        Args:
            content_id: GeneratedContent ID to format
            meta_description: Optional meta description (defaults to the first
                25 words of the tag-stripped article text)
            canonical_url: Optional canonical URL for SEO canonical link tag

        Returns:
            True if successful, False otherwise
        """
        try:
            # Refresh to ensure we have latest content (especially after image reinsertion)
            content_record = self.content_repo.get_by_id(content_id)
            if not content_record:
                print(f"Warning: Content {content_id} not found")
                return False

            # Force refresh from database to get latest content
            self.content_repo.session.refresh(content_record)

            if not meta_description:
                # Derive a fallback description: strip tags, unescape entities,
                # take the first 25 words, and append an ellipsis.
                text = re.sub(r'<[^>]+>', '', content_record.content)
                text = unescape(text)
                words = text.split()[:25]
                meta_description = ' '.join(words) + '...'

            # Template may vary per site deployment; the repo may be None.
            template_name = self.template_service.select_template_for_content(
                site_deployment_id=content_record.site_deployment_id,
                site_deployment_repo=self.site_deployment_repo
            )

            formatted_html = self.template_service.format_content(
                content=content_record.content,
                title=content_record.title,
                meta_description=meta_description,
                template_name=template_name,
                canonical_url=canonical_url
            )

            content_record.formatted_html = formatted_html
            content_record.template_used = template_name
            self.content_repo.update(content_record)

            # Verify it was saved
            self.content_repo.session.refresh(content_record)
            if content_record.template_used != template_name:
                print(f"ERROR: template_used not saved! Expected '{template_name}', got '{content_record.template_used}'")
                return False

            print(f"Applied template '{template_name}' to content {content_id}")
            return True

        except Exception as e:
            # Swallow and report: callers treat False as "templating failed".
            print(f"Error applying template to content {content_id}: {e}")
            import traceback
            traceback.print_exc()
            return False
|
|
|
|
def _clean_markdown_fences(self, content: str) -> str:
|
|
"""
|
|
Remove markdown code fences from AI-generated content
|
|
|
|
Args:
|
|
content: Raw content that may contain ```html or ``` markers
|
|
|
|
Returns:
|
|
Cleaned content without markdown code fences
|
|
"""
|
|
import re
|
|
|
|
content = re.sub(r'^```html\s*\n?', '', content, flags=re.MULTILINE)
|
|
content = re.sub(r'^```\s*$', '', content, flags=re.MULTILINE)
|
|
content = re.sub(r'\n```\s*$', '', content)
|
|
|
|
return content.strip()
|
|
|
|
def _save_debug_output(
|
|
self,
|
|
project_id: int,
|
|
stage: str,
|
|
content: str,
|
|
extension: str,
|
|
tier: Optional[str] = None,
|
|
article_num: Optional[int] = None
|
|
):
|
|
"""Save debug output to file"""
|
|
debug_dir = Path("debug_output")
|
|
debug_dir.mkdir(exist_ok=True)
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
|
tier_part = f"_tier{tier}" if tier else ""
|
|
article_part = f"_article{article_num}" if article_num else ""
|
|
|
|
filename = f"{stage}_project{project_id}{tier_part}{article_part}_{timestamp}.{extension}"
|
|
filepath = debug_dir / filename
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
f.write(content) |