diff --git a/JOB_FIELD_REFERENCE.md b/JOB_FIELD_REFERENCE.md index 047af97..5098353 100644 --- a/JOB_FIELD_REFERENCE.md +++ b/JOB_FIELD_REFERENCE.md @@ -28,6 +28,7 @@ min_h3_tags - Integer max_h3_tags - Integer models - {title, outline, content} - overrides job-level interlinking - {links_per_article_min, links_per_article_max, see_also_min, see_also_max} - overrides job-level +anchor_text_config - {mode, custom_text} - overrides job-level for this tier only ``` ## Field Behaviors @@ -38,10 +39,11 @@ interlinking - {links_per_article_min, links_per_article_max, see_also_ **models**: Use format "provider/model-name" (e.g., "openai/gpt-4o-mini") -**anchor_text_config**: Job-level only, applies to ALL tiers (no tier-specific option) +**anchor_text_config**: Can be set at job-level (all tiers) or tier-level (specific tier) - "default" = Use master.config.json tier rules -- "override" = Replace with custom_text for all tiers -- "append" = Add custom_text to tier rules for all tiers +- "override" = Replace with custom_text +- "append" = Add custom_text to tier rules +- Tier-level config overrides job-level config for that tier **tiered_link_count_range**: How many links to lower tier - Tier1: Always 1 link to money site (this setting ignored) diff --git a/docs/stories/story-2.6-batch-title-generation.md b/docs/stories/story-2.6-batch-title-generation.md new file mode 100644 index 0000000..910dab3 --- /dev/null +++ b/docs/stories/story-2.6-batch-title-generation.md @@ -0,0 +1,484 @@ +# Story 2.6: Batch Title Generation + +## Overview +Refactor title generation to generate all titles for a tier in batches before article generation begins. This prevents title similarity issues that occur when titles are generated sequentially one at a time. + +## Status +**PLANNED** + +## Story Details +**As a User**, I want all article titles for a tier to be generated together in batches, so that the AI can ensure title diversity and prevent repetitive titles. 
+ +## Acceptance Criteria + +### 1. Batch Title Generation Before Articles +**Status:** PENDING + +- All titles for a tier are generated before any article content generation begins +- Titles are generated in batches of 25 (or the tier count if less than 25) +- AI prompt instructs generation of N distinct titles in a single call +- Each batch request includes instructions to ensure title diversity + +### 2. Title File Persistence +**Status:** PENDING + +- Generated titles written to: `debug_output/project_{id}_tier_{name}_titles_{timestamp}.txt` +- One title per line +- File is written before article generation loop begins +- Titles loaded from file and used sequentially during article generation + +### 3. Console Output +**Status:** PENDING + +- Print complete list of generated titles to console after generation +- Show title count and batch information +- Format: numbered list for easy review + +### 4. Error Handling +**Status:** PENDING + +- Retry entire batch on generation failure (up to 3 attempts) +- Fail tier processing after 3 failed batch attempts +- If AI returns fewer titles than requested (e.g., 20 instead of 25): + - Log warning to console + - Continue with partial batch + - Generate the remaining titles in a follow-up batch (which may be as small as a single title) + +### 5. Existing Title Validation +**Status:** PENDING + +- Continue to validate individual titles (keyword presence, length) +- No new diversity or similarity validation required +- Existing validation logic unchanged + +### 6. Backward Compatibility +**Status:** PENDING + +- No changes to job file schema +- No changes to CLI interface +- Transparent change to users +- Article generation loop works with pre-generated titles + +## Implementation Details + +### Architecture Changes + +#### 1. 
New Prompt Template +**File:** `src/generation/prompts/batch_title_generation.json` + +**Format:** +```json +{ + "system_message": "You are an expert creative content writer who creates compelling, search-optimized titles that attract clicks while accurately representing the content topic. When generating multiple titles, ensure each takes a unique angle or approach to maximize diversity. Be creative - the titles just need to be tangentially related to the search topic {keyword}. ", + "user_prompt": "Generate {count} distinct, creative titles for articles about: {keyword}\n\nRelated entities: {entities}\nRelated searches: {related_searches}\n\nIMPORTANT: Each title should take a different angle or approach. Ensure diversity across all titles.\n\nReturn exactly {count} titles, one per line. No numbering, quotes, or formatting - just the title text." +} +``` + +#### 2. ContentGenerator Service Enhancement +**File:** `src/generation/service.py` + +**New Method:** +```python +def generate_titles_batch( + self, + project_id: int, + count: int, + batch_size: int = 25, + debug: bool = False, + model: Optional[str] = None +) -> List[str]: + """ + Generate multiple titles in batches + + Args: + project_id: Project ID to generate titles for + count: Total number of titles needed + batch_size: Number of titles per AI call (default: 25) + debug: If True, save responses to debug_output/ + model: Optional model override for this generation stage + + Returns: + List of generated title strings + """ + # Load project data + # Loop in batches of batch_size + # For each batch: + # - Call AI with batch_title_generation prompt + # - Parse newline-separated titles + # - Validate each title + # - Retry batch up to 3 times on failure + # - Warn if fewer titles returned than requested + # Aggregate all titles + # Return list +``` + +**Key Details:** +- Use max_tokens: 100 * batch_size (e.g., 2500 for 25 titles) +- Temperature: 0.7 (same as current) +- Parse response by splitting on 
newlines +- Strip whitespace, quotes, numbering from each line +- Validate each title using existing validation logic +- 3 retry attempts per batch + +#### 3. BatchProcessor Refactoring +**File:** `src/generation/batch_processor.py` + +**New Method:** +```python +def _generate_all_titles_for_tier( + self, + project_id: int, + tier_name: str, + tier_config: TierConfig, + debug: bool +) -> str: + """ + Generate all titles for a tier and save to file + + Args: + project_id: Project ID + tier_name: Name of tier (e.g., "tier1") + tier_config: Tier configuration + debug: Debug mode flag + + Returns: + Path to generated titles file + """ + # Generate timestamp + # Call service.generate_titles_batch(count=tier_config.count) + # Create filename: debug_output/project_{id}_tier_{name}_titles_{timestamp}.txt + # Write titles to file (one per line) + # Print titles to console (numbered list) + # Return file path +``` + +**Modified Method:** `_process_tier()` +```python +def _process_tier(...): + """Process a single tier with pre-generated titles""" + + # NEW: Generate all titles first + click.echo(f"\n[{tier_name}] Generating {tier_config.count} titles in batches...") + titles_file = self._generate_all_titles_for_tier( + project_id, tier_name, tier_config, debug + ) + + # NEW: Load titles from file + with open(titles_file, 'r', encoding='utf-8') as f: + titles = [line.strip() for line in f if line.strip()] + + click.echo(f"[{tier_name}] Generated {len(titles)} titles") + click.echo(f"[{tier_name}] Titles saved to: {titles_file}") + + # NEW: Print titles to console + click.echo(f"\n[{tier_name}] Title List:") + for i, title in enumerate(titles, 1): + click.echo(f" {i}. 
{title}") + click.echo() + + # EXISTING: Loop through articles + for article_num in range(1, tier_config.count + 1): + article_index = article_num - 1 + + # NEW: Get pre-generated title + if article_index < len(titles): + title = titles[article_index] + else: + click.echo(f" Warning: Not enough titles generated, skipping article {article_num}") + continue + + # MODIFIED: Call with pre-generated title + self._generate_single_article( + project_id=project_id, + tier_name=tier_name, + tier_config=tier_config, + article_num=article_num, + article_index=article_index, + title=title, # NEW PARAMETER + keyword=keyword, + resolved_targets=resolved_targets, + debug=debug + ) +``` + +**Modified Method:** `_generate_single_article()` +```python +def _generate_single_article( + self, + project_id: int, + tier_name: str, + tier_config: TierConfig, + article_num: int, + article_index: int, + title: str, # NEW PARAMETER + keyword: str, + resolved_targets: Dict[str, int], + debug: bool +): + """Generate a single article with pre-generated title""" + prefix = f" [{article_num}/{tier_config.count}]" + + # ... site assignment logic ... + + # REMOVED: Title generation block + # click.echo(f"{prefix} Generating title...") + # title = self.generator.generate_title(...) + + # NEW: Just use the provided title + click.echo(f"{prefix} Using title: \"{title}\"") + + # EXISTING: Generate outline and content + click.echo(f"{prefix} Generating outline...") + outline = self.generator.generate_outline(...) + # ... rest of method unchanged ... +``` + +### Console Output Example + +``` +[tier1] Generating 5 titles in batches... +[tier1] Generated 5 titles +[tier1] Titles saved to: debug_output/project_1_tier1_titles_20251024_143052.txt + +[tier1] Title List: + 1. Complete Guide to Shaft Machining: Techniques and Best Practices + 2. Advanced CNC Shaft Machining: From Setup to Finish + 3. Troubleshooting Common Shaft Machining Challenges + 4. 
Precision Shaft Manufacturing: Tools and Equipment Guide + 5. How to Optimize Shaft Machining Operations for Higher Output + +Processing tier1: 5 articles... + [1/5] Assigned to site: getcnc.info (ID: 1) + [1/5] Using title: "Complete Guide to Shaft Machining: Techniques and Best Practices" + [1/5] Generating outline... + [1/5] Generated outline: 4 H2s, 8 H3s + [1/5] Generating content... + ... +``` + +### Batch Size Logic + +**Determining Batch Size:** +- If tier count <= 25: Use tier count (single batch) +- If tier count > 25: Use batches of 25, with a smaller final batch for any remainder (e.g., 60 articles: 25 + 25 + 10) + +**Examples:** +- 5 articles: 1 batch of 5 +- 20 articles: 1 batch of 20 +- 25 articles: 1 batch of 25 +- 50 articles: 2 batches of 25 each +- 100 articles: 4 batches of 25 each + +### Error Scenarios + +**Scenario 1: AI Call Fails** +- Retry entire batch (up to 3 attempts) +- After 3 failures: Fail tier processing +- Log error message to console + +**Scenario 2: AI Returns Fewer Titles Than Requested** +``` +Warning: Requested 25 titles but received 20. Continuing with partial batch. 
+``` +- Continue with titles received +- Process remaining count in next batch + +**Scenario 3: AI Returns More Titles Than Requested** +- Use first N titles (where N = requested count) +- Discard extras + +**Scenario 4: Malformed Response** +- Retry batch (counts toward 3 attempts) +- Log parsing error + +### File Management + +**Title File Format:** +``` +Complete Guide to Shaft Machining: Techniques and Best Practices +Advanced CNC Shaft Machining: From Setup to Finish +Troubleshooting Common Shaft Machining Challenges +Precision Shaft Manufacturing: Tools and Equipment Guide +How to Optimize Shaft Machining Operations for Higher Output +``` + +**File Location:** +- Directory: `debug_output/` +- Naming: `project_{project_id}_tier_{tier_name}_titles_{timestamp}.txt` +- Encoding: UTF-8 +- Format: One title per line, no extra formatting + +**File Lifecycle:** +- Created at start of tier processing +- Read once after creation +- Preserved for debugging/review +- Not deleted after processing + +## Testing Strategy + +### Unit Tests +**File:** `tests/unit/test_generation_service.py` + +New tests: +- `test_generate_titles_batch_single_batch()` - 5 titles +- `test_generate_titles_batch_multiple_batches()` - 50 titles +- `test_generate_titles_batch_exact_25()` - 25 titles +- `test_generate_titles_batch_retry_on_failure()` - Failure handling +- `test_generate_titles_batch_partial_return()` - Fewer titles returned +- `test_generate_titles_batch_validation()` - Individual title validation + +### Integration Tests +**File:** `tests/integration/test_batch_title_generation.py` + +New tests: +- `test_tier_processing_with_batch_titles()` - Full tier with pre-generated titles +- `test_title_file_creation_and_loading()` - File I/O +- `test_console_output_formatting()` - Output validation +- `test_multiple_batches_aggregation()` - 100 articles across 4 batches + +### Manual Testing +```bash +# Small batch (5 articles) +python main.py generate-batch -j jobs/test_shaft_machining.json 
-u admin -p password + +# Medium batch (20 articles) +python main.py generate-batch -j jobs/tier2_20articles.json -u admin -p password + +# Large batch (100 articles) +python main.py generate-batch -j jobs/tier3_100articles.json -u admin -p password +``` + +**Validation Checklist:** +- [ ] Titles file created in debug_output/ +- [ ] All titles printed to console +- [ ] No duplicate/similar titles in batch +- [ ] Article generation uses pre-generated titles +- [ ] "Generating title..." message removed from article loop +- [ ] "Using title: ..." message present instead + +## Design Decisions + +### Why Batches of 25? +- Balances context window usage vs API efficiency +- Allows AI to see enough titles to ensure diversity +- Reasonable token count (~2500 output tokens) +- Easy to retry on failure + +### Why Write to File? +- Provides debugging artifact +- Separates title generation from article pipeline +- Enables manual review if needed +- Fault tolerance: titles preserved if article generation crashes + +### Why Not Store in Database First? +- Simpler implementation +- No partial GeneratedContent records +- Clear separation of concerns +- File serves as intermediate format + +### Why Print to Console? +- Immediate visibility for user +- Quick sanity check on title quality +- Helps identify if batch generation is working +- Minimal cost (just console output) + +### Why Allow Partial Batches? +- More resilient to AI inconsistencies +- Better than failing entire tier +- Warning provides visibility +- Can continue processing with available titles + +## Known Limitations + +1. **No Similarity Scoring**: Does not quantitatively measure title diversity +2. **No Manual Review Step**: Fully automated, no approval gate +3. **Sequential Batches**: Batches generated sequentially, not in parallel +4. **Fixed Batch Size**: 25 is hardcoded (not configurable per job) +5. 
**No Title Regeneration**: Can't regenerate individual bad titles + +## Migration Notes + +**No Breaking Changes:** +- CLI interface unchanged +- Job file schema unchanged +- Database schema unchanged +- Existing validation unchanged + +**Transparent to Users:** +- Only console output differs +- New debug files appear +- Articles generated same way + +## Files Created/Modified + +### New Files: +- `src/generation/prompts/batch_title_generation.json` - Batch title prompt +- `tests/unit/test_batch_title_generation.py` - Unit tests +- `tests/integration/test_batch_title_generation.py` - Integration tests +- `docs/stories/story-2.6-batch-title-generation.md` - This document + +### Modified Files: +- `src/generation/service.py` - Add generate_titles_batch() method +- `src/generation/batch_processor.py` - Refactor _process_tier() and _generate_single_article() +- `src/generation/ai_client.py` - May need token limit adjustments (if hardcoded) + +## Performance Impact + +**Before (Sequential):** +- Title per article: ~3-5 seconds +- 25 articles: ~75-125 seconds for titles alone + +**After (Batch):** +- 25 titles in 1 batch: ~8-12 seconds +- 25 articles: ~8-12 seconds for all titles + +**Improvement:** +- ~85% faster title generation +- Better API efficiency (fewer calls) +- Improved title diversity (subjective) + +## Next Steps + +After Story 2.6 completion: +- Monitor title quality and diversity in production +- Consider adding similarity scoring if issues persist +- Potential future: Manual review step for Tier 1 titles +- Potential future: Configurable batch size in job files + +## Completion Checklist + +- [ ] Create batch_title_generation.json prompt +- [ ] Add generate_titles_batch() to ContentGenerator +- [ ] Add _generate_all_titles_for_tier() to BatchProcessor +- [ ] Refactor _process_tier() for batch titles +- [ ] Modify _generate_single_article() signature +- [ ] Implement title file I/O +- [ ] Add console output formatting +- [ ] Implement retry logic (3 
attempts) +- [ ] Implement partial batch handling +- [ ] Write unit tests +- [ ] Write integration tests +- [ ] Manual testing with 5, 20, 100 article batches +- [ ] Update documentation +- [ ] Code review + +## Success Metrics + +**Primary:** +- All titles generated before article content generation +- Titles stored in debug_output files +- Article generation uses pre-generated titles + +**Secondary:** +- Subjectively less repetitive titles (manual review) +- Faster title generation (85% improvement) +- No regression in title quality validation + +## Notes + +- This change addresses user feedback about title similarity +- Batch generation allows AI to "see" all titles and ensure diversity +- File-based approach provides debugging capability +- No changes to downstream systems (outline, content, interlinking) +- Maintains existing validation and error handling patterns + diff --git a/src/generation/batch_processor.py b/src/generation/batch_processor.py index 16c8c13..8e5d845 100644 --- a/src/generation/batch_processor.py +++ b/src/generation/batch_processor.py @@ -5,6 +5,8 @@ Batch processor for content generation jobs from typing import Dict, Any, Optional import click import os +from pathlib import Path +from datetime import datetime from src.generation.service import ContentGenerator from src.generation.job_config import JobConfig, Job, TierConfig from src.generation.deployment_assignment import validate_and_resolve_targets, assign_site_for_article @@ -73,6 +75,54 @@ class BatchProcessor: self._print_summary() + def _generate_all_titles_for_tier( + self, + project_id: int, + tier_name: str, + tier_config: TierConfig, + debug: bool, + model: Optional[str] = None + ) -> str: + """ + Generate all titles for a tier and save to file + + Args: + project_id: Project ID + tier_name: Name of tier (e.g., "tier1") + tier_config: Tier configuration + debug: Debug mode flag + model: Optional model override for title generation + + Returns: + Path to generated titles file + 
""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + titles = self.generator.generate_titles_batch( + project_id=project_id, + count=tier_config.count, + batch_size=25, + debug=debug, + model=model + ) + + debug_dir = Path("debug_output") + debug_dir.mkdir(exist_ok=True) + + filename = f"project_{project_id}_tier_{tier_name}_titles_{timestamp}.txt" + filepath = debug_dir / filename + + with open(filepath, 'w', encoding='utf-8') as f: + for title in titles: + f.write(title + '\n') + + click.echo(f"\n[{tier_name}] Title List:") + for i, title in enumerate(titles, 1): + click.echo(f" {i}. {title}") + click.echo() + + return str(filepath) + def _process_single_job( self, job: Job, @@ -149,18 +199,41 @@ class BatchProcessor: debug: bool, continue_on_error: bool ): - """Process all articles for a tier""" + """Process all articles for a tier with pre-generated titles""" click.echo(f" {tier_name}: Generating {tier_config.count} articles") project = self.project_repo.get_by_id(project_id) keyword = project.main_keyword + models = job.models if job.models else None + + click.echo(f"\n[{tier_name}] Generating {tier_config.count} titles in batches...") + titles_file = self._generate_all_titles_for_tier( + project_id, + tier_name, + tier_config, + debug, + model=models.title if models else None + ) + + with open(titles_file, 'r', encoding='utf-8') as f: + titles = [line.strip() for line in f if line.strip()] + + click.echo(f"[{tier_name}] Generated {len(titles)} titles") + click.echo(f"[{tier_name}] Titles saved to: {titles_file}") + targets_for_tier = resolved_targets if tier_name == "tier1" else {} for article_num in range(1, tier_config.count + 1): self.stats["total_articles"] += 1 article_index = article_num - 1 + if article_index >= len(titles): + click.echo(f" Warning: Not enough titles generated, skipping article {article_num}") + continue + + title = titles[article_index] + try: self._generate_single_article( project_id, @@ -168,6 +241,7 @@ class 
BatchProcessor: tier_config, article_num, article_index, + title, keyword, targets_for_tier, debug @@ -213,11 +287,12 @@ class BatchProcessor: tier_config: TierConfig, article_num: int, article_index: int, + title: str, keyword: str, resolved_targets: Dict[str, int], debug: bool ): - """Generate a single article""" + """Generate a single article with pre-generated title""" prefix = f" [{article_num}/{tier_config.count}]" models = self.current_job.models if hasattr(self, 'current_job') and self.current_job.models else None @@ -230,13 +305,7 @@ class BatchProcessor: elif resolved_targets: click.echo(f"{prefix} No site assignment (index {article_index} >= {len(resolved_targets)} targets)") - click.echo(f"{prefix} Generating title...") - title = self.generator.generate_title( - project_id, - debug=debug, - model=models.title if models else None - ) - click.echo(f"{prefix} Generated title: \"{title}\"") + click.echo(f"{prefix} Using title: \"{title}\"") click.echo(f"{prefix} Generating outline...") outline = self.generator.generate_outline( diff --git a/src/generation/job_config.py b/src/generation/job_config.py index 91ac15f..775eb35 100644 --- a/src/generation/job_config.py +++ b/src/generation/job_config.py @@ -77,6 +77,7 @@ class TierConfig: max_h2_tags: int min_h3_tags: int max_h3_tags: int + anchor_text_config: Optional[AnchorTextConfig] = None @dataclass @@ -305,6 +306,22 @@ class JobConfig: """Parse tier configuration with defaults (object format)""" defaults = TIER_DEFAULTS.get(tier_name, TIER_DEFAULTS["tier3"]) + # Parse tier-level anchor_text_config if present + anchor_text_config = None + if "anchor_text_config" in tier_data: + anchor_text_data = tier_data["anchor_text_config"] + if not isinstance(anchor_text_data, dict): + raise ValueError(f"'{tier_name}.anchor_text_config' must be an object") + if "mode" not in anchor_text_data: + raise ValueError(f"'{tier_name}.anchor_text_config' must have 'mode' field") + mode = anchor_text_data["mode"] + if mode not in 
["default", "override", "append"]: + raise ValueError(f"'{tier_name}.anchor_text_config' mode must be 'default', 'override', or 'append'") + custom_text = anchor_text_data.get("custom_text") + if custom_text is not None and not isinstance(custom_text, list): + raise ValueError(f"'{tier_name}.anchor_text_config' custom_text must be an array") + anchor_text_config = AnchorTextConfig(mode=mode, custom_text=custom_text) + return TierConfig( count=tier_data.get("count", 1), min_word_count=tier_data.get("min_word_count", defaults["min_word_count"]), @@ -312,7 +329,8 @@ class JobConfig: min_h2_tags=tier_data.get("min_h2_tags", defaults["min_h2_tags"]), max_h2_tags=tier_data.get("max_h2_tags", defaults["max_h2_tags"]), min_h3_tags=tier_data.get("min_h3_tags", defaults["min_h3_tags"]), - max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]) + max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]), + anchor_text_config=anchor_text_config ) def _parse_tier_from_array(self, tier_name: str, tier_data: dict) -> TierConfig: @@ -322,6 +340,22 @@ class JobConfig: # Array format uses "article_count" instead of "count" count = tier_data.get("article_count", tier_data.get("count", 1)) + # Parse tier-level anchor_text_config if present + anchor_text_config = None + if "anchor_text_config" in tier_data: + anchor_text_data = tier_data["anchor_text_config"] + if not isinstance(anchor_text_data, dict): + raise ValueError(f"'{tier_name}.anchor_text_config' must be an object") + if "mode" not in anchor_text_data: + raise ValueError(f"'{tier_name}.anchor_text_config' must have 'mode' field") + mode = anchor_text_data["mode"] + if mode not in ["default", "override", "append"]: + raise ValueError(f"'{tier_name}.anchor_text_config' mode must be 'default', 'override', or 'append'") + custom_text = anchor_text_data.get("custom_text") + if custom_text is not None and not isinstance(custom_text, list): + raise ValueError(f"'{tier_name}.anchor_text_config' custom_text must be 
an array") + anchor_text_config = AnchorTextConfig(mode=mode, custom_text=custom_text) + return TierConfig( count=count, min_word_count=tier_data.get("min_word_count", defaults["min_word_count"]), @@ -329,7 +363,8 @@ class JobConfig: min_h2_tags=tier_data.get("min_h2_tags", defaults["min_h2_tags"]), max_h2_tags=tier_data.get("max_h2_tags", defaults["max_h2_tags"]), min_h3_tags=tier_data.get("min_h3_tags", defaults["min_h3_tags"]), - max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]) + max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]), + anchor_text_config=anchor_text_config ) def get_jobs(self) -> list[Job]: diff --git a/src/generation/service.py b/src/generation/service.py index 4da24e1..e6958dc 100644 --- a/src/generation/service.py +++ b/src/generation/service.py @@ -7,7 +7,7 @@ import json from html import unescape from pathlib import Path from datetime import datetime -from typing import Optional, Tuple +from typing import Optional, Tuple, List from src.generation.ai_client import AIClient, PromptManager from src.database.repositories import ProjectRepository, GeneratedContentRepository, SiteDeploymentRepository from src.templating.service import TemplateService @@ -75,6 +75,98 @@ class ContentGenerator: return title + def generate_titles_batch( + self, + project_id: int, + count: int, + batch_size: int = 25, + debug: bool = False, + model: Optional[str] = None + ) -> List[str]: + """ + Generate multiple titles in batches + + Args: + project_id: Project ID to generate titles for + count: Total number of titles needed + batch_size: Number of titles per AI call (default: 25) + debug: If True, save responses to debug_output/ + model: Optional model override for this generation stage + + Returns: + List of generated title strings + """ + project = self.project_repo.get_by_id(project_id) + if not project: + raise ValueError(f"Project {project_id} not found") + + entities_str = ", ".join(project.entities or []) + related_str = ", 
".join(project.related_searches or []) + + all_titles = [] + titles_remaining = count + + while titles_remaining > 0: + current_batch_size = min(batch_size, titles_remaining) + + system_msg, user_prompt = self.prompt_manager.format_prompt( + "batch_title_generation", + keyword=project.main_keyword, + entities=entities_str, + related_searches=related_str, + count=current_batch_size + ) + + batch_titles = None + for attempt in range(3): + try: + response = self.ai_client.generate_completion( + prompt=user_prompt, + system_message=system_msg, + max_tokens=100 * current_batch_size, + temperature=0.7, + override_model=model + ) + + lines = response.strip().split('\n') + batch_titles = [] + + for line in lines: + line = line.strip() + if not line: + continue + line = re.sub(r'^\d+[\.\)]\s*', '', line) + line = line.strip('"').strip("'") + if line: + batch_titles.append(line) + + if len(batch_titles) < current_batch_size: + print(f"Warning: Requested {current_batch_size} titles but received {len(batch_titles)}. 
Continuing with partial batch.") + + if len(batch_titles) > current_batch_size: + batch_titles = batch_titles[:current_batch_size] + + break + + except Exception as e: + if attempt == 2: + raise ValueError(f"Failed to generate batch after 3 attempts: {e}") + print(f"Batch generation attempt {attempt + 1} failed: {e}, retrying...") + + if batch_titles: + all_titles.extend(batch_titles) + titles_remaining -= len(batch_titles) + else: + raise ValueError("Failed to generate any titles in batch") + + if debug: + for i, title in enumerate(all_titles, 1): + self._save_debug_output( + project_id, f"batch_title_{i}", title, "txt" + ) + + return all_titles + def generate_outline( self, project_id: int, diff --git a/src/interlinking/content_injection.py b/src/interlinking/content_injection.py index badd9a7..7cc6d4c 100644 --- a/src/interlinking/content_injection.py +++ b/src/interlinking/content_injection.py @@ -276,16 +276,25 @@ def _get_anchor_texts_for_tier( job_config, count: int = 5 ) -> List[str]: - """Get anchor texts for a tier, applying job config overrides""" + """Get anchor texts for a tier, applying tier-level or job-level config overrides""" # Get default tier-based anchor texts default_anchors = get_anchor_text_for_tier(tier, project, count) - # Apply job config overrides if present + # Check tier-level config first, then fall back to job-level anchor_text_config = None - if hasattr(job_config, 'anchor_text_config'): - anchor_text_config = job_config.anchor_text_config - elif isinstance(job_config, dict): - anchor_text_config = job_config.get('anchor_text_config') + + # Try tier-level config + if hasattr(job_config, 'tiers') and tier in job_config.tiers: + tier_config = job_config.tiers[tier] + if hasattr(tier_config, 'anchor_text_config'): + anchor_text_config = tier_config.anchor_text_config + + # Fall back to job-level config if no tier-level config + if not anchor_text_config: + if hasattr(job_config, 'anchor_text_config'): + anchor_text_config = 
job_config.anchor_text_config + elif isinstance(job_config, dict): + anchor_text_config = job_config.get('anchor_text_config') if not anchor_text_config: return default_anchors