diff --git a/docs/stories/story-2.2-task-breakdown.md b/docs/stories/story-2.2-task-breakdown.md index 1831f76..e52b946 100644 --- a/docs/stories/story-2.2-task-breakdown.md +++ b/docs/stories/story-2.2-task-breakdown.md @@ -157,7 +157,8 @@ TIER_DEFAULTS = { ```python AVAILABLE_MODELS = { "gpt-4o-mini": "openai/gpt-4o-mini", - "claude-sonnet-4.5": "anthropic/claude-3.5-sonnet" + "claude-sonnet-4.5": "anthropic/claude-3.5-sonnet", + MANY OTHERS _ CHECK OUT OPENROUTER API FOR MORE } ``` @@ -488,6 +489,9 @@ class BatchProcessor: Process all jobs in job file For each job: + 0. Validate project configuration (fail fast if invalid) + - Check project exists + - Validate money_site_url is set (required for tiered linking strategy) For each tier: For count times: 1. Generate title (log to console) @@ -530,6 +534,9 @@ Summary: **File**: `src/generation/batch_processor.py` **Error handling strategy**: +- Project validation errors: Fail fast before generation starts + - Missing project: Abort with clear error + - Missing money_site_url: Abort with clear error (required for all jobs) - AI API errors: Log error, mark as `status='failed'`, save to DB - If `continue_on_error=True`: continue to next article - If `continue_on_error=False`: stop batch processing diff --git a/docs/technical-debt.md b/docs/technical-debt.md index dc8e197..08f1fab 100644 --- a/docs/technical-debt.md +++ b/docs/technical-debt.md @@ -602,7 +602,12 @@ Generate `index.html` for each site with: **Dependencies**: Story 3.4 (boilerplate page infrastructure) --- +### www vs root in domain imports +#### Problem +Domains are stored as either www.domain.com or domain.com in the table, but if you search on the wrong one through any of the scripts (like main.py get-site or on a job.json import) it will fail. +#### Solution +Partial match on search? Search for both www and root in the logic? Just ideas, haven't fleshed it out. 
## Future Sections Add new technical debt items below as they're identified during development. diff --git a/jobs/example_multi_tier_batch.json b/jobs/example_multi_tier_batch.json index 84ae16a..6b9c969 100644 --- a/jobs/example_multi_tier_batch.json +++ b/jobs/example_multi_tier_batch.json @@ -5,11 +5,11 @@ "tiers": { "tier1": { "count": 5, - "min_word_count": 2200, - "max_word_count": 2600 + "min_word_count": 1500, + "max_word_count": 2000 }, "tier2": { - "count": 10 + "count": 20 }, "tier3": { "count": 15, diff --git a/scripts/post_process_batch.py b/scripts/post_process_batch.py new file mode 100644 index 0000000..f87f14b --- /dev/null +++ b/scripts/post_process_batch.py @@ -0,0 +1,210 @@ +""" +Post-process existing articles that were generated but not fully processed. + +This script applies post-processing steps to articles that are already in the database: +- Site assignment (if needed) +- URL generation +- Tiered link discovery +- Interlink injection +- Template application + +Usage: + uv run python scripts/post_process_batch.py --project-id 1 --tier tier1 tier2 + uv run python scripts/post_process_batch.py --project-id 1 --all-tiers +""" + +import click +import os +from src.database.session import db_manager +from src.database.repositories import ( + GeneratedContentRepository, + ProjectRepository, + SiteDeploymentRepository, + ArticleLinkRepository, + SitePageRepository +) +from src.generation.ai_client import AIClient, PromptManager +from src.generation.service import ContentGenerator +from src.generation.url_generator import generate_urls_for_batch +from src.generation.site_assignment import assign_sites_to_batch +from src.generation.job_config import Job, InterlinkingConfig +from src.interlinking.tiered_links import find_tiered_links +from src.interlinking.content_injection import inject_interlinks + + +@click.command() +@click.option('--project-id', '-p', required=True, type=int, help='Project ID to post-process') +@click.option('--tier', '-t', 
multiple=True, help='Tiers to process (e.g., tier1, tier2)') +@click.option('--all-tiers', is_flag=True, help='Process all tiers') +@click.option('--skip-site-assignment', is_flag=True, help='Skip site assignment step') +@click.option('--skip-urls', is_flag=True, help='Skip URL generation') +@click.option('--skip-interlinks', is_flag=True, help='Skip interlinking') +@click.option('--skip-templates', is_flag=True, help='Skip template application') +def post_process_batch( + project_id: int, + tier: tuple, + all_tiers: bool, + skip_site_assignment: bool, + skip_urls: bool, + skip_interlinks: bool, + skip_templates: bool +): + """Post-process existing articles in the database""" + + if not tier and not all_tiers: + click.echo("Error: Must specify either --tier or --all-tiers", err=True) + return + + session = db_manager.get_session() + + try: + project_repo = ProjectRepository(session) + content_repo = GeneratedContentRepository(session) + site_repo = SiteDeploymentRepository(session) + link_repo = ArticleLinkRepository(session) + + project = project_repo.get_by_id(project_id) + if not project: + click.echo(f"Error: Project {project_id} not found", err=True) + return + + if not project.money_site_url: + click.echo(f"Error: Project {project_id} has no money_site_url set", err=True) + click.echo("Please set money_site_url before post-processing", err=True) + return + + click.echo(f"\nPost-processing project: {project.name} (ID: {project_id})") + click.echo(f"Keyword: {project.main_keyword}") + click.echo(f"Money site: {project.money_site_url}\n") + + tiers_to_process = [] + if all_tiers: + all_articles = content_repo.get_by_project_id(project_id) + tiers_to_process = sorted(set(a.tier for a in all_articles)) + click.echo(f"Found tiers: {', '.join(tiers_to_process)}\n") + else: + tiers_to_process = list(tier) + + api_key = os.getenv("OPENROUTER_API_KEY") + if not api_key: + click.echo("Error: OPENROUTER_API_KEY not found in environment", err=True) + return + + ai_client 
= AIClient(api_key=api_key, model='gpt-4o-mini') + prompt_manager = PromptManager() + content_generator = ContentGenerator( + ai_client=ai_client, + prompt_manager=prompt_manager, + project_repo=project_repo, + content_repo=content_repo, + site_deployment_repo=site_repo + ) + + job = Job( + project_id=project_id, + tiers={}, + interlinking=InterlinkingConfig( + links_per_article_min=2, + links_per_article_max=4, + include_home_link=True + ) + ) + + for tier_name in tiers_to_process: + click.echo(f"Processing {tier_name}...") + + all_articles = content_repo.get_by_project_and_tier( + project_id, tier_name, require_site=False + ) + + if not all_articles: + click.echo(f" No articles found for {tier_name}") + continue + + click.echo(f" Found {len(all_articles)} articles") + + articles_without_sites = [a for a in all_articles if not a.site_deployment_id] + + if articles_without_sites and not skip_site_assignment: + click.echo(f" Assigning sites to {len(articles_without_sites)} articles...") + try: + assign_sites_to_batch( + content_records=all_articles, + job=job, + site_repo=site_repo, + bunny_client=None, + project_keyword=project.main_keyword + ) + session.expire_all() + all_articles = content_repo.get_by_project_and_tier( + project_id, tier_name, require_site=False + ) + click.echo(f" Assigned sites successfully") + except Exception as e: + click.echo(f" Warning: Site assignment failed: {e}") + + content_records = [a for a in all_articles if a.site_deployment_id] + + if not content_records: + click.echo(f" No articles with site assignments, skipping {tier_name}") + continue + + click.echo(f" Processing {len(content_records)} articles with sites...") + + if not skip_urls: + click.echo(f" Generating URLs...") + article_urls = generate_urls_for_batch(content_records, site_repo) + click.echo(f" Generated {len(article_urls)} URLs") + else: + article_urls = {a.id: a.url for a in content_records if a.url} + + if not skip_interlinks: + click.echo(f" Finding tiered 
links...") + tiered_links = find_tiered_links( + content_records, + job, + project_repo, + content_repo, + site_repo + ) + click.echo(f" Found tiered links for tier {tiered_links.get('tier', 'N/A')}") + + click.echo(f" Injecting interlinks...") + inject_interlinks( + content_records, + article_urls, + tiered_links, + project, + job, + content_repo, + link_repo + ) + click.echo(f" Interlinks injected successfully") + + if not skip_templates: + click.echo(f" Applying templates...") + template_count = 0 + for content in content_records: + try: + if content_generator.apply_template(content.id): + template_count += 1 + except Exception as e: + click.echo(f" Warning: Failed to apply template to content {content.id}: {e}") + + click.echo(f" Applied templates to {template_count}/{len(content_records)} articles") + + click.echo(f" {tier_name}: Complete\n") + + click.echo("\n" + "=" * 70) + click.echo("Post-processing complete!") + click.echo("=" * 70) + click.echo(f"\nYou can now deploy with:") + click.echo(f" uv run python main.py deploy-batch --batch-id {project_id}") + + finally: + session.close() + + +if __name__ == "__main__": + post_process_batch() + diff --git a/scripts/set_money_site_url.py b/scripts/set_money_site_url.py new file mode 100644 index 0000000..29387f5 --- /dev/null +++ b/scripts/set_money_site_url.py @@ -0,0 +1,51 @@ +""" +Set the money_site_url for a project + +Usage: + uv run python scripts/set_money_site_url.py --project-id 1 --url "https://example.com" + uv run python scripts/set_money_site_url.py --project-id 1 --url "https://www.mysite.com" +""" + +import click +from src.database.session import db_manager +from src.database.repositories import ProjectRepository + + +@click.command() +@click.option('--project-id', '-p', required=True, type=int, help='Project ID') +@click.option('--url', '-u', required=True, help='Money site URL (e.g., https://example.com)') +def set_money_site_url(project_id: int, url: str): + """Set the money_site_url for a 
project""" + + if not url.startswith('http://') and not url.startswith('https://'): + click.echo("Error: URL must start with http:// or https://", err=True) + return + + url = url.rstrip('/') + + session = db_manager.get_session() + + try: + project_repo = ProjectRepository(session) + + project = project_repo.get_by_id(project_id) + if not project: + click.echo(f"Error: Project {project_id} not found", err=True) + return + + old_url = project.money_site_url or "(not set)" + + project.money_site_url = url + project_repo.update(project) + + click.echo(f"Success: Updated project {project_id}: {project.name}") + click.echo(f" Old URL: {old_url}") + click.echo(f" New URL: {url}") + + finally: + session.close() + + +if __name__ == "__main__": + set_money_site_url() + diff --git a/src/cli/commands.py b/src/cli/commands.py index a67b793..f5c871e 100644 --- a/src/cli/commands.py +++ b/src/cli/commands.py @@ -500,7 +500,7 @@ def list_sites(admin_user: Optional[str], admin_password: Optional[str]): click.echo("-" * 100) for site in sites: - click.echo(f"{site.id:<5} {site.site_name:<25} {site.custom_hostname:<30} " + click.echo(f"{site.id:<5} {site.site_name:<25} {site.custom_hostname or 'N/A':<30} " f"{site.storage_zone_name:<20} {site.storage_zone_region:<8}") click.echo("-" * 100) diff --git a/src/generation/batch_processor.py b/src/generation/batch_processor.py index 761e4d6..21122b9 100644 --- a/src/generation/batch_processor.py +++ b/src/generation/batch_processor.py @@ -88,6 +88,13 @@ class BatchProcessor: if not project: raise ValueError(f"Project {job.project_id} not found") + if not project.money_site_url: + raise ValueError( + f"Cannot generate articles: money_site_url not set for project {job.project_id}. " + f"Please set money_site_url in the project configuration. " + f"The money site is required for the tiered linking strategy." 
+ ) + click.echo(f"\nProcessing Job {job_idx}/{self.stats['total_jobs']}: Project ID {job.project_id}") if job.models: diff --git a/src/generation/job_config.py b/src/generation/job_config.py index 19b3b52..8cfe722 100644 --- a/src/generation/job_config.py +++ b/src/generation/job_config.py @@ -17,16 +17,16 @@ TIER_DEFAULTS = { "max_h3_tags": 10 }, "tier2": { - "min_word_count": 1500, - "max_word_count": 2000, + "min_word_count": 1100, + "max_word_count": 1500, "min_h2_tags": 2, "max_h2_tags": 4, "min_h3_tags": 3, "max_h3_tags": 8 }, "tier3": { - "min_word_count": 1000, - "max_word_count": 1500, + "min_word_count": 850, + "max_word_count": 1350, "min_h2_tags": 2, "max_h2_tags": 3, "min_h3_tags": 2,