Big-Link-Man/scripts/post_process_batch.py

217 lines
8.1 KiB
Python

"""
Post-process existing articles that were generated but not fully processed.
This script applies post-processing steps to articles that are already in the database:
- Site assignment (if needed)
- URL generation
- Tiered link discovery
- Interlink injection
- Template application
Usage:
uv run python scripts/post_process_batch.py --project-id 1 --tier tier1 --tier tier2
uv run python scripts/post_process_batch.py --project-id 1 --all-tiers
"""
import sys
from pathlib import Path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
import click
import os
from src.database.session import db_manager
from src.database.repositories import (
GeneratedContentRepository,
ProjectRepository,
SiteDeploymentRepository,
ArticleLinkRepository,
SitePageRepository
)
from src.generation.ai_client import AIClient, PromptManager
from src.generation.service import ContentGenerator
from src.generation.url_generator import generate_urls_for_batch
from src.generation.site_assignment import assign_sites_to_batch
from src.generation.job_config import Job, InterlinkingConfig
from src.interlinking.tiered_links import find_tiered_links
from src.interlinking.content_injection import inject_interlinks
@click.command()
@click.option('--project-id', '-p', required=True, type=int, help='Project ID to post-process')
@click.option('--tier', '-t', multiple=True, help='Tiers to process (e.g., tier1, tier2)')
@click.option('--all-tiers', is_flag=True, help='Process all tiers')
@click.option('--skip-site-assignment', is_flag=True, help='Skip site assignment step')
@click.option('--skip-urls', is_flag=True, help='Skip URL generation')
@click.option('--skip-interlinks', is_flag=True, help='Skip interlinking')
@click.option('--skip-templates', is_flag=True, help='Skip template application')
def post_process_batch(
    project_id: int,
    tier: tuple,
    all_tiers: bool,
    skip_site_assignment: bool,
    skip_urls: bool,
    skip_interlinks: bool,
    skip_templates: bool
) -> None:
    """Post-process existing articles in the database

    \f
    Everything after the form feed above is hidden from ``--help`` by click,
    so the command's help text stays the single summary line.

    For each selected tier the command, in order: assigns sites to articles
    that have none, generates URLs, finds tiered links and injects
    interlinks, then applies templates. Each step can be disabled with its
    ``--skip-*`` flag.

    Args:
        project_id: ID of the project whose articles are processed.
        tier: Tier names given via repeated ``--tier`` options.
        all_tiers: Process every tier found among the project's articles;
            takes precedence over ``tier`` when both are given.
        skip_site_assignment: Do not assign sites to unassigned articles.
        skip_urls: Reuse URLs already stored on the articles.
        skip_interlinks: Skip tiered-link discovery and interlink injection.
        skip_templates: Skip template application.

    Raises:
        click.UsageError: Neither ``--tier`` nor ``--all-tiers`` was given.
        click.ClickException: The project does not exist, has no
            ``money_site_url``, or ``OPENROUTER_API_KEY`` is not set.
            Click reports these on stderr and exits with a non-zero status.
    """
    if not tier and not all_tiers:
        raise click.UsageError("Must specify either --tier or --all-tiers")

    session = db_manager.get_session()
    try:
        project_repo = ProjectRepository(session)
        content_repo = GeneratedContentRepository(session)
        site_repo = SiteDeploymentRepository(session)
        link_repo = ArticleLinkRepository(session)

        project = project_repo.get_by_id(project_id)
        if not project:
            raise click.ClickException(f"Project {project_id} not found")
        if not project.money_site_url:
            raise click.ClickException(
                f"Project {project_id} has no money_site_url set\n"
                "Please set money_site_url before post-processing"
            )

        click.echo(f"\nPost-processing project: {project.name} (ID: {project_id})")
        click.echo(f"Keyword: {project.main_keyword}")
        click.echo(f"Money site: {project.money_site_url}\n")

        if all_tiers:
            all_articles = content_repo.get_by_project_id(project_id)
            tiers_to_process = sorted({a.tier for a in all_articles})
            if not tiers_to_process:
                # Nothing was ever generated; do not claim success or
                # suggest deploying an empty batch.
                click.echo(f"No articles found for project {project_id}; nothing to post-process")
                return
            click.echo(f"Found tiers: {', '.join(tiers_to_process)}\n")
        else:
            # Drop repeated --tier values (keeping first-seen order) so a
            # tier is not post-processed, and interlinked, twice.
            tiers_to_process = list(dict.fromkeys(tier))

        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            raise click.ClickException("OPENROUTER_API_KEY not found in environment")

        ai_client = AIClient(api_key=api_key, model='gpt-4o-mini')
        prompt_manager = PromptManager()
        content_generator = ContentGenerator(
            ai_client=ai_client,
            prompt_manager=prompt_manager,
            project_repo=project_repo,
            content_repo=content_repo,
            site_deployment_repo=site_repo
        )

        # Minimal job definition: only the interlinking settings are filled
        # in; no per-tier generation config is supplied.
        job = Job(
            project_id=project_id,
            tiers={},
            interlinking=InterlinkingConfig(
                links_per_article_min=2,
                links_per_article_max=4,
                include_home_link=True
            )
        )

        for tier_name in tiers_to_process:
            click.echo(f"Processing {tier_name}...")
            all_articles = content_repo.get_by_project_and_tier(
                project_id, tier_name, require_site=False
            )
            if not all_articles:
                click.echo(f" No articles found for {tier_name}")
                continue
            click.echo(f" Found {len(all_articles)} articles")

            # Step 1: site assignment (best effort; a failure only warns).
            articles_without_sites = [a for a in all_articles if not a.site_deployment_id]
            if articles_without_sites and not skip_site_assignment:
                click.echo(f" Assigning sites to {len(articles_without_sites)} articles...")
                try:
                    assign_sites_to_batch(
                        content_records=all_articles,
                        job=job,
                        site_repo=site_repo,
                        bunny_client=None,
                        project_keyword=project.main_keyword
                    )
                    # Expire cached state and re-query so the records
                    # reflect the assignments just made.
                    session.expire_all()
                    all_articles = content_repo.get_by_project_and_tier(
                        project_id, tier_name, require_site=False
                    )
                    click.echo(" Assigned sites successfully")
                except Exception as e:
                    click.echo(f" Warning: Site assignment failed: {e}")

            # Later steps only operate on articles that have a site.
            content_records = [a for a in all_articles if a.site_deployment_id]
            if not content_records:
                click.echo(f" No articles with site assignments, skipping {tier_name}")
                continue
            click.echo(f" Processing {len(content_records)} articles with sites...")

            # Step 2: URLs.
            if not skip_urls:
                click.echo(" Generating URLs...")
                article_urls = generate_urls_for_batch(content_records, site_repo)
                click.echo(f" Generated {len(article_urls)} URLs")
            else:
                # NOTE(review): the return shape of generate_urls_for_batch
                # is not visible here - confirm inject_interlinks accepts
                # this id -> url mapping as well.
                article_urls = {a.id: a.url for a in content_records if a.url}

            # Step 3: tiered links and interlink injection.
            if not skip_interlinks:
                click.echo(" Finding tiered links...")
                tiered_links = find_tiered_links(
                    content_records,
                    job,
                    project_repo,
                    content_repo,
                    site_repo
                )
                click.echo(f" Found tiered links for tier {tiered_links.get('tier', 'N/A')}")
                click.echo(" Injecting interlinks...")
                inject_interlinks(
                    content_records,
                    article_urls,
                    tiered_links,
                    project,
                    job,
                    content_repo,
                    link_repo
                )
                click.echo(" Interlinks injected successfully")

            # Step 4: templates (best effort per article; failures warn).
            if not skip_templates:
                click.echo(" Applying templates...")
                template_count = 0
                for content in content_records:
                    try:
                        if content_generator.apply_template(content.id):
                            template_count += 1
                    except Exception as e:
                        click.echo(f" Warning: Failed to apply template to content {content.id}: {e}")
                click.echo(f" Applied templates to {template_count}/{len(content_records)} articles")

            click.echo(f" {tier_name}: Complete\n")

        click.echo("\n" + "=" * 70)
        click.echo("Post-processing complete!")
        click.echo("=" * 70)
        click.echo("\nYou can now deploy with:")
        click.echo(f" uv run python main.py deploy-batch --batch-id {project_id}")
    finally:
        # Always release the session, including when an error is raised.
        session.close()
# Invoke the click command only when this file is executed as a script.
if __name__ == "__main__":
    post_process_batch()