Big-Link-Man/src/generation/job_config.py

383 lines
17 KiB
Python

"""
Job configuration parser for batch content generation
"""
import json
from dataclasses import dataclass
from typing import Optional, Dict, Any, List
from pathlib import Path
# Fallback limits per tier name.  A tier entry in a job file only needs to
# list the values it wants to override; anything missing is read from here.
# Tier names that are not listed fall back to the "tier3" values (see the
# TIER_DEFAULTS.get(...) lookups in JobConfig).
TIER_DEFAULTS = {
    "tier1": {
        "min_word_count": 2000,
        "max_word_count": 2500,
        "min_h2_tags": 3,
        "max_h2_tags": 5,
        "min_h3_tags": 5,
        "max_h3_tags": 10,
    },
    "tier2": {
        "min_word_count": 1100,
        "max_word_count": 1500,
        "min_h2_tags": 2,
        "max_h2_tags": 4,
        "min_h3_tags": 3,
        "max_h3_tags": 8,
    },
    "tier3": {
        "min_word_count": 850,
        "max_word_count": 1350,
        "min_h2_tags": 2,
        "max_h2_tags": 3,
        "min_h3_tags": 2,
        "max_h3_tags": 6,
    },
}
@dataclass
class ModelConfig:
    """Per-stage AI model selection (one string for each generation stage)."""

    # Model setting for the title stage.
    title: str
    # Model setting for the outline stage.
    outline: str
    # Model setting for the content stage.
    content: str
@dataclass
class AnchorTextConfig:
    """Anchor text settings used for interlinking."""

    # One of "default", "override" or "append".
    mode: str
    # Optional list of custom anchor texts; None when not supplied.
    custom_text: Optional[List[str]] = None
@dataclass
class FailureConfig:
    """Settings that control how generation failures are handled."""

    # Consecutive-failure threshold (defaults to 5).
    max_consecutive_failures: int = 5
    # Skip flag applied on failure (defaults to True).
    skip_on_failure: bool = True
@dataclass
class InterlinkingConfig:
    """Settings for linking generated articles to one another."""

    # Lower / upper bound on links per article.
    links_per_article_min: int = 2
    links_per_article_max: int = 4
    # Whether a home link is included.
    include_home_link: bool = True
    # Lower / upper bound for "see also" entries
    # (presumably a "See also" section -- confirm with the consumer).
    see_also_min: int = 4
    see_also_max: int = 5
@dataclass
class TierConfig:
    """Article count and content limits for a single tier."""

    # Number of articles for this tier.
    count: int
    # Word-count bounds.
    min_word_count: int
    max_word_count: int
    # Bounds on the number of H2 headings.
    min_h2_tags: int
    max_h2_tags: int
    # Bounds on the number of H3 headings.
    min_h3_tags: int
    max_h3_tags: int
    # Tier-level anchor text settings; None when the tier does not set any.
    anchor_text_config: Optional[AnchorTextConfig] = None
@dataclass
class Job:
    """Job definition for content generation.

    Only ``project_id`` and ``tiers`` are required; every other field is an
    optional setting that stays at its default when the job file omits it.
    """

    # Identifier of the project the job belongs to.
    project_id: int
    # Tier configurations keyed by tier name (e.g. "tier1").
    tiers: Dict[str, TierConfig]
    # Per-stage model selection.
    models: Optional[ModelConfig] = None
    # List of deployment target names.
    deployment_targets: Optional[List[str]] = None
    # List of preferred sites for tier 1.
    tier1_preferred_sites: Optional[List[str]] = None
    # Flag enabling automatic site creation.
    auto_create_sites: bool = False
    # List of objects, each carrying at least "keyword" and "count".
    # Fixed: the value type was annotated with the builtin function ``any``
    # instead of ``typing.Any``.
    create_sites_for_keywords: Optional[List[Dict[str, Any]]] = None
    # Object with integer "min" and "max" entries.
    tiered_link_count_range: Optional[Dict[str, int]] = None
    # Job-level anchor text settings.
    anchor_text_config: Optional[AnchorTextConfig] = None
    # Failure handling settings.
    failure_config: Optional[FailureConfig] = None
    # Interlinking settings.
    interlinking: Optional[InterlinkingConfig] = None
    # Worker count; None when the job file does not set one.
    max_workers: Optional[int] = None
class JobConfig:
    """Parser for job configuration files.

    Loads a JSON job file and turns each job entry into a ``Job``.  Two
    top-level layouts are accepted:

    * ``{"jobs": [{"project_id": 1, "tiers": {...}}, ...]}``
    * a single job object, ``{"project_id": 1, "tiers": ...}``

    ``tiers`` may be an object keyed by tier name (``{"tier1": {...}}``) or an
    array of objects carrying a ``"tier"`` number (``[{"tier": 1, ...}]``).
    Every structural problem is reported as ``ValueError``.
    """

    # Accepted values for the "mode" field of an anchor_text_config object.
    _ANCHOR_TEXT_MODES = ("default", "override", "append")

    def __init__(self, job_file_path: str):
        """
        Load and parse job file, apply defaults

        Args:
            job_file_path: Path to JSON job file

        Raises:
            FileNotFoundError: If the job file does not exist.
            ValueError: If the file is not valid JSON or is structurally invalid.
        """
        self.job_file_path = Path(job_file_path)
        self.jobs: "list[Job]" = []
        self._load()

    def _load(self):
        """Load the job file and append every parsed job to ``self.jobs``."""
        if not self.job_file_path.exists():
            raise FileNotFoundError(f"Job file not found: {self.job_file_path}")
        with open(self.job_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Only a JSON object can carry "jobs" or "project_id".  Rejecting other
        # top-level values here avoids a TypeError from the lookups below.
        if not isinstance(data, dict):
            raise ValueError("Job file must contain either 'jobs' array or 'project_id' field")

        if "jobs" in data:
            # Array format: {"jobs": [{"project_id": 1, "tiers": {...}}]}
            if not isinstance(data["jobs"], list):
                raise ValueError("'jobs' must be an array")
            for job_data in data["jobs"]:
                self._validate_job(job_data)
                self.jobs.append(self._parse_job(job_data))
        elif "project_id" in data:
            # Single job format: {"project_id": 1, "tiers": [...], "models": {...}}
            self._validate_job(data)
            self.jobs.append(self._parse_job(data))
        else:
            raise ValueError("Job file must contain either 'jobs' array or 'project_id' field")

    def _validate_job(self, job_data: dict):
        """Check that a job entry is an object with 'project_id' and 'tiers'.

        Raises:
            ValueError: If the entry is not an object, a required key is
                missing, or 'tiers' is neither an object nor an array.
        """
        if not isinstance(job_data, dict):
            raise ValueError("Each job must be an object")
        if "project_id" not in job_data:
            raise ValueError("Job missing 'project_id'")
        if "tiers" not in job_data:
            raise ValueError("Job missing 'tiers'")
        # Handle both object format {"tier1": {...}} and array format [{"tier": 1, ...}]
        if not isinstance(job_data["tiers"], (dict, list)):
            raise ValueError("'tiers' must be a dictionary or array")

    def _parse_job(self, job_data: dict) -> "Job":
        """Parse a single, already validated job entry into a ``Job``.

        Sections are checked in a fixed order (tiers first, 'max_workers'
        last), so the first invalid section determines the error raised.

        Raises:
            ValueError: If any section of the job has the wrong type or value.
        """
        project_id = job_data["project_id"]
        tiers = self._parse_tiers(job_data["tiers"])

        deployment_targets = self._optional_string_list(job_data, "deployment_targets")
        tier1_preferred_sites = self._optional_string_list(job_data, "tier1_preferred_sites")

        auto_create_sites = job_data.get("auto_create_sites", False)
        if not isinstance(auto_create_sites, bool):
            raise ValueError("'auto_create_sites' must be a boolean")

        create_sites_for_keywords = job_data.get("create_sites_for_keywords")
        if create_sites_for_keywords is not None:
            self._validate_create_sites_for_keywords(create_sites_for_keywords)

        tiered_link_count_range = job_data.get("tiered_link_count_range")
        if tiered_link_count_range is not None:
            self._validate_tiered_link_count_range(tiered_link_count_range)

        models = self._parse_models(job_data.get("models"))

        # At job level a null 'anchor_text_config' is treated as absent.
        anchor_text_config = None
        anchor_text_data = job_data.get("anchor_text_config")
        if anchor_text_data is not None:
            anchor_text_config = self._parse_anchor_text_config(
                anchor_text_data, "anchor_text_config"
            )

        failure_config = self._parse_failure_config(job_data.get("failure_config"))
        interlinking = self._parse_interlinking(job_data.get("interlinking"))

        max_workers = job_data.get("max_workers")
        if max_workers is not None:
            if not self._is_int(max_workers) or max_workers < 1:
                raise ValueError("'max_workers' must be a positive integer")

        return Job(
            project_id=project_id,
            tiers=tiers,
            models=models,
            deployment_targets=deployment_targets,
            tier1_preferred_sites=tier1_preferred_sites,
            auto_create_sites=auto_create_sites,
            create_sites_for_keywords=create_sites_for_keywords,
            tiered_link_count_range=tiered_link_count_range,
            anchor_text_config=anchor_text_config,
            failure_config=failure_config,
            interlinking=interlinking,
            max_workers=max_workers
        )

    @staticmethod
    def _is_int(value) -> bool:
        """Return True for real integers only.

        ``bool`` is a subclass of ``int``, so a plain ``isinstance(value, int)``
        would accept JSON ``true``/``false`` where a number is required.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    @staticmethod
    def _optional_string_list(job_data: dict, key: str):
        """Return ``job_data[key]`` after checking it is a list of strings (or absent/null)."""
        values = job_data.get(key)
        if values is not None:
            if not isinstance(values, list):
                raise ValueError(f"'{key}' must be an array")
            if not all(isinstance(item, str) for item in values):
                raise ValueError(f"'{key}' must be an array of strings")
        return values

    @staticmethod
    def _validate_create_sites_for_keywords(create_sites_for_keywords):
        """Check that 'create_sites_for_keywords' is a list of objects with 'keyword' and 'count'."""
        if not isinstance(create_sites_for_keywords, list):
            raise ValueError("'create_sites_for_keywords' must be an array")
        for kw_config in create_sites_for_keywords:
            if not isinstance(kw_config, dict):
                raise ValueError("Each item in 'create_sites_for_keywords' must be an object")
            if "keyword" not in kw_config or "count" not in kw_config:
                raise ValueError("Each item in 'create_sites_for_keywords' must have 'keyword' and 'count'")

    def _validate_tiered_link_count_range(self, tiered_link_count_range):
        """Check that 'tiered_link_count_range' is ``{"min": int >= 1, "max": int >= min}``."""
        if not isinstance(tiered_link_count_range, dict):
            raise ValueError("'tiered_link_count_range' must be an object")
        if "min" not in tiered_link_count_range or "max" not in tiered_link_count_range:
            raise ValueError("'tiered_link_count_range' must have 'min' and 'max' fields")
        min_val = tiered_link_count_range["min"]
        max_val = tiered_link_count_range["max"]
        if not self._is_int(min_val) or not self._is_int(max_val):
            raise ValueError("'tiered_link_count_range' min and max must be integers")
        if min_val < 1:
            raise ValueError("'tiered_link_count_range' min must be >= 1")
        if max_val < min_val:
            raise ValueError("'tiered_link_count_range' max must be >= min")

    @staticmethod
    def _parse_models(models_data) -> "Optional[ModelConfig]":
        """Build a ``ModelConfig`` from the 'models' object, or return None when it is absent."""
        if models_data is None:
            return None
        if not isinstance(models_data, dict):
            raise ValueError("'models' must be an object")
        if "title" not in models_data or "outline" not in models_data or "content" not in models_data:
            raise ValueError("'models' must have 'title', 'outline', and 'content' fields")
        return ModelConfig(
            title=models_data["title"],
            outline=models_data["outline"],
            content=models_data["content"]
        )

    def _parse_anchor_text_config(self, anchor_text_data, label: str) -> "AnchorTextConfig":
        """Validate an anchor_text_config object and build an ``AnchorTextConfig``.

        Args:
            anchor_text_data: The raw value from the job file.
            label: Name used in error messages, e.g. ``"anchor_text_config"``
                or ``"tier1.anchor_text_config"``.

        Raises:
            ValueError: If the value is not an object, lacks 'mode', has an
                unknown mode, or 'custom_text' is not an array.
        """
        if not isinstance(anchor_text_data, dict):
            raise ValueError(f"'{label}' must be an object")
        if "mode" not in anchor_text_data:
            raise ValueError(f"'{label}' must have 'mode' field")
        mode = anchor_text_data["mode"]
        if mode not in self._ANCHOR_TEXT_MODES:
            raise ValueError(f"'{label}' mode must be 'default', 'override', or 'append'")
        custom_text = anchor_text_data.get("custom_text")
        if custom_text is not None and not isinstance(custom_text, list):
            raise ValueError(f"'{label}' custom_text must be an array")
        return AnchorTextConfig(mode=mode, custom_text=custom_text)

    def _parse_failure_config(self, failure_data) -> "Optional[FailureConfig]":
        """Build a ``FailureConfig`` from the 'failure_config' object, or return None when absent."""
        if failure_data is None:
            return None
        if not isinstance(failure_data, dict):
            raise ValueError("'failure_config' must be an object")
        max_failures = failure_data.get("max_consecutive_failures", 5)
        skip_on_failure = failure_data.get("skip_on_failure", True)
        if not self._is_int(max_failures) or max_failures < 1:
            raise ValueError("'failure_config' max_consecutive_failures must be a positive integer")
        if not isinstance(skip_on_failure, bool):
            raise ValueError("'failure_config' skip_on_failure must be a boolean")
        return FailureConfig(
            max_consecutive_failures=max_failures,
            skip_on_failure=skip_on_failure
        )

    def _parse_interlinking(self, interlinking_data) -> "Optional[InterlinkingConfig]":
        """Build an ``InterlinkingConfig`` from the 'interlinking' object, or return None when absent."""
        if interlinking_data is None:
            return None
        if not isinstance(interlinking_data, dict):
            raise ValueError("'interlinking' must be an object")
        min_links = interlinking_data.get("links_per_article_min", 2)
        max_links = interlinking_data.get("links_per_article_max", 4)
        include_home = interlinking_data.get("include_home_link", True)
        see_also_min = interlinking_data.get("see_also_min", 4)
        see_also_max = interlinking_data.get("see_also_max", 5)
        if not self._is_int(min_links) or min_links < 0:
            raise ValueError("'interlinking' links_per_article_min must be a non-negative integer")
        if not self._is_int(max_links) or max_links < min_links:
            raise ValueError("'interlinking' links_per_article_max must be >= links_per_article_min")
        if not isinstance(include_home, bool):
            raise ValueError("'interlinking' include_home_link must be a boolean")
        if not self._is_int(see_also_min) or see_also_min < 0:
            raise ValueError("'interlinking' see_also_min must be a non-negative integer")
        if not self._is_int(see_also_max) or see_also_max < see_also_min:
            raise ValueError("'interlinking' see_also_max must be >= see_also_min")
        return InterlinkingConfig(
            links_per_article_min=min_links,
            links_per_article_max=max_links,
            include_home_link=include_home,
            see_also_min=see_also_min,
            see_also_max=see_also_max
        )

    def _parse_tiers(self, tiers_data) -> "dict[str, TierConfig]":
        """Parse the 'tiers' section in either object or array format.

        In array format a later entry with the same tier number replaces an
        earlier one, because results are stored by tier name.
        """
        tiers = {}
        if isinstance(tiers_data, dict):
            # Object format: {"tier1": {"count": 10, ...}}
            for tier_name, tier_data in tiers_data.items():
                tiers[tier_name] = self._parse_tier(tier_name, tier_data)
        elif isinstance(tiers_data, list):
            # Array format: [{"tier": 1, "article_count": 10, ...}]
            for tier_data in tiers_data:
                if not isinstance(tier_data, dict):
                    raise ValueError("Tier array items must be objects")
                if "tier" not in tier_data:
                    raise ValueError("Tier array items must have 'tier' field")
                tier_num = tier_data["tier"]
                tier_name = f"tier{tier_num}"
                tiers[tier_name] = self._parse_tier_from_array(tier_name, tier_data)
        return tiers

    def _parse_tier(self, tier_name: str, tier_data: dict) -> "TierConfig":
        """Parse tier configuration with defaults (object format)"""
        return self._build_tier(tier_name, tier_data, ("count",))

    def _parse_tier_from_array(self, tier_name: str, tier_data: dict) -> "TierConfig":
        """Parse tier configuration from array format"""
        # Array format uses "article_count" instead of "count"
        return self._build_tier(tier_name, tier_data, ("article_count", "count"))

    def _build_tier(self, tier_name: str, tier_data: dict, count_keys) -> "TierConfig":
        """Build a ``TierConfig``, filling missing limits from ``TIER_DEFAULTS``.

        Args:
            tier_name: Tier name such as ``"tier1"``; unknown names use the
                "tier3" defaults.
            tier_data: Raw tier object from the job file.
            count_keys: Keys to try, in order, for the article count; the
                count is 1 when none of them is present.

        Raises:
            ValueError: If the tier entry is not an object or its
                'anchor_text_config' is invalid.
        """
        if not isinstance(tier_data, dict):
            raise ValueError(f"'{tier_name}' must be an object")
        defaults = TIER_DEFAULTS.get(tier_name, TIER_DEFAULTS["tier3"])
        count = next((tier_data[key] for key in count_keys if key in tier_data), 1)

        # Unlike the job-level setting, a tier-level key that is present but
        # null is rejected by the object check inside the helper.
        anchor_text_config = None
        if "anchor_text_config" in tier_data:
            anchor_text_config = self._parse_anchor_text_config(
                tier_data["anchor_text_config"], f"{tier_name}.anchor_text_config"
            )

        return TierConfig(
            count=count,
            min_word_count=tier_data.get("min_word_count", defaults["min_word_count"]),
            max_word_count=tier_data.get("max_word_count", defaults["max_word_count"]),
            min_h2_tags=tier_data.get("min_h2_tags", defaults["min_h2_tags"]),
            max_h2_tags=tier_data.get("max_h2_tags", defaults["max_h2_tags"]),
            min_h3_tags=tier_data.get("min_h3_tags", defaults["min_h3_tags"]),
            max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]),
            anchor_text_config=anchor_text_config
        )

    def get_jobs(self) -> "list[Job]":
        """Return list of all jobs in file"""
        return self.jobs

    def get_tier_config(self, job: "Job", tier_name: str) -> "Optional[TierConfig]":
        """Return the parsed config for ``tier_name`` in ``job``, or None if the job has no such tier."""
        return job.tiers.get(tier_name)