383 lines
17 KiB
Python
383 lines
17 KiB
Python
"""
|
|
Job configuration parser for batch content generation
|
|
"""
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from typing import Optional, Dict, Any, List
|
|
from pathlib import Path
|
|
|
|
# Per-tier fallback limits, keyed by tier name. Each entry supplies the
# word-count and heading-count bounds used when a tier definition in the job
# file omits the corresponding field.
TIER_DEFAULTS = {
    "tier1": {
        "min_word_count": 2000,
        "max_word_count": 2500,
        "min_h2_tags": 3,
        "max_h2_tags": 5,
        "min_h3_tags": 5,
        "max_h3_tags": 10,
    },
    "tier2": {
        "min_word_count": 1100,
        "max_word_count": 1500,
        "min_h2_tags": 2,
        "max_h2_tags": 4,
        "min_h3_tags": 3,
        "max_h3_tags": 8,
    },
    "tier3": {
        "min_word_count": 850,
        "max_word_count": 1350,
        "min_h2_tags": 2,
        "max_h2_tags": 3,
        "min_h3_tags": 2,
        "max_h3_tags": 6,
    },
}
|
|
|
|
|
|
@dataclass
class ModelConfig:
    """AI model selection for the individual generation stages.

    All three fields are required; one is given per stage (title, outline,
    content).
    """

    # Model used for the title stage.
    title: str
    # Model used for the outline stage.
    outline: str
    # Model used for the content stage.
    content: str
|
|
|
|
|
|
@dataclass
class AnchorTextConfig:
    """Anchor text settings used for interlinking."""

    # Expected values: "default", "override", "append".
    mode: str
    # Optional list of custom anchor strings; None when not supplied.
    custom_text: Optional[List[str]] = None
|
|
|
|
|
|
@dataclass
class FailureConfig:
    """Settings that govern how generation failures are handled."""

    # Limit on consecutive failures (defaults to 5).
    max_consecutive_failures: int = 5
    # Whether a failed item is skipped (defaults to True).
    skip_on_failure: bool = True
|
|
|
|
|
|
@dataclass
class InterlinkingConfig:
    """Settings for interlinking between articles."""

    # Lower and upper bound on links per article.
    links_per_article_min: int = 2
    links_per_article_max: int = 4
    # Whether a home link is included.
    include_home_link: bool = True
    # Lower and upper bound for the "see also" entries.
    see_also_min: int = 4
    see_also_max: int = 5
|
|
|
|
|
|
@dataclass
class TierConfig:
    """Settings for one tier of a job."""

    # Number of articles requested for this tier.
    count: int
    # Word-count bounds.
    min_word_count: int
    max_word_count: int
    # Bounds on the number of H2 headings.
    min_h2_tags: int
    max_h2_tags: int
    # Bounds on the number of H3 headings.
    min_h3_tags: int
    max_h3_tags: int
    # Tier-level anchor text settings; None when the tier does not define any.
    anchor_text_config: Optional[AnchorTextConfig] = None
|
|
|
|
|
|
@dataclass
class Job:
    """Job definition for content generation.

    Only ``project_id`` and ``tiers`` are required; every other field is
    optional and defaults to None (or False for ``auto_create_sites``).
    """

    # Identifier of the project this job belongs to.
    project_id: int
    # Tier name (e.g. "tier1") -> settings for that tier.
    tiers: Dict[str, TierConfig]
    # Model selection per generation stage.
    models: Optional[ModelConfig] = None
    # List of deployment target names.
    deployment_targets: Optional[List[str]] = None
    # List of preferred site names for tier 1.
    tier1_preferred_sites: Optional[List[str]] = None
    # Flag controlling automatic site creation.
    auto_create_sites: bool = False
    # One object per keyword. Values are free-form, so the value type is
    # typing.Any (the previous annotation used the builtin function ``any``).
    create_sites_for_keywords: Optional[List[Dict[str, Any]]] = None
    # Link count range as a mapping with integer bounds.
    tiered_link_count_range: Optional[Dict[str, int]] = None
    # Job-level anchor text settings.
    anchor_text_config: Optional[AnchorTextConfig] = None
    # Failure handling settings.
    failure_config: Optional[FailureConfig] = None
    # Article interlinking settings.
    interlinking: Optional[InterlinkingConfig] = None
    # Worker count limit; None when the job does not specify one.
    max_workers: Optional[int] = None
|
|
|
|
|
|
class JobConfig:
    """Parser for job configuration files.

    Reads a JSON job file, validates its structure and converts every job
    entry into a ``Job`` with tier defaults applied. Two layouts are accepted:

    * array format: ``{"jobs": [{"project_id": 1, "tiers": {...}}, ...]}``
    * single job format: ``{"project_id": 1, "tiers": ..., ...}``

    Structural problems in the file are reported as ``ValueError``.
    """

    # Anchor text modes accepted at both job level and tier level.
    _ANCHOR_TEXT_MODES = ("default", "override", "append")

    # Tier fields that fall back to TIER_DEFAULTS when omitted.
    _TIER_LIMIT_FIELDS = (
        "min_word_count",
        "max_word_count",
        "min_h2_tags",
        "max_h2_tags",
        "min_h3_tags",
        "max_h3_tags",
    )

    def __init__(self, job_file_path: str):
        """
        Load and parse job file, apply defaults

        Args:
            job_file_path: Path to JSON job file

        Raises:
            FileNotFoundError: If the job file does not exist.
            ValueError: If the file content is not a valid job definition.
        """
        self.job_file_path = Path(job_file_path)
        self.jobs: list[Job] = []
        self._load()

    def _load(self):
        """Load and parse the job file, appending every job to ``self.jobs``."""
        if not self.job_file_path.exists():
            raise FileNotFoundError(f"Job file not found: {self.job_file_path}")

        with open(self.job_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # A top-level array or scalar can never be a job definition; report it
        # as a format error rather than failing later with a TypeError.
        if not isinstance(data, dict):
            raise ValueError("Job file must contain either 'jobs' array or 'project_id' field")

        # Handle both array format and single job format
        if "jobs" in data:
            # Array format: {"jobs": [{"project_id": 1, "tiers": {...}}]}
            if not isinstance(data["jobs"], list):
                raise ValueError("'jobs' must be an array")
            job_entries = data["jobs"]
        elif "project_id" in data:
            # Single job format: {"project_id": 1, "tiers": [...], "models": {...}}
            job_entries = [data]
        else:
            raise ValueError("Job file must contain either 'jobs' array or 'project_id' field")

        for job_data in job_entries:
            self._validate_job(job_data)
            self.jobs.append(self._parse_job(job_data))

    def _validate_job(self, job_data: dict):
        """Validate the basic structure of one job entry.

        Raises:
            ValueError: If the entry is not an object, lacks 'project_id' or
                'tiers', or 'tiers' is neither an object nor an array.
        """
        if not isinstance(job_data, dict):
            raise ValueError("Each job must be an object")

        if "project_id" not in job_data:
            raise ValueError("Job missing 'project_id'")

        if "tiers" not in job_data:
            raise ValueError("Job missing 'tiers'")

        # Handle both object format {"tier1": {...}} and array format [{"tier": 1, ...}]
        if not isinstance(job_data["tiers"], (dict, list)):
            raise ValueError("'tiers' must be a dictionary or array")

    def _parse_job(self, job_data: dict) -> Job:
        """Parse a single job entry into a ``Job``.

        Sections are validated in a fixed order, so the first invalid section
        in that order determines which ``ValueError`` is raised.
        """
        project_id = job_data["project_id"]
        tiers = self._parse_tiers(job_data["tiers"])
        deployment_targets = self._parse_string_list(job_data, "deployment_targets")
        tier1_preferred_sites = self._parse_string_list(job_data, "tier1_preferred_sites")

        auto_create_sites = job_data.get("auto_create_sites", False)
        if not isinstance(auto_create_sites, bool):
            raise ValueError("'auto_create_sites' must be a boolean")

        create_sites_for_keywords = self._parse_create_sites_for_keywords(job_data)
        tiered_link_count_range = self._parse_tiered_link_count_range(job_data)
        models = self._parse_models(job_data)

        # Job-level anchor text config: a missing or null value means "not set".
        anchor_text_config = None
        anchor_text_data = job_data.get("anchor_text_config")
        if anchor_text_data is not None:
            anchor_text_config = self._parse_anchor_text_config(
                anchor_text_data, "anchor_text_config"
            )

        failure_config = self._parse_failure_config(job_data)
        interlinking = self._parse_interlinking(job_data)

        max_workers = job_data.get("max_workers")
        if max_workers is not None:
            if not self._is_int(max_workers) or max_workers < 1:
                raise ValueError("'max_workers' must be a positive integer")

        return Job(
            project_id=project_id,
            tiers=tiers,
            models=models,
            deployment_targets=deployment_targets,
            tier1_preferred_sites=tier1_preferred_sites,
            auto_create_sites=auto_create_sites,
            create_sites_for_keywords=create_sites_for_keywords,
            tiered_link_count_range=tiered_link_count_range,
            anchor_text_config=anchor_text_config,
            failure_config=failure_config,
            interlinking=interlinking,
            max_workers=max_workers,
        )

    @staticmethod
    def _is_int(value) -> bool:
        """Return True for real integers; booleans are excluded.

        ``bool`` is a subclass of ``int``, so a plain ``isinstance`` check
        would accept JSON ``true``/``false`` where a number is required.
        """
        return isinstance(value, int) and not isinstance(value, bool)

    def _parse_tiers(self, tiers_data) -> Dict[str, TierConfig]:
        """Parse the 'tiers' section in object or array format."""
        tiers = {}

        if isinstance(tiers_data, dict):
            # Object format: {"tier1": {"count": 10, ...}}
            for tier_name, tier_data in tiers_data.items():
                tiers[tier_name] = self._parse_tier(tier_name, tier_data)
        elif isinstance(tiers_data, list):
            # Array format: [{"tier": 1, "article_count": 10, ...}]
            for tier_data in tiers_data:
                # The dict check matters: for a string item, `"tier" in item`
                # would be a substring test and could pass by accident.
                if not isinstance(tier_data, dict) or "tier" not in tier_data:
                    raise ValueError("Tier array items must have 'tier' field")
                tier_name = f"tier{tier_data['tier']}"
                tiers[tier_name] = self._parse_tier_from_array(tier_name, tier_data)

        return tiers

    @staticmethod
    def _parse_string_list(job_data: dict, key: str) -> Optional[List[str]]:
        """Return the optional list of strings stored under ``key``."""
        values = job_data.get(key)
        if values is not None:
            if not isinstance(values, list):
                raise ValueError(f"'{key}' must be an array")
            if not all(isinstance(item, str) for item in values):
                raise ValueError(f"'{key}' must be an array of strings")
        return values

    @staticmethod
    def _parse_create_sites_for_keywords(job_data: dict) -> Optional[List[Dict[str, Any]]]:
        """Validate and return the optional 'create_sites_for_keywords' list."""
        create_sites_for_keywords = job_data.get("create_sites_for_keywords")
        if create_sites_for_keywords is not None:
            if not isinstance(create_sites_for_keywords, list):
                raise ValueError("'create_sites_for_keywords' must be an array")
            for kw_config in create_sites_for_keywords:
                if not isinstance(kw_config, dict):
                    raise ValueError("Each item in 'create_sites_for_keywords' must be an object")
                if "keyword" not in kw_config or "count" not in kw_config:
                    raise ValueError("Each item in 'create_sites_for_keywords' must have 'keyword' and 'count'")
        return create_sites_for_keywords

    def _parse_tiered_link_count_range(self, job_data: dict) -> Optional[Dict[str, int]]:
        """Validate and return the optional 'tiered_link_count_range' object."""
        tiered_link_count_range = job_data.get("tiered_link_count_range")
        if tiered_link_count_range is not None:
            if not isinstance(tiered_link_count_range, dict):
                raise ValueError("'tiered_link_count_range' must be an object")
            if "min" not in tiered_link_count_range or "max" not in tiered_link_count_range:
                raise ValueError("'tiered_link_count_range' must have 'min' and 'max' fields")
            min_val = tiered_link_count_range["min"]
            max_val = tiered_link_count_range["max"]
            if not self._is_int(min_val) or not self._is_int(max_val):
                raise ValueError("'tiered_link_count_range' min and max must be integers")
            if min_val < 1:
                raise ValueError("'tiered_link_count_range' min must be >= 1")
            if max_val < min_val:
                raise ValueError("'tiered_link_count_range' max must be >= min")
        return tiered_link_count_range

    @staticmethod
    def _parse_models(job_data: dict) -> Optional[ModelConfig]:
        """Parse the optional 'models' section."""
        models_data = job_data.get("models")
        if models_data is None:
            return None
        if not isinstance(models_data, dict):
            raise ValueError("'models' must be an object")
        if "title" not in models_data or "outline" not in models_data or "content" not in models_data:
            raise ValueError("'models' must have 'title', 'outline', and 'content' fields")
        return ModelConfig(
            title=models_data["title"],
            outline=models_data["outline"],
            content=models_data["content"],
        )

    def _parse_anchor_text_config(self, anchor_text_data, label: str) -> AnchorTextConfig:
        """Validate an anchor text object and build an ``AnchorTextConfig``.

        Args:
            anchor_text_data: The raw value from the job file.
            label: Name used in error messages, e.g. "anchor_text_config" or
                "tier1.anchor_text_config".
        """
        if not isinstance(anchor_text_data, dict):
            raise ValueError(f"'{label}' must be an object")
        if "mode" not in anchor_text_data:
            raise ValueError(f"'{label}' must have 'mode' field")
        mode = anchor_text_data["mode"]
        if mode not in self._ANCHOR_TEXT_MODES:
            raise ValueError(f"'{label}' mode must be 'default', 'override', or 'append'")
        custom_text = anchor_text_data.get("custom_text")
        if custom_text is not None and not isinstance(custom_text, list):
            raise ValueError(f"'{label}' custom_text must be an array")
        return AnchorTextConfig(mode=mode, custom_text=custom_text)

    def _parse_failure_config(self, job_data: dict) -> Optional[FailureConfig]:
        """Parse the optional 'failure_config' section."""
        failure_data = job_data.get("failure_config")
        if failure_data is None:
            return None
        if not isinstance(failure_data, dict):
            raise ValueError("'failure_config' must be an object")
        max_failures = failure_data.get("max_consecutive_failures", 5)
        skip_on_failure = failure_data.get("skip_on_failure", True)
        if not self._is_int(max_failures) or max_failures < 1:
            raise ValueError("'failure_config' max_consecutive_failures must be a positive integer")
        if not isinstance(skip_on_failure, bool):
            raise ValueError("'failure_config' skip_on_failure must be a boolean")
        return FailureConfig(
            max_consecutive_failures=max_failures,
            skip_on_failure=skip_on_failure,
        )

    def _parse_interlinking(self, job_data: dict) -> Optional[InterlinkingConfig]:
        """Parse the optional 'interlinking' section."""
        interlinking_data = job_data.get("interlinking")
        if interlinking_data is None:
            return None
        if not isinstance(interlinking_data, dict):
            raise ValueError("'interlinking' must be an object")
        min_links = interlinking_data.get("links_per_article_min", 2)
        max_links = interlinking_data.get("links_per_article_max", 4)
        include_home = interlinking_data.get("include_home_link", True)
        see_also_min = interlinking_data.get("see_also_min", 4)
        see_also_max = interlinking_data.get("see_also_max", 5)
        # Each minimum is checked before its maximum so the comparison below
        # only ever runs against a validated integer.
        if not self._is_int(min_links) or min_links < 0:
            raise ValueError("'interlinking' links_per_article_min must be a non-negative integer")
        if not self._is_int(max_links) or max_links < min_links:
            raise ValueError("'interlinking' links_per_article_max must be >= links_per_article_min")
        if not isinstance(include_home, bool):
            raise ValueError("'interlinking' include_home_link must be a boolean")
        if not self._is_int(see_also_min) or see_also_min < 0:
            raise ValueError("'interlinking' see_also_min must be a non-negative integer")
        if not self._is_int(see_also_max) or see_also_max < see_also_min:
            raise ValueError("'interlinking' see_also_max must be >= see_also_min")
        return InterlinkingConfig(
            links_per_article_min=min_links,
            links_per_article_max=max_links,
            include_home_link=include_home,
            see_also_min=see_also_min,
            see_also_max=see_also_max,
        )

    def _build_tier_config(self, tier_name: str, tier_data: dict, count) -> TierConfig:
        """Build a ``TierConfig``, filling omitted limits from TIER_DEFAULTS.

        Tier names without an entry in TIER_DEFAULTS use the "tier3" defaults.
        """
        defaults = TIER_DEFAULTS.get(tier_name, TIER_DEFAULTS["tier3"])

        # Parse tier-level anchor_text_config if present. Unlike the job-level
        # setting, a present-but-null value is rejected as "must be an object".
        anchor_text_config = None
        if "anchor_text_config" in tier_data:
            anchor_text_config = self._parse_anchor_text_config(
                tier_data["anchor_text_config"], f"{tier_name}.anchor_text_config"
            )

        limits = {
            field: tier_data.get(field, defaults[field])
            for field in self._TIER_LIMIT_FIELDS
        }
        return TierConfig(count=count, anchor_text_config=anchor_text_config, **limits)

    @staticmethod
    def _require_tier_object(tier_name: str, tier_data) -> None:
        """Raise ``ValueError`` unless the tier definition is an object."""
        if not isinstance(tier_data, dict):
            raise ValueError(f"'{tier_name}' must be an object")

    def _parse_tier(self, tier_name: str, tier_data: dict) -> TierConfig:
        """Parse tier configuration with defaults (object format)"""
        self._require_tier_object(tier_name, tier_data)
        return self._build_tier_config(tier_name, tier_data, tier_data.get("count", 1))

    def _parse_tier_from_array(self, tier_name: str, tier_data: dict) -> TierConfig:
        """Parse tier configuration from array format"""
        self._require_tier_object(tier_name, tier_data)
        # Array format uses "article_count" instead of "count"
        count = tier_data.get("article_count", tier_data.get("count", 1))
        return self._build_tier_config(tier_name, tier_data, count)

    def get_jobs(self) -> list[Job]:
        """Return list of all jobs in file"""
        return self.jobs

    def get_tier_config(self, job: Job, tier_name: str) -> Optional[TierConfig]:
        """Return the parsed config for ``tier_name``, or None if the job has no such tier."""
        return job.tiers.get(tier_name)