""" Job configuration parser for batch content generation """ import json from dataclasses import dataclass from typing import Optional, Dict, Any, List from pathlib import Path TIER_DEFAULTS = { "tier1": { "min_word_count": 2000, "max_word_count": 2500, "min_h2_tags": 3, "max_h2_tags": 5, "min_h3_tags": 5, "max_h3_tags": 10 }, "tier2": { "min_word_count": 1100, "max_word_count": 1500, "min_h2_tags": 2, "max_h2_tags": 4, "min_h3_tags": 3, "max_h3_tags": 8 }, "tier3": { "min_word_count": 850, "max_word_count": 1350, "min_h2_tags": 2, "max_h2_tags": 3, "min_h3_tags": 2, "max_h3_tags": 6 } } @dataclass class ModelConfig: """AI model configuration for different generation stages""" title: str outline: str content: str @dataclass class AnchorTextConfig: """Anchor text configuration for interlinking""" mode: str # "default", "override", "append" custom_text: Optional[List[str]] = None @dataclass class FailureConfig: """Configuration for handling generation failures""" max_consecutive_failures: int = 5 skip_on_failure: bool = True @dataclass class InterlinkingConfig: """Configuration for article interlinking""" links_per_article_min: int = 2 links_per_article_max: int = 4 include_home_link: bool = True see_also_min: int = 4 see_also_max: int = 5 @dataclass class TierConfig: """Configuration for a specific tier""" count: int min_word_count: int max_word_count: int min_h2_tags: int max_h2_tags: int min_h3_tags: int max_h3_tags: int anchor_text_config: Optional[AnchorTextConfig] = None @dataclass class Job: """Job definition for content generation""" project_id: int tiers: Dict[str, TierConfig] models: Optional[ModelConfig] = None deployment_targets: Optional[List[str]] = None tier1_preferred_sites: Optional[List[str]] = None auto_create_sites: bool = False create_sites_for_keywords: Optional[List[Dict[str, any]]] = None tiered_link_count_range: Optional[Dict[str, int]] = None anchor_text_config: Optional[AnchorTextConfig] = None failure_config: Optional[FailureConfig] = None interlinking: Optional[InterlinkingConfig] = None max_workers: Optional[int] = None class JobConfig: """Parser for job configuration files""" def __init__(self, job_file_path: str): """ Load and parse job file, apply defaults Args: job_file_path: Path to JSON job file """ self.job_file_path = Path(job_file_path) self.jobs: list[Job] = [] self._load() def _load(self): """Load and parse the job file""" if not self.job_file_path.exists(): raise FileNotFoundError(f"Job file not found: {self.job_file_path}") with open(self.job_file_path, 'r', encoding='utf-8') as f: data = json.load(f) # Handle both array format and single job format if "jobs" in data: # Array format: {"jobs": [{"project_id": 1, "tiers": {...}}]} if not isinstance(data["jobs"], list): raise ValueError("'jobs' must be an array") for job_data in data["jobs"]: self._validate_job(job_data) job = self._parse_job(job_data) self.jobs.append(job) elif "project_id" in data: # Single job format: {"project_id": 1, "tiers": [...], "models": {...}} self._validate_job(data) job = self._parse_job(data) self.jobs.append(job) else: raise ValueError("Job file must contain either 'jobs' array or 'project_id' field") def _validate_job(self, job_data: dict): """Validate job structure""" if "project_id" not in job_data: raise ValueError("Job missing 'project_id'") if "tiers" not in job_data: raise ValueError("Job missing 'tiers'") # Handle both object format {"tier1": {...}} and array format [{"tier": 1, ...}] tiers_data = job_data["tiers"] if not isinstance(tiers_data, (dict, list)): raise ValueError("'tiers' must be a dictionary or array") def _parse_job(self, job_data: dict) -> Job: """Parse a single job""" project_id = job_data["project_id"] tiers = {} tiers_data = job_data["tiers"] if isinstance(tiers_data, dict): # Object format: {"tier1": {"count": 10, ...}} for tier_name, tier_data in tiers_data.items(): tier_config = self._parse_tier(tier_name, tier_data) tiers[tier_name] = tier_config elif isinstance(tiers_data, list): # Array format: [{"tier": 1, "article_count": 10, ...}] for tier_data in tiers_data: if "tier" not in tier_data: raise ValueError("Tier array items must have 'tier' field") tier_num = tier_data["tier"] tier_name = f"tier{tier_num}" tier_config = self._parse_tier_from_array(tier_name, tier_data) tiers[tier_name] = tier_config deployment_targets = job_data.get("deployment_targets") if deployment_targets is not None: if not isinstance(deployment_targets, list): raise ValueError("'deployment_targets' must be an array") if not all(isinstance(item, str) for item in deployment_targets): raise ValueError("'deployment_targets' must be an array of strings") tier1_preferred_sites = job_data.get("tier1_preferred_sites") if tier1_preferred_sites is not None: if not isinstance(tier1_preferred_sites, list): raise ValueError("'tier1_preferred_sites' must be an array") if not all(isinstance(item, str) for item in tier1_preferred_sites): raise ValueError("'tier1_preferred_sites' must be an array of strings") auto_create_sites = job_data.get("auto_create_sites", False) if not isinstance(auto_create_sites, bool): raise ValueError("'auto_create_sites' must be a boolean") create_sites_for_keywords = job_data.get("create_sites_for_keywords") if create_sites_for_keywords is not None: if not isinstance(create_sites_for_keywords, list): raise ValueError("'create_sites_for_keywords' must be an array") for kw_config in create_sites_for_keywords: if not isinstance(kw_config, dict): raise ValueError("Each item in 'create_sites_for_keywords' must be an object") if "keyword" not in kw_config or "count" not in kw_config: raise ValueError("Each item in 'create_sites_for_keywords' must have 'keyword' and 'count'") tiered_link_count_range = job_data.get("tiered_link_count_range") if tiered_link_count_range is not None: if not isinstance(tiered_link_count_range, dict): raise ValueError("'tiered_link_count_range' must be an object") if "min" not in tiered_link_count_range or "max" not in tiered_link_count_range: raise ValueError("'tiered_link_count_range' must have 'min' and 'max' fields") min_val = tiered_link_count_range["min"] max_val = tiered_link_count_range["max"] if not isinstance(min_val, int) or not isinstance(max_val, int): raise ValueError("'tiered_link_count_range' min and max must be integers") if min_val < 1: raise ValueError("'tiered_link_count_range' min must be >= 1") if max_val < min_val: raise ValueError("'tiered_link_count_range' max must be >= min") # Parse models configuration models = None models_data = job_data.get("models") if models_data is not None: if not isinstance(models_data, dict): raise ValueError("'models' must be an object") if "title" not in models_data or "outline" not in models_data or "content" not in models_data: raise ValueError("'models' must have 'title', 'outline', and 'content' fields") models = ModelConfig( title=models_data["title"], outline=models_data["outline"], content=models_data["content"] ) # Parse anchor text configuration anchor_text_config = None anchor_text_data = job_data.get("anchor_text_config") if anchor_text_data is not None: if not isinstance(anchor_text_data, dict): raise ValueError("'anchor_text_config' must be an object") if "mode" not in anchor_text_data: raise ValueError("'anchor_text_config' must have 'mode' field") mode = anchor_text_data["mode"] if mode not in ["default", "override", "append"]: raise ValueError("'anchor_text_config' mode must be 'default', 'override', or 'append'") custom_text = anchor_text_data.get("custom_text") if custom_text is not None and not isinstance(custom_text, list): raise ValueError("'anchor_text_config' custom_text must be an array") anchor_text_config = AnchorTextConfig(mode=mode, custom_text=custom_text) # Parse failure configuration failure_config = None failure_data = job_data.get("failure_config") if failure_data is not None: if not isinstance(failure_data, dict): raise ValueError("'failure_config' must be an object") max_failures = failure_data.get("max_consecutive_failures", 5) skip_on_failure = failure_data.get("skip_on_failure", True) if not isinstance(max_failures, int) or max_failures < 1: raise ValueError("'failure_config' max_consecutive_failures must be a positive integer") if not isinstance(skip_on_failure, bool): raise ValueError("'failure_config' skip_on_failure must be a boolean") failure_config = FailureConfig( max_consecutive_failures=max_failures, skip_on_failure=skip_on_failure ) # Parse interlinking configuration interlinking = None interlinking_data = job_data.get("interlinking") if interlinking_data is not None: if not isinstance(interlinking_data, dict): raise ValueError("'interlinking' must be an object") min_links = interlinking_data.get("links_per_article_min", 2) max_links = interlinking_data.get("links_per_article_max", 4) include_home = interlinking_data.get("include_home_link", True) see_also_min = interlinking_data.get("see_also_min", 4) see_also_max = interlinking_data.get("see_also_max", 5) if not isinstance(min_links, int) or min_links < 0: raise ValueError("'interlinking' links_per_article_min must be a non-negative integer") if not isinstance(max_links, int) or max_links < min_links: raise ValueError("'interlinking' links_per_article_max must be >= links_per_article_min") if not isinstance(include_home, bool): raise ValueError("'interlinking' include_home_link must be a boolean") if not isinstance(see_also_min, int) or see_also_min < 0: raise ValueError("'interlinking' see_also_min must be a non-negative integer") if not isinstance(see_also_max, int) or see_also_max < see_also_min: raise ValueError("'interlinking' see_also_max must be >= see_also_min") interlinking = InterlinkingConfig( links_per_article_min=min_links, links_per_article_max=max_links, include_home_link=include_home, see_also_min=see_also_min, see_also_max=see_also_max ) max_workers = job_data.get("max_workers") if max_workers is not None: if not isinstance(max_workers, int) or max_workers < 1: raise ValueError("'max_workers' must be a positive integer") return Job( project_id=project_id, tiers=tiers, models=models, deployment_targets=deployment_targets, tier1_preferred_sites=tier1_preferred_sites, auto_create_sites=auto_create_sites, create_sites_for_keywords=create_sites_for_keywords, tiered_link_count_range=tiered_link_count_range, anchor_text_config=anchor_text_config, failure_config=failure_config, interlinking=interlinking, max_workers=max_workers ) def _parse_tier(self, tier_name: str, tier_data: dict) -> TierConfig: """Parse tier configuration with defaults (object format)""" defaults = TIER_DEFAULTS.get(tier_name, TIER_DEFAULTS["tier3"]) # Parse tier-level anchor_text_config if present anchor_text_config = None if "anchor_text_config" in tier_data: anchor_text_data = tier_data["anchor_text_config"] if not isinstance(anchor_text_data, dict): raise ValueError(f"'{tier_name}.anchor_text_config' must be an object") if "mode" not in anchor_text_data: raise ValueError(f"'{tier_name}.anchor_text_config' must have 'mode' field") mode = anchor_text_data["mode"] if mode not in ["default", "override", "append"]: raise ValueError(f"'{tier_name}.anchor_text_config' mode must be 'default', 'override', or 'append'") custom_text = anchor_text_data.get("custom_text") if custom_text is not None and not isinstance(custom_text, list): raise ValueError(f"'{tier_name}.anchor_text_config' custom_text must be an array") anchor_text_config = AnchorTextConfig(mode=mode, custom_text=custom_text) return TierConfig( count=tier_data.get("count", 1), min_word_count=tier_data.get("min_word_count", defaults["min_word_count"]), max_word_count=tier_data.get("max_word_count", defaults["max_word_count"]), min_h2_tags=tier_data.get("min_h2_tags", defaults["min_h2_tags"]), max_h2_tags=tier_data.get("max_h2_tags", defaults["max_h2_tags"]), min_h3_tags=tier_data.get("min_h3_tags", defaults["min_h3_tags"]), max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]), anchor_text_config=anchor_text_config ) def _parse_tier_from_array(self, tier_name: str, tier_data: dict) -> TierConfig: """Parse tier configuration from array format""" defaults = TIER_DEFAULTS.get(tier_name, TIER_DEFAULTS["tier3"]) # Array format uses "article_count" instead of "count" count = tier_data.get("article_count", tier_data.get("count", 1)) # Parse tier-level anchor_text_config if present anchor_text_config = None if "anchor_text_config" in tier_data: anchor_text_data = tier_data["anchor_text_config"] if not isinstance(anchor_text_data, dict): raise ValueError(f"'{tier_name}.anchor_text_config' must be an object") if "mode" not in anchor_text_data: raise ValueError(f"'{tier_name}.anchor_text_config' must have 'mode' field") mode = anchor_text_data["mode"] if mode not in ["default", "override", "append"]: raise ValueError(f"'{tier_name}.anchor_text_config' mode must be 'default', 'override', or 'append'") custom_text = anchor_text_data.get("custom_text") if custom_text is not None and not isinstance(custom_text, list): raise ValueError(f"'{tier_name}.anchor_text_config' custom_text must be an array") anchor_text_config = AnchorTextConfig(mode=mode, custom_text=custom_text) return TierConfig( count=count, min_word_count=tier_data.get("min_word_count", defaults["min_word_count"]), max_word_count=tier_data.get("max_word_count", defaults["max_word_count"]), min_h2_tags=tier_data.get("min_h2_tags", defaults["min_h2_tags"]), max_h2_tags=tier_data.get("max_h2_tags", defaults["max_h2_tags"]), min_h3_tags=tier_data.get("min_h3_tags", defaults["min_h3_tags"]), max_h3_tags=tier_data.get("max_h3_tags", defaults["max_h3_tags"]), anchor_text_config=anchor_text_config ) def get_jobs(self) -> list[Job]: """Return list of all jobs in file""" return self.jobs def get_tier_config(self, job: Job, tier_name: str) -> Optional[TierConfig]: """Get tier config with defaults applied""" return job.tiers.get(tier_name)