"""Link building pipeline tools. Orchestrates the Big-Link-Man CLI tool for automated link building. Primary workflow: ingest CORA .xlsx → generate content batch. """ from __future__ import annotations import logging import os import re import subprocess from collections.abc import Callable from pathlib import Path from . import tool log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Private helpers # --------------------------------------------------------------------------- def _get_blm_dir(ctx: dict | None) -> str: """Resolve the Big-Link-Man directory from config or env.""" if ctx and "config" in ctx: return ctx["config"].link_building.blm_dir return os.getenv("BLM_DIR", "E:/dev/Big-Link-Man") def _get_blm_timeout(ctx: dict | None) -> int: """Get BLM subprocess timeout from config or default (1800s / 30 min).""" if ctx and "config" in ctx: return ctx["config"].timeouts.blm return 1800 def _run_blm_command( args: list[str], blm_dir: str, timeout: int = 1800 ) -> subprocess.CompletedProcess: """Run a Big-Link-Man CLI command via subprocess. Always injects -u/-p from BLM_USERNAME/BLM_PASSWORD env vars. """ # Use BLM's own venv Python so its dependencies are available venv_python = Path(blm_dir) / ".venv" / "Scripts" / "python.exe" if not venv_python.exists(): # Fallback for Linux/Mac venv_python = Path(blm_dir) / ".venv" / "bin" / "python" if not venv_python.exists(): raise FileNotFoundError( f"No .venv found in {blm_dir}. External tools must have their own venv." ) cmd = [str(venv_python), "main.py", *args] # Inject credentials from env vars username = os.getenv("BLM_USERNAME", "") password = os.getenv("BLM_PASSWORD", "") if username and "-u" not in args and "--username" not in args: cmd.extend(["-u", username]) if password and "-p" not in args and "--password" not in args: cmd.extend(["-p", password]) log.info("Running BLM command: %s (cwd=%s)", " ".join(cmd), blm_dir) result = subprocess.run( cmd, cwd=blm_dir, capture_output=True, text=True, timeout=timeout, ) log.info("BLM exit code: %d", result.returncode) if result.stdout: log.debug("BLM stdout: %s", result.stdout[:1000]) if result.stderr: log.debug("BLM stderr: %s", result.stderr[:1000]) return result def _build_ingest_args( xlsx_path: str, project_name: str, money_site_url: str = "", branded_plus_ratio: float = 0.7, custom_anchors: str = "", cli_flags: str = "", ) -> list[str]: """Construct CLI argument list for ingest-cora command.""" args = ["ingest-cora", "-f", xlsx_path, "-n", project_name] if money_site_url: args.extend(["-m", money_site_url]) if branded_plus_ratio and branded_plus_ratio != 0.7: args.extend(["-bp", str(branded_plus_ratio)]) if custom_anchors: args.extend(["-a", custom_anchors]) # Parse any additional CLI flags if cli_flags: extra = cli_flags.strip().split() args.extend(extra) return args def _parse_ingest_output(stdout: str) -> dict: """Parse ingest-cora stdout to extract project_id and job_file path. Returns dict with keys: project_id, job_file, project_name, main_keyword """ result = { "project_id": "", "job_file": "", "project_name": "", "main_keyword": "", } for line in stdout.splitlines(): line = line.strip() # Success: Project 'My Project' created (ID: 42) m = re.match(r"^Success: Project '(.+)' created \(ID: (\d+)\)$", line) if m: result["project_name"] = m.group(1) result["project_id"] = m.group(2) continue # Job file created: jobs/my-project.json m = re.match(r"^Job file created: (.+)$", line) if m: result["job_file"] = m.group(1).strip() continue # Main Keyword: precision cnc machining m = re.match(r"^Main Keyword: (.+)$", line) if m: result["main_keyword"] = m.group(1).strip() continue return result def _parse_generate_output(stdout: str) -> dict: """Parse generate-batch stdout to extract completion stats. Returns dict with keys: job_moved_to, success (bool), raw_output """ result = { "job_moved_to": "", "success": False, "raw_output": stdout, } for line in stdout.splitlines(): line = line.strip() # Job file moved to: jobs/done/my-project.json m = re.match(r"^Job file moved to: (.+)$", line) if m: result["job_moved_to"] = m.group(1).strip() result["success"] = True continue return result def _set_status(ctx: dict | None, message: str) -> None: """Log pipeline progress. Previously wrote to KV; now just logs.""" if message: log.info("[LB Pipeline] %s", message) def _get_clickup_client(ctx: dict | None): """Create a ClickUpClient from tool context, or None if unavailable.""" if not ctx or not ctx.get("config") or not ctx["config"].clickup.enabled: return None try: from ..clickup import ClickUpClient config = ctx["config"] return ClickUpClient( api_token=config.clickup.api_token, workspace_id=config.clickup.workspace_id, task_type_field_name=config.clickup.task_type_field_name, ) except Exception as e: log.warning("Could not create ClickUp client: %s", e) return None def _sync_clickup(ctx: dict | None, task_id: str, step: str, message: str) -> None: """Post a progress comment to ClickUp.""" if not task_id or not ctx: return cu_client = _get_clickup_client(ctx) if cu_client: try: cu_client.add_comment(task_id, message) except Exception as e: log.warning("ClickUp comment failed for task %s: %s", task_id, e) finally: cu_client.close() def _find_clickup_task(ctx: dict, keyword: str) -> str: """Find a ClickUp Link Building task matching the given keyword. Looks for "to do" tasks with Work Category == "Link Building" and the Keyword custom field fuzzy-matching the keyword param. Returns task_id if found, else "". """ cu_client = _get_clickup_client(ctx) if not cu_client: return "" config = ctx.get("config") if not config or not config.clickup.space_id: return "" try: tasks = cu_client.get_tasks_from_space( config.clickup.space_id, statuses=["to do"], ) except Exception as e: log.warning("ClickUp query failed in _find_clickup_task: %s", e) return "" finally: cu_client.close() keyword_norm = _normalize_for_match(keyword) for task in tasks: if task.task_type != "Link Building": continue task_keyword = task.custom_fields.get("Keyword", "") if not task_keyword: continue if _fuzzy_keyword_match(keyword_norm, _normalize_for_match(str(task_keyword))): # Found a match — move to "automation underway" task_id = task.id # Move to "automation underway" cu_client2 = _get_clickup_client(ctx) if cu_client2: try: cu_client2.update_task_status(task_id, config.clickup.automation_status) except Exception as e: log.warning("Failed to update ClickUp status for %s: %s", task_id, e) finally: cu_client2.close() log.info("Auto-matched ClickUp task %s for keyword '%s'", task_id, keyword) return task_id return "" def _normalize_for_match(text: str) -> str: """Normalize text for fuzzy matching: lowercase, strip non-alnum, collapse spaces.""" text = text.lower().strip() text = re.sub(r"[^a-z0-9\s]", " ", text) text = re.sub(r"\s+", " ", text).strip() return text def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool: """Check if two normalized strings match, allowing singular/plural differences. Fast path: exact match after normalization. Slow path: ask an LLM if the two keywords are the same aside from plural form. Falls back to False if no llm_check is provided and strings differ. """ if not a or not b: return False if a == b: return True if llm_check is None: return False # Only call LLM when keywords share most words (possible plural difference). words_a = set(a.split()) words_b = set(b.split()) shared = words_a & words_b total = max(len(words_a), len(words_b)) if total > 1 and len(shared) < total - 1: return False return llm_check(a, b) def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None: """Mark a ClickUp task as completed.""" if not task_id or not ctx: return config = ctx.get("config") skill_map = config.clickup.skill_map if config else {} lb_map = skill_map.get("Link Building", {}) complete_status = status or lb_map.get("complete_status", "complete") cu_client = _get_clickup_client(ctx) if cu_client: try: cu_client.add_comment(task_id, message) cu_client.update_task_status(task_id, complete_status) except Exception as e: log.warning("ClickUp completion failed for %s: %s", task_id, e) finally: cu_client.close() def _fail_clickup_task(ctx: dict | None, task_id: str, error_msg: str) -> None: """Mark a ClickUp task as failed.""" if not task_id or not ctx: return config = ctx.get("config") error_status = config.clickup.error_status if config else "error" cu_client = _get_clickup_client(ctx) if cu_client: try: cu_client.add_comment( task_id, f"[FAILED]Link building pipeline failed.\n\nError: {error_msg[:2000]}", ) cu_client.update_task_status(task_id, error_status) except Exception as e: log.warning("ClickUp failure update failed for %s: %s", task_id, e) finally: cu_client.close() # --------------------------------------------------------------------------- # Public tools # --------------------------------------------------------------------------- @tool( "run_link_building", description=( "Orchestrator for link building pipelines. Reads the LB Method and " "routes to the correct pipeline tool (e.g., run_cora_backlinks for " "'Cora Backlinks'). Use when a ClickUp task or chat command requests " "link building without specifying the exact pipeline." ), category="linkbuilding", ) def run_link_building( lb_method: str = "", xlsx_path: str = "", project_name: str = "", money_site_url: str = "", branded_plus_ratio: float = 0.7, custom_anchors: str = "", cli_flags: str = "", ctx: dict | None = None, ) -> str: """Dispatch to the correct link building pipeline based on lb_method.""" method = (lb_method or "").strip() if not method: return ( "Skipped: 'LB Method' field is empty. Each Link Building task must have " "an LB Method set (e.g. 'Cora Backlinks') before processing can begin." ) if method == "Cora Backlinks": # For Cora Backlinks, xlsx_path is required if not xlsx_path: return ( "Skipped: Cora Backlinks requires an xlsx_path. " "The folder watcher will trigger this pipeline when a .xlsx " "file appears in the watch folder. Or provide xlsx_path explicitly." ) return run_cora_backlinks( xlsx_path=xlsx_path, project_name=project_name, money_site_url=money_site_url, branded_plus_ratio=branded_plus_ratio, custom_anchors=custom_anchors, cli_flags=cli_flags, ctx=ctx, ) else: return f"Unknown LB Method: '{method}'. Supported methods: Cora Backlinks" @tool( "run_cora_backlinks", description=( "Full Cora Backlinks pipeline: ingests a CORA .xlsx report via " "Big-Link-Man's ingest-cora command, then runs generate-batch to " "produce content. Requires xlsx_path and project_name. Optionally " "syncs with ClickUp task." ), category="linkbuilding", ) def run_cora_backlinks( xlsx_path: str, project_name: str, money_site_url: str = "", branded_plus_ratio: float = 0.7, custom_anchors: str = "", cli_flags: str = "", ctx: dict | None = None, ) -> str: """Run the full Cora Backlinks pipeline: ingest-cora → generate-batch.""" if not xlsx_path: return "Error: xlsx_path is required for Cora Backlinks pipeline." if not project_name: return "Error: project_name is required for Cora Backlinks pipeline." if not money_site_url: return ( "Error: money_site_url (IMSURL) is required for Cora Backlinks pipeline. " "Set the IMSURL custom field on the ClickUp task before processing." ) blm_dir = _get_blm_dir(ctx) # Check if xlsx file exists xlsx = Path(xlsx_path) if not xlsx.exists(): return f"Error: CORA file not found: {xlsx_path}" # Try to find matching ClickUp task clickup_task_id = "" if ctx: clickup_task_id = ctx.get("clickup_task_id", "") if not clickup_task_id: # Auto-lookup from keyword (derive from project name) clickup_task_id = _find_clickup_task(ctx, project_name) output_parts = [] # ── Step 1: ingest-cora ── _set_status(ctx, f"Step 1/2: Ingesting CORA report for {project_name}...") if clickup_task_id: _sync_clickup(ctx, clickup_task_id, "ingest", "[STARTED]Starting Cora Backlinks pipeline...") # Convert branded_plus_ratio from string if needed try: bp_ratio = float(branded_plus_ratio) if branded_plus_ratio else 0.7 except (ValueError, TypeError): bp_ratio = 0.7 ingest_args = _build_ingest_args( xlsx_path=xlsx_path, project_name=project_name, money_site_url=money_site_url, branded_plus_ratio=bp_ratio, custom_anchors=custom_anchors, cli_flags=cli_flags, ) blm_timeout = _get_blm_timeout(ctx) try: ingest_result = _run_blm_command(ingest_args, blm_dir, timeout=blm_timeout) except subprocess.TimeoutExpired: error = f"ingest-cora timed out after {blm_timeout // 60} minutes" _set_status(ctx, "") if clickup_task_id: _fail_clickup_task(ctx, clickup_task_id, error) return f"Error: {error}" ingest_parsed = _parse_ingest_output(ingest_result.stdout) if ingest_result.returncode != 0 or not ingest_parsed["job_file"]: error = ( f"ingest-cora failed (exit code {ingest_result.returncode}).\n" f"stdout: {ingest_result.stdout[-500:]}\n" f"stderr: {ingest_result.stderr[-500:]}" ) _set_status(ctx, "") if clickup_task_id: _fail_clickup_task(ctx, clickup_task_id, error) return f"Error: {error}" project_id = ingest_parsed["project_id"] job_file = ingest_parsed["job_file"] output_parts.append("## Step 1: Ingest CORA Report") output_parts.append(f"- Project: {project_name} (ID: {project_id})") output_parts.append(f"- Keyword: {ingest_parsed['main_keyword']}") output_parts.append(f"- Job file: {job_file}") output_parts.append("") if clickup_task_id: _sync_clickup( ctx, clickup_task_id, "ingest_done", f"[DONE]CORA report ingested. Project ID: {project_id}. Job file: {job_file}", ) # ── Step 2: generate-batch ── _set_status(ctx, f"Step 2/2: Generating content batch for {project_name}...") # Build the job file path (may be relative to BLM dir) job_path = Path(blm_dir) / job_file if not Path(job_file).is_absolute() else Path(job_file) gen_args = ["generate-batch", "-j", str(job_path), "--continue-on-error"] try: gen_result = _run_blm_command(gen_args, blm_dir, timeout=blm_timeout) except subprocess.TimeoutExpired: error = f"generate-batch timed out after {blm_timeout // 60} minutes" _set_status(ctx, "") if clickup_task_id: _fail_clickup_task(ctx, clickup_task_id, error) return "\n".join(output_parts) + f"\n\nError: {error}" gen_parsed = _parse_generate_output(gen_result.stdout) if gen_result.returncode != 0: error = ( f"generate-batch failed (exit code {gen_result.returncode}).\n" f"stdout: {gen_result.stdout[-500:]}\n" f"stderr: {gen_result.stderr[-500:]}" ) _set_status(ctx, "") if clickup_task_id: _fail_clickup_task(ctx, clickup_task_id, error) return "\n".join(output_parts) + f"\n\nError: {error}" output_parts.append("## Step 2: Generate Content Batch") output_parts.append(f"- Status: {'Success' if gen_parsed['success'] else 'Completed'}") if gen_parsed["job_moved_to"]: output_parts.append(f"- Job moved to: {gen_parsed['job_moved_to']}") output_parts.append("") # ── Completion ── _set_status(ctx, "") if clickup_task_id: summary = ( f"[DONE]Cora Backlinks pipeline completed for {project_name}.\n\n" f"Project ID: {project_id}\n" f"Keyword: {ingest_parsed['main_keyword']}\n" f"Job file: {gen_parsed['job_moved_to'] or job_file}" ) _complete_clickup_task(ctx, clickup_task_id, summary) output_parts.append("## ClickUp Sync") output_parts.append(f"- Task `{clickup_task_id}` completed") output_parts.append("- Status set to 'complete'") return "\n".join(output_parts) @tool( "blm_ingest_cora", description=( "Standalone CORA ingest: runs Big-Link-Man's ingest-cora command " "to parse a CORA .xlsx report and create a project. Returns the " "project ID and job file path without running generate-batch." ), category="linkbuilding", ) def blm_ingest_cora( xlsx_path: str, project_name: str, money_site_url: str = "", branded_plus_ratio: float = 0.7, custom_anchors: str = "", cli_flags: str = "", ctx: dict | None = None, ) -> str: """Run ingest-cora only and return project ID + job file path.""" if not xlsx_path: return "Error: xlsx_path is required." if not project_name: return "Error: project_name is required." blm_dir = _get_blm_dir(ctx) xlsx = Path(xlsx_path) if not xlsx.exists(): return f"Error: CORA file not found: {xlsx_path}" try: bp_ratio = float(branded_plus_ratio) if branded_plus_ratio else 0.7 except (ValueError, TypeError): bp_ratio = 0.7 ingest_args = _build_ingest_args( xlsx_path=xlsx_path, project_name=project_name, money_site_url=money_site_url, branded_plus_ratio=bp_ratio, custom_anchors=custom_anchors, cli_flags=cli_flags, ) blm_timeout = _get_blm_timeout(ctx) try: result = _run_blm_command(ingest_args, blm_dir, timeout=blm_timeout) except subprocess.TimeoutExpired: return f"Error: ingest-cora timed out after {blm_timeout // 60} minutes." parsed = _parse_ingest_output(result.stdout) if result.returncode != 0 or not parsed["job_file"]: return ( f"Error: ingest-cora failed (exit code {result.returncode}).\n" f"stdout: {result.stdout[-500:]}\n" f"stderr: {result.stderr[-500:]}" ) return ( f"CORA ingest complete.\n\n" f"- Project: {parsed['project_name']} (ID: {parsed['project_id']})\n" f"- Keyword: {parsed['main_keyword']}\n" f"- Job file: {parsed['job_file']}\n\n" f"Run `blm_generate_batch` with this job file to generate content." ) @tool( "blm_generate_batch", description=( "Standalone content generation: runs Big-Link-Man's generate-batch " "command on an existing job file. Use after ingest-cora or for " "re-running generation on a manually created job." ), category="linkbuilding", ) def blm_generate_batch( job_file: str, continue_on_error: bool = True, debug: bool = False, ctx: dict | None = None, ) -> str: """Run generate-batch on an existing job file.""" if not job_file: return "Error: job_file is required." blm_dir = _get_blm_dir(ctx) job_path = Path(blm_dir) / job_file if not Path(job_file).is_absolute() else Path(job_file) if not job_path.exists(): return f"Error: Job file not found: {job_path}" args = ["generate-batch", "-j", str(job_path)] if continue_on_error: args.append("--continue-on-error") if debug: args.append("--debug") blm_timeout = _get_blm_timeout(ctx) try: result = _run_blm_command(args, blm_dir, timeout=blm_timeout) except subprocess.TimeoutExpired: return f"Error: generate-batch timed out after {blm_timeout // 60} minutes." parsed = _parse_generate_output(result.stdout) if result.returncode != 0: return ( f"Error: generate-batch failed (exit code {result.returncode}).\n" f"stdout: {result.stdout[-500:]}\n" f"stderr: {result.stderr[-500:]}" ) output = "Content generation complete.\n\n" output += f"- Status: {'Success' if parsed['success'] else 'Completed'}\n" if parsed["job_moved_to"]: output += f"- Job moved to: {parsed['job_moved_to']}\n" return output @tool( "scan_cora_folder", description=( "Scan the Cora inbox watch folder for .xlsx files and report " "their processing status. Shows which files are new, processed, " "or failed, and whether they match a ClickUp task." ), category="linkbuilding", ) def scan_cora_folder(ctx: dict | None = None) -> str: """Scan the watch folder and return status of .xlsx files.""" if not ctx or "config" not in ctx: return "Error: scan_cora_folder requires agent context." config = ctx["config"] watch_folder = config.link_building.watch_folder if not watch_folder: return "Watch folder not configured (link_building.watch_folder is empty)." watch_path = Path(watch_folder) if not watch_path.exists(): return f"Watch folder does not exist: {watch_folder}" xlsx_files = sorted(watch_path.glob("*.xlsx")) if not xlsx_files: return f"No .xlsx files found in {watch_folder}." lines = [f"## Cora Inbox: {watch_folder}\n"] processed_dir = watch_path / "processed" processed_names = set() if processed_dir.exists(): processed_names = {f.name for f in processed_dir.glob("*.xlsx")} for f in xlsx_files: filename = f.name if filename.startswith("~$"): continue status = "processed" if filename in processed_names else "new" lines.append(f"- **{filename}** — status: {status}") # Check processed subfolder processed_dir = watch_path / "processed" if processed_dir.exists(): processed = list(processed_dir.glob("*.xlsx")) if processed: lines.append(f"\n### Processed ({len(processed)} files)") for f in processed[:10]: lines.append(f"- {f.name}") if len(processed) > 10: lines.append(f"- ... and {len(processed) - 10} more") return "\n".join(lines) @tool( "setup_linkbuilding_fields", description=( "One-time setup tool: creates the required ClickUp custom fields " "(LB Method, Keyword, CoraFile, etc.) across all lists in the space. " "Safe to re-run — skips fields that already exist." ), category="linkbuilding", ) def setup_linkbuilding_fields(ctx: dict | None = None) -> str: """Create link building custom fields in ClickUp.""" if not ctx or "config" not in ctx: return "Error: requires agent context." config = ctx["config"] if not config.clickup.enabled: return "Error: ClickUp integration not enabled." cu_client = _get_clickup_client(ctx) if not cu_client: return "Error: could not create ClickUp client." try: space_id = config.clickup.space_id list_ids = cu_client.get_list_ids_from_space(space_id) if not list_ids: return f"No lists found in space {space_id}." fields_to_create = [ { "name": "LB Method", "type": "drop_down", "type_config": { "options": [ {"name": "Cora Backlinks", "color": "#04A9F4"}, ] }, }, {"name": "Keyword", "type": "short_text"}, {"name": "CoraFile", "type": "short_text"}, {"name": "CustomAnchors", "type": "short_text"}, {"name": "BrandedPlusRatio", "type": "short_text"}, {"name": "CLIFlags", "type": "short_text"}, ] results = [] for list_id in list_ids: existing = cu_client.get_custom_fields(list_id) existing_names = {f.get("name") for f in existing} for field_def in fields_to_create: if field_def["name"] in existing_names: continue try: cu_client.create_custom_field( list_id, field_def["name"], field_def["type"], field_def.get("type_config"), ) results.append(f"Created '{field_def['name']}' in list {list_id}") except Exception as e: results.append(f"Failed to create '{field_def['name']}' in list {list_id}: {e}") if not results: return "All fields already exist in all lists." return "## Setup Results\n\n" + "\n".join(f"- {r}" for r in results) finally: cu_client.close()