# CheddahBot/cheddahbot/tools/linkbuilding.py

"""Link building pipeline tools.
Orchestrates the Big-Link-Man CLI tool for automated link building.
Primary workflow: ingest CORA .xlsx → generate content batch.
"""
from __future__ import annotations
import logging
import os
import re
import subprocess
from collections.abc import Callable
from pathlib import Path
from . import tool
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------
def _get_blm_dir(ctx: dict | None) -> str:
"""Resolve the Big-Link-Man directory from config or env."""
if ctx and "config" in ctx:
return ctx["config"].link_building.blm_dir
return os.getenv("BLM_DIR", "E:/dev/Big-Link-Man")
def _get_blm_timeout(ctx: dict | None) -> int:
"""Get BLM subprocess timeout from config or default (1800s / 30 min)."""
if ctx and "config" in ctx:
return ctx["config"].timeouts.blm
return 1800
def _run_blm_command(
args: list[str], blm_dir: str, timeout: int = 1800
) -> subprocess.CompletedProcess:
"""Run a Big-Link-Man CLI command via subprocess.
Always injects -u/-p from BLM_USERNAME/BLM_PASSWORD env vars.
"""
# Use BLM's own venv Python so its dependencies are available
venv_python = Path(blm_dir) / ".venv" / "Scripts" / "python.exe"
if not venv_python.exists():
# Fallback for Linux/Mac
venv_python = Path(blm_dir) / ".venv" / "bin" / "python"
if not venv_python.exists():
raise FileNotFoundError(
f"No .venv found in {blm_dir}. External tools must have their own venv."
)
cmd = [str(venv_python), "main.py", *args]
# Inject credentials from env vars
username = os.getenv("BLM_USERNAME", "")
password = os.getenv("BLM_PASSWORD", "")
if username and "-u" not in args and "--username" not in args:
cmd.extend(["-u", username])
if password and "-p" not in args and "--password" not in args:
cmd.extend(["-p", password])
log.info("Running BLM command: %s (cwd=%s)", " ".join(cmd), blm_dir)
result = subprocess.run(
cmd,
cwd=blm_dir,
capture_output=True,
text=True,
timeout=timeout,
)
log.info("BLM exit code: %d", result.returncode)
if result.stdout:
log.debug("BLM stdout: %s", result.stdout[:1000])
if result.stderr:
log.debug("BLM stderr: %s", result.stderr[:1000])
return result
def _build_ingest_args(
xlsx_path: str,
project_name: str,
money_site_url: str = "",
branded_plus_ratio: float = 0.7,
custom_anchors: str = "",
cli_flags: str = "",
) -> list[str]:
"""Construct CLI argument list for ingest-cora command."""
args = ["ingest-cora", "-f", xlsx_path, "-n", project_name]
if money_site_url:
args.extend(["-m", money_site_url])
if branded_plus_ratio and branded_plus_ratio != 0.7:
args.extend(["-bp", str(branded_plus_ratio)])
if custom_anchors:
args.extend(["-a", custom_anchors])
# Parse any additional CLI flags
if cli_flags:
extra = cli_flags.strip().split()
args.extend(extra)
return args
def _parse_ingest_output(stdout: str) -> dict:
"""Parse ingest-cora stdout to extract project_id and job_file path.
Returns dict with keys: project_id, job_file, project_name, main_keyword
"""
result = {
"project_id": "",
"job_file": "",
"project_name": "",
"main_keyword": "",
}
for line in stdout.splitlines():
line = line.strip()
# Success: Project 'My Project' created (ID: 42)
m = re.match(r"^Success: Project '(.+)' created \(ID: (\d+)\)$", line)
if m:
result["project_name"] = m.group(1)
result["project_id"] = m.group(2)
continue
# Job file created: jobs/my-project.json
m = re.match(r"^Job file created: (.+)$", line)
if m:
result["job_file"] = m.group(1).strip()
continue
# Main Keyword: precision cnc machining
m = re.match(r"^Main Keyword: (.+)$", line)
if m:
result["main_keyword"] = m.group(1).strip()
continue
return result
def _parse_generate_output(stdout: str) -> dict:
"""Parse generate-batch stdout to extract completion stats.
Returns dict with keys: job_moved_to, success (bool), raw_output
"""
result = {
"job_moved_to": "",
"success": False,
"raw_output": stdout,
}
for line in stdout.splitlines():
line = line.strip()
# Job file moved to: jobs/done/my-project.json
m = re.match(r"^Job file moved to: (.+)$", line)
if m:
result["job_moved_to"] = m.group(1).strip()
result["success"] = True
continue
return result
def _set_status(ctx: dict | None, message: str) -> None:
"""Log pipeline progress. Previously wrote to KV; now just logs."""
if message:
log.info("[LB Pipeline] %s", message)
def _get_clickup_client(ctx: dict | None):
"""Create a ClickUpClient from tool context, or None if unavailable."""
if not ctx or not ctx.get("config") or not ctx["config"].clickup.enabled:
return None
try:
from ..clickup import ClickUpClient
config = ctx["config"]
return ClickUpClient(
api_token=config.clickup.api_token,
workspace_id=config.clickup.workspace_id,
task_type_field_name=config.clickup.task_type_field_name,
)
except Exception as e:
log.warning("Could not create ClickUp client: %s", e)
return None
def _sync_clickup(ctx: dict | None, task_id: str, step: str, message: str) -> None:
"""Post a progress comment to ClickUp."""
if not task_id or not ctx:
return
cu_client = _get_clickup_client(ctx)
if cu_client:
try:
cu_client.add_comment(task_id, message)
except Exception as e:
log.warning("ClickUp comment failed for task %s: %s", task_id, e)
finally:
cu_client.close()
def _find_clickup_task(ctx: dict, keyword: str) -> str:
    """Find a ClickUp Link Building task matching the given keyword.
    Looks for "to do" tasks with Work Category == "Link Building" and
    the Keyword custom field fuzzy-matching the keyword param.
    Returns task_id if found, else "".
    """
    cu_client = _get_clickup_client(ctx)
    if not cu_client:
        return ""
    config = ctx.get("config")
    if not config or not config.clickup.space_id:
        return ""
    try:
        tasks = cu_client.get_tasks_from_space(
            config.clickup.space_id,
            statuses=["to do"],
        )
    except Exception as e:
        log.warning("ClickUp query failed in _find_clickup_task: %s", e)
        return ""
    finally:
        # Client is only needed for the query; close unconditionally
        # (even on the except-path return above).
        cu_client.close()
    keyword_norm = _normalize_for_match(keyword)
    for task in tasks:
        if task.task_type != "Link Building":
            continue
        task_keyword = task.custom_fields.get("Keyword", "")
        if not task_keyword:
            continue
        # NOTE: no llm_check is passed, so _fuzzy_keyword_match reduces to an
        # exact comparison of the normalized strings here.
        if _fuzzy_keyword_match(keyword_norm, _normalize_for_match(str(task_keyword))):
            # Found a match — move to "automation underway"
            task_id = task.id
            # Move to "automation underway"
            # A second client is required because the first was closed above.
            cu_client2 = _get_clickup_client(ctx)
            if cu_client2:
                try:
                    cu_client2.update_task_status(task_id, config.clickup.automation_status)
                except Exception as e:
                    # Status update is best-effort; the matched id is still returned.
                    log.warning("Failed to update ClickUp status for %s: %s", task_id, e)
                finally:
                    cu_client2.close()
            log.info("Auto-matched ClickUp task %s for keyword '%s'", task_id, keyword)
            return task_id
    return ""
def _normalize_for_match(text: str) -> str:
"""Normalize text for fuzzy matching: lowercase, strip non-alnum, collapse spaces."""
text = text.lower().strip()
text = re.sub(r"[^a-z0-9\s]", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool:
"""Check if two normalized strings match, allowing singular/plural differences.
Fast path: exact match after normalization.
Slow path: ask an LLM if the two keywords are the same aside from plural form.
Falls back to False if no llm_check is provided and strings differ.
"""
if not a or not b:
return False
if a == b:
return True
if llm_check is None:
return False
# Only call LLM when keywords share most words (possible plural difference).
words_a = set(a.split())
words_b = set(b.split())
shared = words_a & words_b
total = max(len(words_a), len(words_b))
if total > 1 and len(shared) < total - 1:
return False
return llm_check(a, b)
def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None:
"""Mark a ClickUp task as completed."""
if not task_id or not ctx:
return
config = ctx.get("config")
skill_map = config.clickup.skill_map if config else {}
lb_map = skill_map.get("Link Building", {})
complete_status = status or lb_map.get("complete_status", "complete")
cu_client = _get_clickup_client(ctx)
if cu_client:
try:
cu_client.add_comment(task_id, message)
cu_client.update_task_status(task_id, complete_status)
except Exception as e:
log.warning("ClickUp completion failed for %s: %s", task_id, e)
finally:
cu_client.close()
def _fail_clickup_task(ctx: dict | None, task_id: str, error_msg: str) -> None:
"""Mark a ClickUp task as failed."""
if not task_id or not ctx:
return
config = ctx.get("config")
error_status = config.clickup.error_status if config else "error"
cu_client = _get_clickup_client(ctx)
if cu_client:
try:
cu_client.add_comment(
task_id,
f"[FAILED]Link building pipeline failed.\n\nError: {error_msg[:2000]}",
)
cu_client.update_task_status(task_id, error_status)
except Exception as e:
log.warning("ClickUp failure update failed for %s: %s", task_id, e)
finally:
cu_client.close()
# ---------------------------------------------------------------------------
# Public tools
# ---------------------------------------------------------------------------
@tool(
    "run_link_building",
    description=(
        "Orchestrator for link building pipelines. Reads the LB Method and "
        "routes to the correct pipeline tool (e.g., run_cora_backlinks for "
        "'Cora Backlinks'). Use when a ClickUp task or chat command requests "
        "link building without specifying the exact pipeline."
    ),
    category="linkbuilding",
)
def run_link_building(
    lb_method: str = "",
    xlsx_path: str = "",
    project_name: str = "",
    money_site_url: str = "",
    branded_plus_ratio: float = 0.7,
    custom_anchors: str = "",
    cli_flags: str = "",
    ctx: dict | None = None,
) -> str:
    """Route a link building request to the pipeline named by lb_method.

    Currently "Cora Backlinks" is the only supported method; it requires
    xlsx_path and is delegated to run_cora_backlinks with all arguments
    passed through unchanged.
    """
    method = (lb_method or "").strip()
    # Guard: every task must declare its LB Method before routing.
    if not method:
        return (
            "Skipped: 'LB Method' field is empty. Each Link Building task must have "
            "an LB Method set (e.g. 'Cora Backlinks') before processing can begin."
        )
    if method != "Cora Backlinks":
        return f"Unknown LB Method: '{method}'. Supported methods: Cora Backlinks"
    # For Cora Backlinks, xlsx_path is required
    if not xlsx_path:
        return (
            "Skipped: Cora Backlinks requires an xlsx_path. "
            "The folder watcher will trigger this pipeline when a .xlsx "
            "file appears in the watch folder. Or provide xlsx_path explicitly."
        )
    return run_cora_backlinks(
        xlsx_path=xlsx_path,
        project_name=project_name,
        money_site_url=money_site_url,
        branded_plus_ratio=branded_plus_ratio,
        custom_anchors=custom_anchors,
        cli_flags=cli_flags,
        ctx=ctx,
    )
@tool(
    "run_cora_backlinks",
    description=(
        "Full Cora Backlinks pipeline: ingests a CORA .xlsx report via "
        "Big-Link-Man's ingest-cora command, then runs generate-batch to "
        "produce content. Requires xlsx_path and project_name. Optionally "
        "syncs with ClickUp task."
    ),
    category="linkbuilding",
)
def run_cora_backlinks(
    xlsx_path: str,
    project_name: str,
    money_site_url: str = "",
    branded_plus_ratio: float = 0.7,
    custom_anchors: str = "",
    cli_flags: str = "",
    ctx: dict | None = None,
) -> str:
    """Run the full Cora Backlinks pipeline: ingest-cora → generate-batch.

    Args:
        xlsx_path: Path to the CORA .xlsx report; must exist on disk.
        project_name: BLM project name (also used to auto-match a ClickUp task).
        money_site_url: Money-site URL (IMSURL); required by this pipeline.
        branded_plus_ratio: Branded+ anchor ratio; coerced to float (0.7 fallback).
        custom_anchors: Optional anchors string forwarded to ingest-cora.
        cli_flags: Extra whitespace-separated CLI flags for ingest-cora.
        ctx: Tool context with config and optionally a "clickup_task_id".

    Returns:
        A markdown summary of both steps on success, or a string containing
        "Error: ..." on failure (step-1 output is prepended when step 1
        already succeeded). ClickUp task status is updated on both paths.
    """
    # --- Input validation: fail fast with actionable messages ---
    if not xlsx_path:
        return "Error: xlsx_path is required for Cora Backlinks pipeline."
    if not project_name:
        return "Error: project_name is required for Cora Backlinks pipeline."
    if not money_site_url:
        return (
            "Error: money_site_url (IMSURL) is required for Cora Backlinks pipeline. "
            "Set the IMSURL custom field on the ClickUp task before processing."
        )
    blm_dir = _get_blm_dir(ctx)
    # Check if xlsx file exists
    xlsx = Path(xlsx_path)
    if not xlsx.exists():
        return f"Error: CORA file not found: {xlsx_path}"
    # Try to find matching ClickUp task
    clickup_task_id = ""
    if ctx:
        clickup_task_id = ctx.get("clickup_task_id", "")
        if not clickup_task_id:
            # Auto-lookup from keyword (derive from project name)
            clickup_task_id = _find_clickup_task(ctx, project_name)
    output_parts = []
    # ── Step 1: ingest-cora ──
    _set_status(ctx, f"Step 1/2: Ingesting CORA report for {project_name}...")
    if clickup_task_id:
        _sync_clickup(ctx, clickup_task_id, "ingest", "[STARTED]Starting Cora Backlinks pipeline...")
    # Convert branded_plus_ratio from string if needed
    try:
        bp_ratio = float(branded_plus_ratio) if branded_plus_ratio else 0.7
    except (ValueError, TypeError):
        bp_ratio = 0.7
    ingest_args = _build_ingest_args(
        xlsx_path=xlsx_path,
        project_name=project_name,
        money_site_url=money_site_url,
        branded_plus_ratio=bp_ratio,
        custom_anchors=custom_anchors,
        cli_flags=cli_flags,
    )
    blm_timeout = _get_blm_timeout(ctx)
    try:
        ingest_result = _run_blm_command(ingest_args, blm_dir, timeout=blm_timeout)
    except subprocess.TimeoutExpired:
        error = f"ingest-cora timed out after {blm_timeout // 60} minutes"
        # Clear the progress status before reporting failure.
        _set_status(ctx, "")
        if clickup_task_id:
            _fail_clickup_task(ctx, clickup_task_id, error)
        return f"Error: {error}"
    ingest_parsed = _parse_ingest_output(ingest_result.stdout)
    # Treat a missing job_file as failure even when the exit code is 0.
    if ingest_result.returncode != 0 or not ingest_parsed["job_file"]:
        error = (
            f"ingest-cora failed (exit code {ingest_result.returncode}).\n"
            f"stdout: {ingest_result.stdout[-500:]}\n"
            f"stderr: {ingest_result.stderr[-500:]}"
        )
        _set_status(ctx, "")
        if clickup_task_id:
            _fail_clickup_task(ctx, clickup_task_id, error)
        return f"Error: {error}"
    project_id = ingest_parsed["project_id"]
    job_file = ingest_parsed["job_file"]
    output_parts.append("## Step 1: Ingest CORA Report")
    output_parts.append(f"- Project: {project_name} (ID: {project_id})")
    output_parts.append(f"- Keyword: {ingest_parsed['main_keyword']}")
    output_parts.append(f"- Job file: {job_file}")
    output_parts.append("")
    if clickup_task_id:
        _sync_clickup(
            ctx,
            clickup_task_id,
            "ingest_done",
            f"[DONE]CORA report ingested. Project ID: {project_id}. Job file: {job_file}",
        )
    # ── Step 2: generate-batch ──
    _set_status(ctx, f"Step 2/2: Generating content batch for {project_name}...")
    # Build the job file path (may be relative to BLM dir)
    job_path = Path(blm_dir) / job_file if not Path(job_file).is_absolute() else Path(job_file)
    gen_args = ["generate-batch", "-j", str(job_path), "--continue-on-error"]
    try:
        gen_result = _run_blm_command(gen_args, blm_dir, timeout=blm_timeout)
    except subprocess.TimeoutExpired:
        error = f"generate-batch timed out after {blm_timeout // 60} minutes"
        _set_status(ctx, "")
        if clickup_task_id:
            _fail_clickup_task(ctx, clickup_task_id, error)
        # Keep the step-1 summary in the error report.
        return "\n".join(output_parts) + f"\n\nError: {error}"
    gen_parsed = _parse_generate_output(gen_result.stdout)
    if gen_result.returncode != 0:
        error = (
            f"generate-batch failed (exit code {gen_result.returncode}).\n"
            f"stdout: {gen_result.stdout[-500:]}\n"
            f"stderr: {gen_result.stderr[-500:]}"
        )
        _set_status(ctx, "")
        if clickup_task_id:
            _fail_clickup_task(ctx, clickup_task_id, error)
        return "\n".join(output_parts) + f"\n\nError: {error}"
    output_parts.append("## Step 2: Generate Content Batch")
    # "Success" only when the job-moved marker was parsed from stdout.
    output_parts.append(f"- Status: {'Success' if gen_parsed['success'] else 'Completed'}")
    if gen_parsed["job_moved_to"]:
        output_parts.append(f"- Job moved to: {gen_parsed['job_moved_to']}")
    output_parts.append("")
    # ── Completion ──
    _set_status(ctx, "")
    if clickup_task_id:
        summary = (
            f"[DONE]Cora Backlinks pipeline completed for {project_name}.\n\n"
            f"Project ID: {project_id}\n"
            f"Keyword: {ingest_parsed['main_keyword']}\n"
            f"Job file: {gen_parsed['job_moved_to'] or job_file}"
        )
        _complete_clickup_task(ctx, clickup_task_id, summary)
        output_parts.append("## ClickUp Sync")
        output_parts.append(f"- Task `{clickup_task_id}` completed")
        output_parts.append("- Status set to 'complete'")
    return "\n".join(output_parts)
@tool(
    "blm_ingest_cora",
    description=(
        "Standalone CORA ingest: runs Big-Link-Man's ingest-cora command "
        "to parse a CORA .xlsx report and create a project. Returns the "
        "project ID and job file path without running generate-batch."
    ),
    category="linkbuilding",
)
def blm_ingest_cora(
    xlsx_path: str,
    project_name: str,
    money_site_url: str = "",
    branded_plus_ratio: float = 0.7,
    custom_anchors: str = "",
    cli_flags: str = "",
    ctx: dict | None = None,
) -> str:
    """Run ingest-cora only and report the new project ID + job file path."""
    # Validate required inputs up front.
    if not xlsx_path:
        return "Error: xlsx_path is required."
    if not project_name:
        return "Error: project_name is required."
    blm_dir = _get_blm_dir(ctx)
    if not Path(xlsx_path).exists():
        return f"Error: CORA file not found: {xlsx_path}"
    # Coerce the ratio defensively; callers may pass it as a string.
    try:
        ratio = float(branded_plus_ratio) if branded_plus_ratio else 0.7
    except (ValueError, TypeError):
        ratio = 0.7
    argv = _build_ingest_args(
        xlsx_path=xlsx_path,
        project_name=project_name,
        money_site_url=money_site_url,
        branded_plus_ratio=ratio,
        custom_anchors=custom_anchors,
        cli_flags=cli_flags,
    )
    timeout_s = _get_blm_timeout(ctx)
    try:
        proc = _run_blm_command(argv, blm_dir, timeout=timeout_s)
    except subprocess.TimeoutExpired:
        return f"Error: ingest-cora timed out after {timeout_s // 60} minutes."
    parsed = _parse_ingest_output(proc.stdout)
    # A missing job_file means the ingest did not fully succeed, even on exit 0.
    if proc.returncode != 0 or not parsed["job_file"]:
        return (
            f"Error: ingest-cora failed (exit code {proc.returncode}).\n"
            f"stdout: {proc.stdout[-500:]}\n"
            f"stderr: {proc.stderr[-500:]}"
        )
    return (
        f"CORA ingest complete.\n\n"
        f"- Project: {parsed['project_name']} (ID: {parsed['project_id']})\n"
        f"- Keyword: {parsed['main_keyword']}\n"
        f"- Job file: {parsed['job_file']}\n\n"
        f"Run `blm_generate_batch` with this job file to generate content."
    )
@tool(
    "blm_generate_batch",
    description=(
        "Standalone content generation: runs Big-Link-Man's generate-batch "
        "command on an existing job file. Use after ingest-cora or for "
        "re-running generation on a manually created job."
    ),
    category="linkbuilding",
)
def blm_generate_batch(
    job_file: str,
    continue_on_error: bool = True,
    debug: bool = False,
    ctx: dict | None = None,
) -> str:
    """Run BLM's generate-batch on an existing job file and summarize the result."""
    if not job_file:
        return "Error: job_file is required."
    blm_dir = _get_blm_dir(ctx)
    # Relative job paths are resolved against the BLM checkout directory.
    candidate = Path(job_file)
    job_path = candidate if candidate.is_absolute() else Path(blm_dir) / job_file
    if not job_path.exists():
        return f"Error: Job file not found: {job_path}"
    argv = ["generate-batch", "-j", str(job_path)]
    if continue_on_error:
        argv.append("--continue-on-error")
    if debug:
        argv.append("--debug")
    timeout_s = _get_blm_timeout(ctx)
    try:
        proc = _run_blm_command(argv, blm_dir, timeout=timeout_s)
    except subprocess.TimeoutExpired:
        return f"Error: generate-batch timed out after {timeout_s // 60} minutes."
    parsed = _parse_generate_output(proc.stdout)
    if proc.returncode != 0:
        return (
            f"Error: generate-batch failed (exit code {proc.returncode}).\n"
            f"stdout: {proc.stdout[-500:]}\n"
            f"stderr: {proc.stderr[-500:]}"
        )
    summary = [
        "Content generation complete.\n",
        f"- Status: {'Success' if parsed['success'] else 'Completed'}",
    ]
    if parsed["job_moved_to"]:
        summary.append(f"- Job moved to: {parsed['job_moved_to']}")
    return "\n".join(summary) + "\n"
@tool(
    "scan_cora_folder",
    description=(
        "Scan the Cora inbox watch folder for .xlsx files and report "
        "their processing status. Shows which files are new, processed, "
        "or failed, and whether they match a ClickUp task."
    ),
    category="linkbuilding",
)
def scan_cora_folder(ctx: dict | None = None) -> str:
    """Scan the watch folder and return status of .xlsx files.

    Lists every .xlsx in link_building.watch_folder (skipping Excel lock
    files starting with "~$"), marking each as "processed" when a file of
    the same name exists in the processed/ subfolder, else "new". Also
    summarizes up to 10 files already in processed/.

    Returns:
        A markdown report, or an explanatory message when the folder is
        unconfigured, missing, or empty.
    """
    if not ctx or "config" not in ctx:
        return "Error: scan_cora_folder requires agent context."
    config = ctx["config"]
    watch_folder = config.link_building.watch_folder
    if not watch_folder:
        return "Watch folder not configured (link_building.watch_folder is empty)."
    watch_path = Path(watch_folder)
    if not watch_path.exists():
        return f"Watch folder does not exist: {watch_folder}"
    xlsx_files = sorted(watch_path.glob("*.xlsx"))
    if not xlsx_files:
        return f"No .xlsx files found in {watch_folder}."
    lines = [f"## Cora Inbox: {watch_folder}\n"]
    processed_dir = watch_path / "processed"
    processed_names = set()
    if processed_dir.exists():
        processed_names = {f.name for f in processed_dir.glob("*.xlsx")}
    for f in xlsx_files:
        filename = f.name
        # Skip Excel temp/lock files.
        if filename.startswith("~$"):
            continue
        status = "processed" if filename in processed_names else "new"
        # BUG FIX: previously printed the literal "(unknown)" instead of
        # the actual filename.
        lines.append(f"- **{filename}** — status: {status}")
    # Summarize the processed subfolder (processed_dir computed above;
    # the original recomputed it redundantly).
    if processed_dir.exists():
        processed = list(processed_dir.glob("*.xlsx"))
        if processed:
            lines.append(f"\n### Processed ({len(processed)} files)")
            for f in processed[:10]:
                lines.append(f"- {f.name}")
            if len(processed) > 10:
                lines.append(f"- ... and {len(processed) - 10} more")
    return "\n".join(lines)
@tool(
    "setup_linkbuilding_fields",
    description=(
        "One-time setup tool: creates the required ClickUp custom fields "
        "(LB Method, Keyword, CoraFile, etc.) across all lists in the space. "
        "Safe to re-run — skips fields that already exist."
    ),
    category="linkbuilding",
)
def setup_linkbuilding_fields(ctx: dict | None = None) -> str:
    """Create link building custom fields in ClickUp.

    Iterates every list in the configured space and creates each missing
    field from a fixed definition table; fields already present (matched
    by name) are skipped, so re-running is idempotent.

    Returns:
        A markdown report of created/failed fields, a notice that all
        fields already exist, or an "Error: ..." string.
    """
    if not ctx or "config" not in ctx:
        return "Error: requires agent context."
    config = ctx["config"]
    if not config.clickup.enabled:
        return "Error: ClickUp integration not enabled."
    cu_client = _get_clickup_client(ctx)
    if not cu_client:
        return "Error: could not create ClickUp client."
    try:
        space_id = config.clickup.space_id
        list_ids = cu_client.get_list_ids_from_space(space_id)
        if not list_ids:
            return f"No lists found in space {space_id}."
        # Field definitions to ensure on every list in the space.
        fields_to_create = [
            {
                "name": "LB Method",
                "type": "drop_down",
                "type_config": {
                    "options": [
                        {"name": "Cora Backlinks", "color": "#04A9F4"},
                    ]
                },
            },
            {"name": "Keyword", "type": "short_text"},
            {"name": "CoraFile", "type": "short_text"},
            {"name": "CustomAnchors", "type": "short_text"},
            {"name": "BrandedPlusRatio", "type": "short_text"},
            {"name": "CLIFlags", "type": "short_text"},
        ]
        results = []
        for list_id in list_ids:
            existing = cu_client.get_custom_fields(list_id)
            existing_names = {f.get("name") for f in existing}
            for field_def in fields_to_create:
                # Skip fields already present on this list (idempotent re-run).
                if field_def["name"] in existing_names:
                    continue
                try:
                    cu_client.create_custom_field(
                        list_id,
                        field_def["name"],
                        field_def["type"],
                        field_def.get("type_config"),
                    )
                    results.append(f"Created '{field_def['name']}' in list {list_id}")
                except Exception as e:
                    # Record per-field failures and keep going.
                    results.append(f"Failed to create '{field_def['name']}' in list {list_id}: {e}")
        if not results:
            return "All fields already exist in all lists."
        return "## Setup Results\n\n" + "\n".join(f"- {r}" for r in results)
    finally:
        # Always release the client, even on early returns above.
        cu_client.close()