263 lines
7.4 KiB
Python
263 lines
7.4 KiB
Python
"""Big-Link-Man CLI runner.
|
|
|
|
Runs ingest-cora and generate-batch via BLM's own venv Python.
|
|
Ported from cheddahbot/tools/linkbuilding.py for headless use.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class IngestResult:
    """Outcome of one ``ingest-cora`` invocation.

    Only ``success`` is required; every other field defaults to an empty
    string so a failure can be reported with just ``success`` and ``error``.
    """

    # True only when the run was recognised as successful.
    success: bool
    # Numeric project ID, kept as the string printed by the CLI.
    project_id: str = ""
    # Project name as echoed back by the CLI.
    project_name: str = ""
    # Main keyword reported by the CLI.
    main_keyword: str = ""
    # Path of the job file the CLI reported creating.
    job_file: str = ""
    # Human-readable failure description; empty when nothing went wrong.
    error: str = ""
|
|
|
|
|
|
@dataclass
class GenerateResult:
    """Outcome of one ``generate-batch`` invocation.

    Only ``success`` is required; the string fields default to empty.
    """

    # True only when the run was recognised as successful.
    success: bool
    # Destination the CLI reported moving the job file to.
    job_moved_to: str = ""
    # Human-readable failure description; empty when nothing went wrong.
    error: str = ""
|
|
|
|
|
|
def _resolve_venv_python(blm_dir: str) -> Path:
    """Locate the Python interpreter inside BLM's ``.venv`` directory.

    Args:
        blm_dir: Root directory of the BLM checkout.

    Returns:
        Path to the first interpreter candidate that exists.

    Raises:
        FileNotFoundError: Neither candidate interpreter exists.
    """
    venv_root = Path(blm_dir) / ".venv"
    # Probe order matters: the Windows layout is tried before the
    # Linux/Mac one.
    candidates = (
        venv_root / "Scripts" / "python.exe",
        venv_root / "bin" / "python",
    )
    for interpreter in candidates:
        if interpreter.exists():
            return interpreter
    raise FileNotFoundError(
        "No .venv found in %s. BLM must have its own venv." % blm_dir
    )
|
|
|
|
|
|
def _redact_secrets(cmd: list[str]) -> list[str]:
    """Return a copy of *cmd* with password values masked, for logging only."""
    redacted: list[str] = []
    mask_next = False
    for token in cmd:
        if mask_next:
            redacted.append("***")
            mask_next = False
        elif token in ("-p", "--password"):
            redacted.append(token)
            mask_next = True
        elif token.startswith("--password="):
            redacted.append("--password=***")
        else:
            redacted.append(token)
    return redacted


def _run_blm(
    args: list[str], blm_dir: str, timeout: int = 1800
) -> subprocess.CompletedProcess:
    """Run a BLM CLI command with credential injection.

    The command is ``<venv python> main.py <args...>`` executed with
    ``blm_dir`` as the working directory. ``BLM_USERNAME`` / ``BLM_PASSWORD``
    from the environment are appended as ``-u`` / ``-p`` unless the caller
    already supplied the corresponding flag in ``args``.

    Args:
        args: CLI arguments placed after ``main.py``.
        blm_dir: BLM checkout directory; used to find the venv and as cwd.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The completed process with captured text stdout/stderr.

    Raises:
        FileNotFoundError: No venv interpreter was found under ``blm_dir``.
        subprocess.TimeoutExpired: The process exceeded ``timeout``.
    """
    venv_python = _resolve_venv_python(blm_dir)
    cmd = [str(venv_python), "main.py", *args]

    # Inject credentials from env vars, but never override explicit flags.
    username = os.getenv("BLM_USERNAME", "")
    password = os.getenv("BLM_PASSWORD", "")
    if username and "-u" not in args and "--username" not in args:
        cmd.extend(["-u", username])
    if password and "-p" not in args and "--password" not in args:
        cmd.extend(["-p", password])

    # Log a redacted copy: the real command carries the password in clear
    # text and must not end up in log files.
    log.info(
        "BLM command: %s (cwd=%s)", " ".join(_redact_secrets(cmd)), blm_dir
    )
    result = subprocess.run(
        cmd,
        cwd=blm_dir,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    log.info("BLM exit code: %d", result.returncode)
    # Only the first 2000 characters of each stream are logged.
    if result.stdout:
        log.debug("BLM stdout: %s", result.stdout[:2000])
    if result.stderr:
        log.debug("BLM stderr: %s", result.stderr[:2000])
    return result
|
|
|
|
|
|
def find_cora_xlsx(keyword: str, cora_inbox: str) -> str | None:
|
|
"""Find the Cora xlsx in the inbox directory by keyword match.
|
|
|
|
Looks for files whose name (slugified) matches the keyword.
|
|
Returns the full path or None.
|
|
"""
|
|
inbox = Path(cora_inbox)
|
|
if not inbox.exists():
|
|
log.warning("Cora inbox not found: %s", cora_inbox)
|
|
return None
|
|
|
|
# Slugify keyword for matching: lowercase, spaces -> underscores
|
|
slug = keyword.lower().strip().replace(" ", "_")
|
|
slug = re.sub(r"[^a-z0-9_]", "", slug)
|
|
|
|
# Look for exact match first, then prefix match
|
|
for xlsx in sorted(inbox.glob("*.xlsx"), key=lambda p: p.stat().st_mtime, reverse=True):
|
|
name_lower = xlsx.stem.lower()
|
|
if name_lower == slug:
|
|
return str(xlsx)
|
|
|
|
# Prefix match (keyword slug is prefix of filename)
|
|
for xlsx in sorted(inbox.glob("*.xlsx"), key=lambda p: p.stat().st_mtime, reverse=True):
|
|
name_lower = xlsx.stem.lower()
|
|
if name_lower.startswith(slug):
|
|
return str(xlsx)
|
|
|
|
log.warning("No xlsx matching '%s' in %s", keyword, cora_inbox)
|
|
return None
|
|
|
|
|
|
def build_ingest_args(
    xlsx_path: str,
    project_name: str,
    money_site_url: str = "",
    branded_plus_ratio: str = "",
    custom_anchors: str = "",
    cli_flags: str = "",
) -> list[str]:
    """Build the ingest-cora CLI argument list.

    Args:
        xlsx_path: Path of the Cora xlsx, passed as ``-f``.
        project_name: Project name, passed as ``-n``.
        money_site_url: Passed as ``-m`` when non-empty.
        branded_plus_ratio: Numeric string; passed as ``-bp`` when it parses
            as a float other than 0.7. Unparseable values are ignored with a
            logged warning.
        custom_anchors: Passed as ``-a`` when non-empty.
        cli_flags: Extra flags, split on whitespace and appended verbatim.

    Returns:
        The argument list, starting with ``"ingest-cora"``.
    """
    args = ["ingest-cora", "-f", xlsx_path, "-n", project_name]

    if money_site_url:
        args.extend(["-m", money_site_url])

    if branded_plus_ratio:
        try:
            ratio = float(branded_plus_ratio)
        except (ValueError, TypeError):
            # Best-effort: drop the flag, but leave a trace instead of
            # silently ignoring a mistyped value.
            log.warning(
                "Ignoring invalid branded_plus_ratio %r; -bp not passed",
                branded_plus_ratio,
            )
        else:
            # 0.7 is skipped -- presumably the CLI's own default; confirm
            # against BLM before changing.
            if ratio != 0.7:
                args.extend(["-bp", str(ratio)])

    if custom_anchors:
        args.extend(["-a", custom_anchors])

    if cli_flags:
        # Plain whitespace split: quoted values containing spaces are not
        # supported here.
        args.extend(cli_flags.strip().split())

    return args
|
|
|
|
|
|
def parse_ingest_output(stdout: str) -> IngestResult:
    """Extract project and job-file details from ingest-cora stdout.

    Each line is stripped and tested against three patterns. The
    ``Success: Project '...' created (ID: N)`` line sets the project name,
    the project ID and ``success``; the ``Job file created:`` and
    ``Main Keyword:`` lines fill ``job_file`` and ``main_keyword``.

    Args:
        stdout: Captured standard output of the ingest-cora run.

    Returns:
        An IngestResult whose ``success`` is True only if a Success line
        was seen.
    """
    success_re = re.compile(r"^Success: Project '(.+)' created \(ID: (\d+)\)$")
    job_file_re = re.compile(r"^Job file created: (.+)$")
    keyword_re = re.compile(r"^Main Keyword: (.+)$")

    parsed = IngestResult(success=False)
    for raw in stdout.splitlines():
        text = raw.strip()
        # At most one pattern is applied per line, in this order.
        if (hit := success_re.match(text)) is not None:
            parsed.project_name = hit.group(1)
            parsed.project_id = hit.group(2)
            parsed.success = True
        elif (hit := job_file_re.match(text)) is not None:
            parsed.job_file = hit.group(1).strip()
        elif (hit := keyword_re.match(text)) is not None:
            parsed.main_keyword = hit.group(1).strip()
    return parsed
|
|
|
|
|
|
def parse_generate_output(stdout: str) -> GenerateResult:
    """Extract the moved-job-file path from generate-batch stdout.

    Args:
        stdout: Captured standard output of the generate-batch run.

    Returns:
        A GenerateResult whose ``success`` is True only if a
        ``Job file moved to:`` line was seen; ``job_moved_to`` holds the
        path from the last such line.
    """
    moved_re = re.compile(r"^Job file moved to: (.+)$")
    outcome = GenerateResult(success=False)
    for text in map(str.strip, stdout.splitlines()):
        hit = moved_re.match(text)
        if hit is None:
            continue
        outcome.job_moved_to = hit.group(1).strip()
        outcome.success = True
    return outcome
|
|
|
|
|
|
def run_ingest(
    xlsx_path: str,
    keyword: str,
    money_site_url: str,
    blm_dir: str,
    timeout: int = 1800,
    branded_plus_ratio: str = "",
    custom_anchors: str = "",
    cli_flags: str = "",
) -> IngestResult:
    """Execute ingest-cora through the BLM CLI and interpret its output.

    The keyword doubles as the project name. A timeout, a missing venv,
    a non-zero exit code and output without a job-file line are all
    reported as a failed IngestResult carrying an ``error`` message
    rather than raised.

    Args:
        xlsx_path: Cora xlsx to ingest.
        keyword: Keyword, used as the project name.
        money_site_url: Money site URL forwarded to the CLI.
        blm_dir: BLM checkout directory.
        timeout: Seconds allowed for the CLI run.
        branded_plus_ratio: Optional ratio forwarded to the argument builder.
        custom_anchors: Optional anchors forwarded to the argument builder.
        cli_flags: Optional extra flags forwarded to the argument builder.

    Returns:
        The parsed IngestResult, or a failure result with ``error`` set.
    """
    cli_args = build_ingest_args(
        xlsx_path=xlsx_path,
        project_name=keyword,
        money_site_url=money_site_url,
        branded_plus_ratio=branded_plus_ratio,
        custom_anchors=custom_anchors,
        cli_flags=cli_flags,
    )

    try:
        proc = _run_blm(cli_args, blm_dir, timeout=timeout)
    except subprocess.TimeoutExpired:
        message = "ingest-cora timed out after %d seconds" % timeout
        return IngestResult(success=False, error=message)
    except FileNotFoundError as exc:
        return IngestResult(success=False, error=str(exc))

    if proc.returncode != 0:
        # Only the tail of each stream is kept in the error message.
        details = (proc.returncode, proc.stdout[-500:], proc.stderr[-500:])
        return IngestResult(
            success=False,
            error="ingest-cora failed (exit code %d).\nstdout: %s\nstderr: %s"
            % details,
        )

    outcome = parse_ingest_output(proc.stdout)
    if outcome.job_file:
        return outcome

    # Exit code 0 without a job file is still treated as a failure.
    return IngestResult(
        success=False,
        error="ingest-cora produced no job file.\nstdout: %s" % proc.stdout[-500:],
    )
|
|
|
|
|
|
def run_generate(
    job_file: str,
    blm_dir: str,
    timeout: int = 1800,
) -> GenerateResult:
    """Run generate-batch and return parsed result.

    A relative ``job_file`` is resolved against ``blm_dir``. The CLI is
    invoked with ``--continue-on-error``. A timeout, a missing venv, a
    non-zero exit code and output without a ``Job file moved to:`` line are
    all reported as a failed GenerateResult carrying an ``error`` message
    rather than raised.

    Args:
        job_file: Job file path, absolute or relative to ``blm_dir``.
        blm_dir: BLM checkout directory.
        timeout: Seconds allowed for the CLI run.

    Returns:
        The parsed GenerateResult, or a failure result with ``error`` set.
    """
    job_path = Path(job_file)
    if not job_path.is_absolute():
        job_path = Path(blm_dir) / job_path
    args = ["generate-batch", "-j", str(job_path), "--continue-on-error"]

    try:
        proc = _run_blm(args, blm_dir, timeout=timeout)
    except subprocess.TimeoutExpired:
        return GenerateResult(
            success=False,
            error="generate-batch timed out after %d seconds" % timeout,
        )
    except FileNotFoundError as e:
        return GenerateResult(success=False, error=str(e))

    if proc.returncode != 0:
        return GenerateResult(
            success=False,
            error="generate-batch failed (exit code %d).\nstdout: %s\nstderr: %s"
            % (proc.returncode, proc.stdout[-500:], proc.stderr[-500:]),
        )

    parsed = parse_generate_output(proc.stdout)
    if not parsed.success and not parsed.error:
        # Exit code 0 but no recognised completion line: explain the failure
        # instead of returning success=False with an empty error, matching
        # how run_ingest reports a missing job file.
        parsed.error = (
            "generate-batch did not report a moved job file.\nstdout: %s"
            % proc.stdout[-500:]
        )
    return parsed
|