From 381d51e00188a3447dea4c551216a4ebb03cae2f Mon Sep 17 00:00:00 2001 From: bryanb Date: Wed, 22 Apr 2026 12:11:16 +0000 Subject: [PATCH] Initial commit: link building workflow extracted from CheddahBot Standalone package wrapping Big-Link-Man (BLM) for Paperclip. Extracted from cheddahbot/tools/linkbuilding.py and related modules, with task-system coupling, folder watching, and AutoCora queue logic stripped out. Public API: - Deps, BLMConfig, LLMCheck (injection types) - normalize_for_match, fuzzy_keyword_match, filename_stem_to_keyword - list_inbox_xlsx, find_xlsx_for_keyword, find_all_xlsx_for_keyword - blm_ingest_cora, blm_generate_batch, run_cora_backlinks (pipelines) - PipelineResult, IngestResult, GenerateResult (return types) 89 tests, 96% coverage. --- .gitignore | 14 + README.md | 211 ++++++++++++ pyproject.toml | 53 +++ src/link_building_workflow/__init__.py | 128 +++++++ src/link_building_workflow/blm.py | 146 ++++++++ src/link_building_workflow/deps.py | 63 ++++ src/link_building_workflow/inbox.py | 116 +++++++ src/link_building_workflow/matching.py | 59 ++++ src/link_building_workflow/pipeline.py | 318 +++++++++++++++++ tests/conftest.py | 80 +++++ tests/test_blm.py | 220 ++++++++++++ tests/test_inbox.py | 170 +++++++++ tests/test_matching.py | 97 ++++++ tests/test_pipeline.py | 460 +++++++++++++++++++++++++ 14 files changed, 2135 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 src/link_building_workflow/__init__.py create mode 100644 src/link_building_workflow/blm.py create mode 100644 src/link_building_workflow/deps.py create mode 100644 src/link_building_workflow/inbox.py create mode 100644 src/link_building_workflow/matching.py create mode 100644 src/link_building_workflow/pipeline.py create mode 100644 tests/conftest.py create mode 100644 tests/test_blm.py create mode 100644 tests/test_inbox.py create mode 100644 tests/test_matching.py create mode 100644 
tests/test_pipeline.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c572a56 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +__pycache__/ +*.py[cod] +*.egg-info/ +.pytest_cache/ +.coverage +.coverage.* +htmlcov/ +coverage.xml +.ruff_cache/ +.venv/ +dist/ +build/ +*.bak +uv.lock diff --git a/README.md b/README.md new file mode 100644 index 0000000..d4beaa4 --- /dev/null +++ b/README.md @@ -0,0 +1,211 @@ +# Linkman-Paperclip-Wrap + +A standalone Python package wrapping the Big-Link-Man (BLM) CLI for use by +Paperclip agents. Extracted from CheddahBot (`cheddahbot/tools/linkbuilding.py`) +and simplified for consumption by external callers. + +## What it does + +Given a task keyword, the package can: + +1. **Find a matching CORA `.xlsx`** in an inbox folder (e.g. `Cora-For-Humans/`) + using fuzzy keyword matching with singular/plural awareness. +2. **Invoke Big-Link-Man** to run `ingest-cora` and `generate-batch` on that + xlsx, producing the backlink content. +3. **Return a structured result** the caller can use to update task state. + +No folder watching, no task-system coupling, no notifications. The caller owns +task state and polling cadence; this package is pure work. + +## Package layout + +``` +src/link_building_workflow/ + deps.py -- Deps, BLMConfig, LLMCheck types + matching.py -- Keyword normalization and fuzzy matching + inbox.py -- Inbox folder scanning (list / find-by-keyword) + blm.py -- BLM subprocess wrapper and stdout parsers + pipeline.py -- run_cora_backlinks, blm_ingest_cora, blm_generate_batch + __init__.py -- Public API re-exports +``` + +## Installation + +``` +uv add git+https://git.peninsulaindustries.com/bryanb/Linkman-Paperclip-Wrap.git +``` + +Big-Link-Man itself is a separate dependency the caller provides. Install it +on the same host and point `BLMConfig.blm_dir` at the checkout. 
+ +## Public API + +All imports available from the top level: + +```python +from link_building_workflow import ( + # Dependency types + Deps, BLMConfig, LLMCheck, + # Matching primitives + normalize_for_match, fuzzy_keyword_match, filename_stem_to_keyword, + # Inbox scanning + InboxMatch, list_inbox_xlsx, find_xlsx_for_keyword, find_all_xlsx_for_keyword, + # Pipeline entry points + PipelineResult, run_cora_backlinks, blm_ingest_cora, blm_generate_batch, + # Low-level BLM (if you need to run a custom BLM command) + IngestResult, GenerateResult, build_ingest_args, + parse_ingest_output, parse_generate_output, run_blm_command, +) +``` + +## Typical usage (Paperclip) + +The caller decides when a task is eligible to run (all required task fields +filled in, xlsx present in the inbox). This package provides the primitives +to check the xlsx gate and to execute the work. + +```python +from link_building_workflow import ( + Deps, BLMConfig, find_xlsx_for_keyword, run_cora_backlinks, +) + +deps = Deps( + blm=BLMConfig( + blm_dir="/opt/big-link-man", + username="your-blm-user", + password="your-blm-pass", + timeout_seconds=1800, + ), + llm_check=your_plural_checker, # callable[[str, str], bool] +) + +def try_run_link_building(task): + # Caller gates 1-4: task-field checks (LB Method, Keyword, IMSURL, ...) + if not (task.keyword and task.imsurl): + return "blocked: missing task fields" + + # Gate 5: does a matching xlsx exist yet? 
+ match = find_xlsx_for_keyword( + "/data/Cora-For-Humans", + task.keyword, + deps.llm_check, + ) + if match is None: + return "blocked: no xlsx in Cora-For-Humans" + + # Execute + result = run_cora_backlinks( + xlsx_path=str(match.path), + project_name=task.keyword, + money_site_url=task.imsurl, + custom_anchors=task.custom_anchors or "", + cli_flags=task.cli_flags or "", + branded_plus_ratio=task.branded_plus_ratio, # None -> BLMConfig default + deps=deps, + ) + + if result.ok: + # result.summary is a multi-line human-readable string + # result.ingest.project_id, result.generate.job_moved_to, etc. + return f"done: {result.summary}" + else: + # result.step tells you where it stopped: "ingest" or "generate" + # result.error has the details + return f"failed at {result.step}: {result.error}" +``` + +## The `LLMCheck` callable + +Used when the fast-path string equality fails during fuzzy matching. Should +return `True` iff two keywords are the same modulo plural form ("shaft" vs +"shafts", "company" vs "companies"). Return `False` for any other kind of +difference. Implementations should cache -- the workflow may call this +repeatedly with the same pair while scanning an inbox. + +Example implementation (the one CheddahBot uses): + +```python +import httpx + +_cache = {} + +def openrouter_plural_check(a: str, b: str) -> bool: + key = (a, b) if a <= b else (b, a) + if key in _cache: + return _cache[key] + resp = httpx.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"}, + json={ + "model": "anthropic/claude-haiku-4.5", + "max_tokens": 5, + "messages": [ + {"role": "system", "content": + "Reply with only 'YES' or 'NO'. 
YES iff the two keywords " + "are identical except for singular/plural form."}, + {"role": "user", "content": f'A: "{a}"\nB: "{b}"'}, + ], + }, + timeout=15, + ) + result = "YES" in resp.json()["choices"][0]["message"]["content"].upper() + _cache[key] = result + return result +``` + +Tests may pass `lambda a, b: False` for the fast-path-only case, or any +deterministic fake. + +## The `PipelineResult` dataclass + +Every pipeline entry point returns the same shape: + +| field | meaning | +|-----------------|----------------------------------------------------------------| +| `ok` | True if the pipeline completed the phase it was asked to do | +| `step` | "ingest" / "generate" / "complete" (on success) or where it failed | +| `ingest` | `IngestResult` if ingest ran, else None | +| `generate` | `GenerateResult` if generate ran, else None | +| `error` | Human-readable error message (empty on success) | +| `summary` | Multi-line human-readable summary, safe to post as a comment | +| `project_name` | The BLM project name | +| `job_file` | Path to the final job file (post-move on success) | +| `log_lines` | Progress messages captured during the run | + +## What this package does NOT do + +- Does not watch folders. No threads, no polling loops. +- Does not know about ClickUp, Linear, or any task system. The caller owns + task state and decides what status transitions mean. +- Does not sync with shared-folder job queues (the old AutoCora queue). +- Does not manage the Cora tool itself. It only consumes xlsx files that + Cora has already produced. +- Does not pick up where BLM leaves off. When BLM finishes `generate-batch`, + the job is done from this package's perspective. + +These were deliberate drops during extraction. CheddahBot had folder-watch +threads, ClickUp auto-matching, AutoCora queue submission, and a multi-inbox +distribution loop. Paperclip owns that scheduling logic in its own code. 
+ +## Development + +Requires Python 3.11+ and [uv](https://docs.astral.sh/uv/). + +``` +uv sync # install dev + test deps +uv run pytest # run the test suite (89 tests, ~96% coverage) +uv run ruff check . # lint +``` + +## Provenance + +Extracted from the CheddahBot repo, specifically: + +- `cheddahbot/tools/linkbuilding.py` -- pipeline logic and fuzzy matching +- `cheddahbot/tools/autocora.py` -- only the fuzzy-match helpers were kept; + the shared-folder job queue and result polling were dropped +- `cheddahbot/scheduler.py` -- folder-watch loops were dropped; their + matching logic was converted to a synchronous `find_xlsx_for_keyword` call + +The BLM invocation parameters, stdout parsing regexes, and default ratios +match CheddahBot's production behavior exactly. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2f8ec6e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,53 @@ +[project] +name = "link-building-workflow" +version = "0.1.0" +description = "CORA xlsx -> Big-Link-Man link building pipeline, extracted from CheddahBot for Paperclip" +requires-python = ">=3.11" +dependencies = [] + +[build-system] +requires = ["uv_build>=0.9,<1"] +build-backend = "uv_build" + +[tool.uv.build-backend] +module-root = "src" + +[dependency-groups] +dev = [{ include-group = "lint" }, { include-group = "test" }] +lint = ["ruff"] +test = ["pytest", "pytest-cov"] + +[tool.uv] +default-groups = ["dev", "test"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = [ + "-ra", + "--strict-markers", + "--strict-config", + "--cov=link_building_workflow", + "--cov-report=term-missing", +] + +[tool.coverage.run] +branch = true +source = ["link_building_workflow"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "if TYPE_CHECKING:", + "if __name__ == .__main__.:", +] +show_missing = true + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B", "SIM", "RUF"] + 
+[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["S101", "PLR2004", "ANN"] diff --git a/src/link_building_workflow/__init__.py b/src/link_building_workflow/__init__.py new file mode 100644 index 0000000..eb2685a --- /dev/null +++ b/src/link_building_workflow/__init__.py @@ -0,0 +1,128 @@ +"""Link building workflow: CORA xlsx -> Big-Link-Man pipeline. + +A standalone package extracted from CheddahBot for consumption by Paperclip +(and anything else that wants to trigger link-building work). The caller +owns task state; this package owns the work. + +Public API +---------- +Matching primitives:: + + normalize_for_match(text) -> str + fuzzy_keyword_match(a, b, llm_check=None) -> bool + filename_stem_to_keyword(stem) -> str + +Inbox scanning:: + + list_inbox_xlsx(folder) -> list[Path] + find_xlsx_for_keyword(folder, keyword, llm_check=None) -> InboxMatch | None + find_all_xlsx_for_keyword(folder, keyword, llm_check=None) -> list[InboxMatch] + +Pipeline entry points:: + + run_cora_backlinks(xlsx_path, project_name, money_site_url, deps, ...) -> PipelineResult + blm_ingest_cora(xlsx_path, project_name, deps, ...) -> PipelineResult + blm_generate_batch(job_file, deps, ...) -> PipelineResult + +Dependency types:: + + Deps(blm, llm_check) + BLMConfig(blm_dir, username, password, timeout_seconds, ...) + LLMCheck = Callable[[str, str], bool] + +Typical Paperclip usage +----------------------- + + from link_building_workflow import ( + Deps, BLMConfig, + find_xlsx_for_keyword, + run_cora_backlinks, + ) + + deps = Deps( + blm=BLMConfig(blm_dir="/opt/blm", username="...", password="..."), + llm_check=my_plural_checker, + ) + + # Gate: does a matching xlsx exist yet? 
+ match = find_xlsx_for_keyword( + "/data/Cora-For-Humans", task.keyword, deps.llm_check + ) + if match is None: + return "waiting for xlsx" + + # All gates passed -- run the work + result = run_cora_backlinks( + xlsx_path=str(match.path), + project_name=task.keyword, + money_site_url=task.imsurl, + deps=deps, + ) + if result.ok: + # success -- post result.summary as a comment, advance task state + ... + else: + # failure -- result.error has the reason, result.step is where it stopped + ... +""" + +from __future__ import annotations + +from .blm import ( + GenerateResult, + IngestResult, + build_ingest_args, + parse_generate_output, + parse_ingest_output, + run_blm_command, +) +from .deps import BLMConfig, Deps, LLMCheck +from .inbox import ( + InboxMatch, + find_all_xlsx_for_keyword, + find_xlsx_for_keyword, + list_inbox_xlsx, +) +from .matching import ( + filename_stem_to_keyword, + fuzzy_keyword_match, + normalize_for_match, +) +from .pipeline import ( + PipelineResult, + ProgressFn, + blm_generate_batch, + blm_ingest_cora, + run_cora_backlinks, +) + +__all__ = [ # noqa: RUF022 -- grouped by module for readability + # deps + "Deps", + "BLMConfig", + "LLMCheck", + # matching + "normalize_for_match", + "fuzzy_keyword_match", + "filename_stem_to_keyword", + # inbox + "InboxMatch", + "list_inbox_xlsx", + "find_xlsx_for_keyword", + "find_all_xlsx_for_keyword", + # blm low-level + "IngestResult", + "GenerateResult", + "build_ingest_args", + "parse_ingest_output", + "parse_generate_output", + "run_blm_command", + # pipeline + "PipelineResult", + "ProgressFn", + "run_cora_backlinks", + "blm_ingest_cora", + "blm_generate_batch", +] + +__version__ = "0.1.0" diff --git a/src/link_building_workflow/blm.py b/src/link_building_workflow/blm.py new file mode 100644 index 0000000..0bc8729 --- /dev/null +++ b/src/link_building_workflow/blm.py @@ -0,0 +1,146 @@ +"""Big-Link-Man CLI wrapper. + +BLM is an external Python tool. 
We invoke it via subprocess using whatever +Python interpreter the caller configured in `BLMConfig.python_exe`. With +BLM installed alongside the caller on the same host (the expected Paperclip +setup), this is usually just "python". +""" + +from __future__ import annotations + +import logging +import re +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from .deps import BLMConfig + +log = logging.getLogger(__name__) + + +@dataclass +class IngestResult: + """Parsed output of an `ingest-cora` run.""" + + project_id: str = "" + job_file: str = "" + project_name: str = "" + main_keyword: str = "" + + @property + def success(self) -> bool: + return bool(self.project_id and self.job_file) + + +@dataclass +class GenerateResult: + """Parsed output of a `generate-batch` run.""" + + job_moved_to: str = "" + success: bool = False + raw_output: str = "" + + +def build_ingest_args( + *, + xlsx_path: str, + project_name: str, + money_site_url: str = "", + branded_plus_ratio: float = 0.7, + custom_anchors: str = "", + cli_flags: str = "", +) -> list[str]: + """Construct the argv tail for `main.py ingest-cora ...`. + + Does not include the interpreter or `main.py`. Does not include -u/-p + credentials -- those are injected by `run_blm_command`. Only emits + -bp when the ratio differs from BLM's own default of 0.7 to keep the + command line minimal when defaults are in play. + """ + args = ["ingest-cora", "-f", xlsx_path, "-n", project_name] + if money_site_url: + args.extend(["-m", money_site_url]) + if branded_plus_ratio and branded_plus_ratio != 0.7: + args.extend(["-bp", str(branded_plus_ratio)]) + if custom_anchors: + args.extend(["-a", custom_anchors]) + if cli_flags: + args.extend(cli_flags.strip().split()) + return args + + +def run_blm_command( + args: list[str], + blm: BLMConfig, +) -> subprocess.CompletedProcess: + """Run a BLM CLI command and return the CompletedProcess. 
+ + Always injects -u/-p from `blm.username`/`blm.password` unless the + caller already put them in `args`. cwd is set to `blm.blm_dir` so BLM + can find its own relative paths (config files, jobs/ directory, etc.). + + Raises subprocess.TimeoutExpired if the command exceeds + `blm.timeout_seconds`. Raises FileNotFoundError if `blm.blm_dir` or + `blm.python_exe` can't be found. + """ + blm_path = Path(blm.blm_dir) + if not blm_path.exists(): + raise FileNotFoundError(f"BLM directory not found: {blm.blm_dir}") + + cmd: list[str] = [blm.python_exe, "main.py", *args] + # Log before -u/-p are appended so credentials never reach the log. + log.info("Running BLM: %s (cwd=%s)", " ".join(cmd), blm.blm_dir) + if blm.username and "-u" not in args and "--username" not in args: + cmd.extend(["-u", blm.username]) + if blm.password and "-p" not in args and "--password" not in args: + cmd.extend(["-p", blm.password]) + + result = subprocess.run( + cmd, + cwd=blm.blm_dir, + capture_output=True, + text=True, + timeout=blm.timeout_seconds, + ) + log.info("BLM exit=%d", result.returncode) + if result.stdout: + log.debug("BLM stdout: %s", result.stdout[:1000]) + if result.stderr: + log.debug("BLM stderr: %s", result.stderr[:1000]) + return result + + +_INGEST_PROJECT_RE = re.compile(r"^Success: Project '(.+)' created \(ID: (\d+)\)$") +_INGEST_JOB_RE = re.compile(r"^Job file created: (.+)$") +_INGEST_KEYWORD_RE = re.compile(r"^Main Keyword: (.+)$") +_GENERATE_MOVED_RE = re.compile(r"^Job file moved to: (.+)$") + + +def parse_ingest_output(stdout: str) -> IngestResult: + """Extract project id, job file path, and main keyword from ingest stdout.""" + result = IngestResult() + for raw in stdout.splitlines(): + line = raw.strip() + if m := _INGEST_PROJECT_RE.match(line): + result.project_name = m.group(1) + result.project_id = m.group(2) + continue + if m := _INGEST_JOB_RE.match(line): + result.job_file = m.group(1).strip() + continue + if m := _INGEST_KEYWORD_RE.match(line): + result.main_keyword = m.group(1).strip() + continue + return result + + +def
parse_generate_output(stdout: str) -> GenerateResult: + """Extract the post-run job file path from generate-batch stdout.""" + result = GenerateResult(raw_output=stdout) + for raw in stdout.splitlines(): + line = raw.strip() + if m := _GENERATE_MOVED_RE.match(line): + result.job_moved_to = m.group(1).strip() + result.success = True + return result diff --git a/src/link_building_workflow/deps.py b/src/link_building_workflow/deps.py new file mode 100644 index 0000000..7e4e9a7 --- /dev/null +++ b/src/link_building_workflow/deps.py @@ -0,0 +1,63 @@ +"""Dependency injection types for the link building workflow. + +The workflow is LLM-agnostic: the caller (Paperclip, tests, anything) implements +the `LLMCheck` callable and passes a `Deps` instance into every public function. + +Nothing in this module touches the network, a task system, or an LLM directly. +Task state (status, custom fields, comments) is owned by the caller -- this +package only returns structured results and lets the caller decide what to do. +""" + +from __future__ import annotations + +from collections.abc import Callable +from dataclasses import dataclass + +# An LLM-backed equality check for singular/plural keyword matching. +# Returns True iff `a` and `b` are the same SEO keyword modulo plural form +# (e.g. "shaft" vs "shafts", "company" vs "companies"). Returns False for +# any other kind of difference. +# +# Implementations should cache results; the workflow may call this repeatedly +# with the same pair while scanning a folder of xlsx files against a task list. +LLMCheck = Callable[[str, str], bool] + + +@dataclass +class BLMConfig: + """Configuration for invoking the Big-Link-Man CLI. + + BLM is an external Python tool; the workflow shells out to it. With BLM + installed on the same host as the caller (the expected Paperclip setup), + the default of using the system `python` resolves BLM's own dependencies + if BLM was installed into the same environment. 
Override `python_exe` to + point at a specific interpreter when BLM has its own virtualenv. + """ + + # Path to the Big-Link-Man checkout. Must contain main.py. + blm_dir: str + # BLM auth, passed as -u / -p on every CLI call. Empty strings are + # skipped, so it's safe to leave these unset if BLM doesn't need them. + username: str = "" + password: str = "" + # Subprocess timeout per BLM invocation, in seconds. + # Default covers generate-batch runs of ~25 min plus headroom. + timeout_seconds: int = 1800 + # Default branded+ ratio passed to ingest-cora if the caller doesn't + # supply one. BLM's own default is 0.7; we match it. + default_branded_plus_ratio: float = 0.7 + # Python interpreter used to run BLM. Defaults to "python" (on PATH). + # Set to an absolute path like "/opt/blm/.venv/bin/python" if BLM has + # its own venv separate from the caller. + python_exe: str = "python" + + +@dataclass +class Deps: + """Container for everything the workflow needs from the outside world. + + Construct this once per run and pass it through. + """ + + blm: BLMConfig + llm_check: LLMCheck diff --git a/src/link_building_workflow/inbox.py b/src/link_building_workflow/inbox.py new file mode 100644 index 0000000..429f645 --- /dev/null +++ b/src/link_building_workflow/inbox.py @@ -0,0 +1,116 @@ +"""Locate CORA .xlsx files in an inbox folder by keyword. + +The caller's task state tells them which keyword they're looking for; this +module tells them whether a matching xlsx exists and where. No folder +watching, no threading -- it's a one-shot scan that Paperclip calls when +evaluating whether a task can run. 
+""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from pathlib import Path + +from .deps import LLMCheck +from .matching import filename_stem_to_keyword, fuzzy_keyword_match, normalize_for_match + +log = logging.getLogger(__name__) + + +@dataclass +class InboxMatch: + """A single xlsx in the inbox that matched the requested keyword.""" + + path: Path + filename: str + stem_keyword: str # the normalized keyword derived from the filename + + +def list_inbox_xlsx( + folder: str | Path, + *, + skip_processed: bool = True, +) -> list[Path]: + """Return all .xlsx files in `folder` that are candidates for matching. + + Skips Office temp/lock files (names starting with "~$"). If + `skip_processed` is True (the default), also skips any file whose name + appears in `folder/processed/` -- that subfolder is treated as the + archive of already-handled files. + + Returns an empty list if the folder doesn't exist. The caller decides + whether that's an error or a "no xlsx available yet" gate. + """ + path = Path(folder) + if not path.exists(): + log.warning("Inbox folder does not exist: %s", path) + return [] + + processed_names: set[str] = set() + if skip_processed: + processed_dir = path / "processed" + if processed_dir.exists(): + processed_names = {f.name for f in processed_dir.glob("*.xlsx")} + + candidates: list[Path] = [] + for f in sorted(path.glob("*.xlsx")): + if f.name.startswith("~$"): + continue + if f.name in processed_names: + continue + candidates.append(f) + return candidates + + +def find_xlsx_for_keyword( + folder: str | Path, + keyword: str, + llm_check: LLMCheck | None = None, + *, + skip_processed: bool = True, +) -> InboxMatch | None: + """Find a single xlsx in `folder` whose filename matches `keyword`. + + Returns the first match on a filename-stem fuzzy-match against the + requested keyword, or None if nothing matches. + + `keyword` is normalized internally, so the caller can pass it in any + form (e.g. 
"Precision CNC Machining" or "precision-cnc-machining"). + + Uses `llm_check` for singular/plural equivalence; falls back to exact + match if `llm_check` is None. `skip_processed` controls whether files + already in `folder/processed/` are considered. + """ + matches = find_all_xlsx_for_keyword( + folder, keyword, llm_check, skip_processed=skip_processed + ) + return matches[0] if matches else None + + +def find_all_xlsx_for_keyword( + folder: str | Path, + keyword: str, + llm_check: LLMCheck | None = None, + *, + skip_processed: bool = True, +) -> list[InboxMatch]: + """Find every xlsx in `folder` whose filename matches `keyword`. + + Same matching rules as `find_xlsx_for_keyword`, but returns all matches + instead of just the first. Useful when a keyword legitimately has + several xlsx variants (for example an original and a re-run) and the + caller wants to pick the newest by mtime. + """ + target = normalize_for_match(keyword) + if not target: + return [] + + results: list[InboxMatch] = [] + for f in list_inbox_xlsx(folder, skip_processed=skip_processed): + stem_kw = filename_stem_to_keyword(f.stem) + if fuzzy_keyword_match(target, stem_kw, llm_check): + results.append( + InboxMatch(path=f, filename=f.name, stem_keyword=stem_kw) + ) + return results diff --git a/src/link_building_workflow/matching.py b/src/link_building_workflow/matching.py new file mode 100644 index 0000000..c94ffd1 --- /dev/null +++ b/src/link_building_workflow/matching.py @@ -0,0 +1,59 @@ +"""Keyword normalization and fuzzy matching. + +Pure functions with no I/O. The LLM check for singular/plural equivalence +is injected, so tests can substitute a deterministic fake and production +can plug in any model. +""" + +from __future__ import annotations + +import re + +from .deps import LLMCheck + + +def normalize_for_match(text: str) -> str: + """Normalize text for fuzzy matching. + + Lowercases, replaces runs of non-alphanumeric characters with a single + space, and collapses whitespace. 
The result is suitable as input to + `fuzzy_keyword_match`. + """ + text = text.lower().strip() + text = re.sub(r"[^a-z0-9\s]", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text + + +def fuzzy_keyword_match(a: str, b: str, llm_check: LLMCheck | None = None) -> bool: + """Compare two normalized strings for keyword equivalence. + + Fast path: exact string equality returns True immediately. + Slow path: delegate to `llm_check` to decide if the two keywords differ + only in singular vs plural form. + + If `llm_check` is None and the fast path fails, returns False. Empty + inputs always return False. + + Inputs must be pre-normalized (call `normalize_for_match` first): this + function does no normalization itself, so un-normalized strings miss + the fast path and fall through to `llm_check` (or to False). + """ + if not a or not b: + return False + if a == b: + return True + if llm_check is None: + return False + return llm_check(a, b) + + +def filename_stem_to_keyword(stem: str) -> str: + """Convert a filename stem to a matchable keyword. + + Example: "precision-cnc_machining" -> "precision cnc machining" + + The returned value is already normalized. + """ + stem = stem.lower().replace("-", " ").replace("_", " ") + return normalize_for_match(stem) diff --git a/src/link_building_workflow/pipeline.py b/src/link_building_workflow/pipeline.py new file mode 100644 index 0000000..4d698e7 --- /dev/null +++ b/src/link_building_workflow/pipeline.py @@ -0,0 +1,318 @@ +"""High-level link building pipelines. + +These functions do the work and return a structured result. They do NOT +touch any task system. The caller (Paperclip) reads the returned +`PipelineResult` and decides what to do: update task status, post a +comment, move the xlsx to a processed folder, alert, etc. +""" + +from __future__ import annotations + +import logging +import subprocess +from collections.abc import Callable +from dataclasses import dataclass, field +from pathlib import Path + +from .
import blm as blm_mod +from .blm import ( + GenerateResult, + IngestResult, + build_ingest_args, + parse_generate_output, + parse_ingest_output, +) +from .deps import Deps + +log = logging.getLogger(__name__) + + +ProgressFn = Callable[[str], None] + + +@dataclass +class PipelineResult: + """Outcome of a full or partial link building pipeline run. + + `ok` is the single boolean the caller should branch on. `step` tells + them which phase the result is from: on failure it's where the + pipeline stopped; on success it's "ingest", "generate", or "complete" + depending on which entry point was called. + + `summary` is a multi-line human-readable string safe to post as a + task comment or log. `error` is populated only when `ok` is False. + """ + + ok: bool + step: str # "ingest" | "generate" | "complete" + ingest: IngestResult | None = None + generate: GenerateResult | None = None + error: str = "" + summary: str = "" + project_name: str = "" + job_file: str = "" + log_lines: list[str] = field(default_factory=list) + + +def _err( + step: str, + message: str, + ingest: IngestResult | None = None, + generate: GenerateResult | None = None, + log_lines: list[str] | None = None, +) -> PipelineResult: + return PipelineResult( + ok=False, + step=step, + ingest=ingest, + generate=generate, + error=message, + summary=f"Error during {step}: {message}", + log_lines=log_lines or [], + ) + + +def blm_ingest_cora( + *, + xlsx_path: str, + project_name: str, + deps: Deps, + money_site_url: str = "", + branded_plus_ratio: float | None = None, + custom_anchors: str = "", + cli_flags: str = "", +) -> PipelineResult: + """Run only BLM's `ingest-cora` command. + + Use when you want to parse a CORA .xlsx into a BLM project + job file + without immediately generating content. The returned result carries + the project id and the job file path; pass that path to + `blm_generate_batch` later to produce content. 
+ """ + if not xlsx_path: + return _err("ingest", "xlsx_path is required") + if not project_name: + return _err("ingest", "project_name is required") + if not Path(xlsx_path).exists(): + return _err("ingest", f"CORA file not found: {xlsx_path}") + + bp_ratio = ( + branded_plus_ratio + if branded_plus_ratio is not None + else deps.blm.default_branded_plus_ratio + ) + + args = build_ingest_args( + xlsx_path=xlsx_path, + project_name=project_name, + money_site_url=money_site_url, + branded_plus_ratio=bp_ratio, + custom_anchors=custom_anchors, + cli_flags=cli_flags, + ) + + try: + proc = blm_mod.run_blm_command(args, deps.blm) + except subprocess.TimeoutExpired: + return _err( + "ingest", f"ingest-cora timed out after {deps.blm.timeout_seconds // 60}m" + ) + except FileNotFoundError as e: + return _err("ingest", str(e)) + + ingest = parse_ingest_output(proc.stdout) + if proc.returncode != 0 or not ingest.success: + return _err( + "ingest", + f"ingest-cora failed (exit={proc.returncode}). " + f"stdout tail: {proc.stdout[-500:]}\nstderr tail: {proc.stderr[-500:]}", + ingest=ingest, + ) + + summary = ( + f"CORA ingest complete.\n" + f"- Project: {ingest.project_name} (ID: {ingest.project_id})\n" + f"- Keyword: {ingest.main_keyword}\n" + f"- Job file: {ingest.job_file}" + ) + return PipelineResult( + ok=True, + step="ingest", + ingest=ingest, + summary=summary, + project_name=ingest.project_name, + job_file=ingest.job_file, + ) + + +def blm_generate_batch( + *, + job_file: str, + deps: Deps, + continue_on_error: bool = True, + debug: bool = False, +) -> PipelineResult: + """Run only BLM's `generate-batch` command on an existing job file. + + `job_file` may be absolute or relative; relative paths are resolved + against `deps.blm.blm_dir`. 
+ """ + if not job_file: + return _err("generate", "job_file is required") + + job_path = ( + Path(job_file) + if Path(job_file).is_absolute() + else Path(deps.blm.blm_dir) / job_file + ) + if not job_path.exists(): + return _err("generate", f"Job file not found: {job_path}") + + args = ["generate-batch", "-j", str(job_path)] + if continue_on_error: + args.append("--continue-on-error") + if debug: + args.append("--debug") + + try: + proc = blm_mod.run_blm_command(args, deps.blm) + except subprocess.TimeoutExpired: + return _err( + "generate", + f"generate-batch timed out after {deps.blm.timeout_seconds // 60}m", + ) + except FileNotFoundError as e: + return _err("generate", str(e)) + + gen = parse_generate_output(proc.stdout) + if proc.returncode != 0: + return _err( + "generate", + f"generate-batch failed (exit={proc.returncode}). " + f"stdout tail: {proc.stdout[-500:]}\nstderr tail: {proc.stderr[-500:]}", + generate=gen, + ) + + parts = ["Content generation complete."] + parts.append(f"- Status: {'Success' if gen.success else 'Completed'}") + if gen.job_moved_to: + parts.append(f"- Job moved to: {gen.job_moved_to}") + + return PipelineResult( + ok=True, + step="generate", + generate=gen, + summary="\n".join(parts), + job_file=gen.job_moved_to or job_file, + ) + + +def run_cora_backlinks( + *, + xlsx_path: str, + project_name: str, + money_site_url: str, + deps: Deps, + branded_plus_ratio: float | None = None, + custom_anchors: str = "", + cli_flags: str = "", + on_progress: ProgressFn | None = None, +) -> PipelineResult: + """Full Cora Backlinks pipeline: ingest-cora -> generate-batch. + + Requires `money_site_url` (IMSURL) -- BLM cannot run interactively in + subprocess mode, so the URL must be supplied up front. + + `on_progress` is an optional callback invoked with free-form status + strings ("Step 1/2: ..." etc.); pass one if you want live updates + streamed to a UI, log, or task comment as the pipeline runs. 
The + same strings are also captured in the returned `log_lines`. + + On failure, the returned `PipelineResult` has `ok=False`, `step` set + to where it stopped ("ingest" or "generate"), and `error` populated. + On success, `step == "complete"` and both `ingest` and `generate` are + populated. + """ + if not xlsx_path: + return _err("ingest", "xlsx_path is required") + if not project_name: + return _err("ingest", "project_name is required") + if not money_site_url: + return _err( + "ingest", + "money_site_url (IMSURL) is required; BLM runs non-interactively", + ) + + log_lines: list[str] = [] + + def _progress(msg: str) -> None: + log_lines.append(msg) + log.info("[LB Pipeline] %s", msg) + if on_progress is not None: + try: + on_progress(msg) + except Exception: + log.exception("on_progress callback raised; continuing") + + _progress(f"Step 1/2: Ingesting CORA report for {project_name}...") + + ingest = blm_ingest_cora( + xlsx_path=xlsx_path, + project_name=project_name, + deps=deps, + money_site_url=money_site_url, + branded_plus_ratio=branded_plus_ratio, + custom_anchors=custom_anchors, + cli_flags=cli_flags, + ) + if not ingest.ok: + ingest.log_lines = log_lines + return ingest + + assert ingest.ingest is not None + ing = ingest.ingest + + _progress(f"Step 2/2: Generating content batch for {project_name}...") + + gen = blm_generate_batch( + job_file=ing.job_file, + deps=deps, + continue_on_error=True, + ) + if not gen.ok: + return PipelineResult( + ok=False, + step="generate", + ingest=ing, + generate=gen.generate, + error=gen.error, + summary=ingest.summary + "\n\n" + gen.summary, + project_name=project_name, + job_file=ing.job_file, + log_lines=log_lines, + ) + + assert gen.generate is not None + g = gen.generate + + summary = ( + f"## Step 1: Ingest CORA Report\n" + f"- Project: {project_name} (ID: {ing.project_id})\n" + f"- Keyword: {ing.main_keyword}\n" + f"- Job file: {ing.job_file}\n" + f"\n" + f"## Step 2: Generate Content Batch\n" + f"- Status: 
{'Success' if g.success else 'Completed'}\n" + + (f"- Job moved to: {g.job_moved_to}\n" if g.job_moved_to else "") + ) + + return PipelineResult( + ok=True, + step="complete", + ingest=ing, + generate=g, + summary=summary, + project_name=project_name, + job_file=g.job_moved_to or ing.job_file, + log_lines=log_lines, + ) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..ccb9b85 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,80 @@ +"""Shared test fixtures.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from link_building_workflow import BLMConfig, Deps, LLMCheck + + +@pytest.fixture() +def blm_dir(tmp_path: Path) -> Path: + """A fake BLM directory on disk so run_blm_command's existence check passes.""" + d = tmp_path / "blm" + d.mkdir() + # Touch main.py so any accidental real subprocess call gets further; we + # still mock subprocess.run in tests, but this is a harmless safety net. + (d / "main.py").write_text("# fake\n") + return d + + +@pytest.fixture() +def blm_config(blm_dir: Path) -> BLMConfig: + return BLMConfig( + blm_dir=str(blm_dir), + username="testuser", + password="testpass", + timeout_seconds=300, + python_exe="python", + ) + + +@pytest.fixture() +def llm_never() -> LLMCheck: + """LLM check that always returns False (fast-path only matches).""" + return lambda a, b: False + + +@pytest.fixture() +def llm_always() -> LLMCheck: + """LLM check that always returns True (treat everything as plural-equiv).""" + return lambda a, b: True + + +@pytest.fixture() +def deps(blm_config: BLMConfig, llm_never) -> Deps: + return Deps(blm=blm_config, llm_check=llm_never) + + +# Canonical ingest stdout, matches the BLM output format the parser is tuned for +@pytest.fixture() +def ingest_success_stdout() -> str: + return ( + "Authenticated as: testuser (User)\n" + "\n" + "Parsing CORA file: /tmp/test.xlsx\n" + "Main Keyword: precision cnc machining\n" + "Word Count: 1500\n" + "\n" + "Creating 
project: Test Project\n" + "Money Site URL: https://example.com\n" + "\n" + "Success: Project 'Test Project' created (ID: 42)\n" + "Main Keyword: precision cnc machining\n" + "Money Site URL: https://example.com\n" + "Job file created: jobs/test-project.json\n" + ) + + +@pytest.fixture() +def generate_success_stdout() -> str: + return ( + "Loading job file: jobs/test-project.json\n" + "Generating backlink 1 of 10...\n" + "Generating backlink 2 of 10...\n" + "...\n" + "All backlinks generated.\n" + "Job file moved to: jobs/done/test-project.json\n" + ) diff --git a/tests/test_blm.py b/tests/test_blm.py new file mode 100644 index 0000000..d8717d8 --- /dev/null +++ b/tests/test_blm.py @@ -0,0 +1,220 @@ +"""Tests for the BLM CLI subprocess wrapper and output parsers.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from link_building_workflow import BLMConfig +from link_building_workflow.blm import ( + build_ingest_args, + parse_generate_output, + parse_ingest_output, + run_blm_command, +) + + +class TestBuildIngestArgs: + def test_required_args_only(self): + args = build_ingest_args(xlsx_path="/tmp/f.xlsx", project_name="P") + assert args == ["ingest-cora", "-f", "/tmp/f.xlsx", "-n", "P"] + + def test_with_money_site_url(self): + args = build_ingest_args( + xlsx_path="/tmp/f.xlsx", + project_name="P", + money_site_url="https://example.com", + ) + assert "-m" in args + i = args.index("-m") + assert args[i + 1] == "https://example.com" + + def test_branded_plus_ratio_default_omitted(self): + args = build_ingest_args( + xlsx_path="/tmp/f.xlsx", project_name="P", branded_plus_ratio=0.7 + ) + assert "-bp" not in args + + def test_branded_plus_ratio_custom_included(self): + args = build_ingest_args( + xlsx_path="/tmp/f.xlsx", project_name="P", branded_plus_ratio=0.8 + ) + assert "-bp" in args + assert args[args.index("-bp") + 1] == "0.8" + + def test_custom_anchors(self): 
+ args = build_ingest_args( + xlsx_path="/tmp/f.xlsx", + project_name="P", + custom_anchors="a1,a2", + ) + assert "-a" in args + assert args[args.index("-a") + 1] == "a1,a2" + + def test_cli_flags_split_on_whitespace(self): + args = build_ingest_args( + xlsx_path="/tmp/f.xlsx", + project_name="P", + cli_flags="--foo --bar baz", + ) + assert "--foo" in args + assert "--bar" in args + assert "baz" in args + + def test_cli_flags_empty_string_no_extra_args(self): + args = build_ingest_args( + xlsx_path="/tmp/f.xlsx", project_name="P", cli_flags="" + ) + assert args == ["ingest-cora", "-f", "/tmp/f.xlsx", "-n", "P"] + + +class TestParseIngestOutput: + def test_full_success(self, ingest_success_stdout): + result = parse_ingest_output(ingest_success_stdout) + assert result.project_id == "42" + assert result.project_name == "Test Project" + assert result.main_keyword == "precision cnc machining" + assert result.job_file == "jobs/test-project.json" + assert result.success is True + + def test_missing_project_line(self): + stdout = "Job file created: jobs/x.json\n" + result = parse_ingest_output(stdout) + assert result.project_id == "" + assert result.project_name == "" + assert result.success is False # no project_id + + def test_missing_job_line(self): + stdout = "Success: Project 'X' created (ID: 1)\n" + result = parse_ingest_output(stdout) + assert result.project_id == "1" + assert result.job_file == "" + assert result.success is False # no job_file + + def test_empty_stdout(self): + result = parse_ingest_output("") + assert result.project_id == "" + assert result.job_file == "" + assert result.success is False + + def test_ignores_noise(self): + stdout = ( + "Some random banner\n" + "DEBUG: lots of stuff\n" + "Success: Project 'Foo Bar' created (ID: 99)\n" + "WARNING: meaningless\n" + "Main Keyword: foo bar\n" + "Job file created: jobs/foo-bar.json\n" + "Done.\n" + ) + result = parse_ingest_output(stdout) + assert result.project_id == "99" + assert result.project_name 
== "Foo Bar" + assert result.main_keyword == "foo bar" + assert result.job_file == "jobs/foo-bar.json" + + def test_whitespace_around_job_file(self): + stdout = "Job file created: jobs/x.json \n" + result = parse_ingest_output(stdout) + assert result.job_file == "jobs/x.json" + + +class TestParseGenerateOutput: + def test_success_with_move(self, generate_success_stdout): + result = parse_generate_output(generate_success_stdout) + assert result.success is True + assert result.job_moved_to == "jobs/done/test-project.json" + assert "Job file moved to" in result.raw_output + + def test_no_move_line(self): + stdout = "Generating backlinks...\nSome error occurred.\n" + result = parse_generate_output(stdout) + assert result.success is False + assert result.job_moved_to == "" + assert result.raw_output == stdout + + def test_empty_stdout(self): + result = parse_generate_output("") + assert result.success is False + assert result.job_moved_to == "" + + +class TestRunBlmCommand: + def test_passes_cwd_and_interpreter(self, blm_config: BLMConfig): + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora", "-f", "x.xlsx"], blm_config) + call = mock_run.call_args + cmd = call[0][0] + assert cmd[0] == "python" + assert cmd[1] == "main.py" + assert "ingest-cora" in cmd + assert call[1]["cwd"] == blm_config.blm_dir + + def test_injects_credentials(self, blm_config: BLMConfig): + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora"], blm_config) + cmd = mock_run.call_args[0][0] + assert "-u" in cmd + assert cmd[cmd.index("-u") + 1] == "testuser" + assert "-p" in cmd + assert cmd[cmd.index("-p") + 1] == "testpass" + + def test_does_not_duplicate_user_flag(self, blm_config: BLMConfig): + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with 
patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora", "-u", "other"], blm_config) + cmd = mock_run.call_args[0][0] + # -u should appear once, with the caller's value preserved + assert cmd.count("-u") == 1 + assert cmd[cmd.index("-u") + 1] == "other" + + def test_does_not_duplicate_password_flag(self, blm_config: BLMConfig): + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora", "-p", "otherpw"], blm_config) + cmd = mock_run.call_args[0][0] + assert cmd.count("-p") == 1 + assert cmd[cmd.index("-p") + 1] == "otherpw" + + def test_skips_credentials_when_not_configured(self, tmp_path: Path): + blm_dir = tmp_path / "blm" + blm_dir.mkdir() + config = BLMConfig(blm_dir=str(blm_dir)) # no user/pass + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora"], config) + cmd = mock_run.call_args[0][0] + assert "-u" not in cmd + assert "-p" not in cmd + + def test_raises_on_missing_blm_dir(self, tmp_path: Path): + config = BLMConfig(blm_dir=str(tmp_path / "nope")) + with pytest.raises(FileNotFoundError): + run_blm_command(["ingest-cora"], config) + + def test_passes_timeout(self, blm_config: BLMConfig): + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora"], blm_config) + assert mock_run.call_args[1]["timeout"] == blm_config.timeout_seconds + + def test_propagates_timeout_expired(self, blm_config: BLMConfig): + with patch( + "subprocess.run", + side_effect=subprocess.TimeoutExpired(cmd="python", timeout=300), + ), pytest.raises(subprocess.TimeoutExpired): + run_blm_command(["ingest-cora"], blm_config) + + def test_custom_python_exe(self, tmp_path: Path): + blm_dir = tmp_path / "blm" + blm_dir.mkdir() + config = 
BLMConfig(blm_dir=str(blm_dir), python_exe="/opt/venv/bin/python") + mock_result = MagicMock(returncode=0, stdout="", stderr="") + with patch("subprocess.run", return_value=mock_result) as mock_run: + run_blm_command(["ingest-cora"], config) + assert mock_run.call_args[0][0][0] == "/opt/venv/bin/python" diff --git a/tests/test_inbox.py b/tests/test_inbox.py new file mode 100644 index 0000000..3e72f17 --- /dev/null +++ b/tests/test_inbox.py @@ -0,0 +1,170 @@ +"""Tests for inbox folder scanning and keyword-based file lookup.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from link_building_workflow.inbox import ( + find_all_xlsx_for_keyword, + find_xlsx_for_keyword, + list_inbox_xlsx, +) + + +@pytest.fixture() +def inbox(tmp_path: Path) -> Path: + """An empty inbox folder.""" + d = tmp_path / "cora-inbox" + d.mkdir() + return d + + +def _touch(folder: Path, name: str) -> Path: + p = folder / name + p.write_bytes(b"fake xlsx") + return p + + +class TestListInboxXlsx: + def test_missing_folder_returns_empty(self, tmp_path: Path): + assert list_inbox_xlsx(tmp_path / "does-not-exist") == [] + + def test_empty_folder(self, inbox: Path): + assert list_inbox_xlsx(inbox) == [] + + def test_lists_xlsx_only(self, inbox: Path): + _touch(inbox, "a.xlsx") + _touch(inbox, "readme.txt") + _touch(inbox, "b.xlsx") + result = list_inbox_xlsx(inbox) + names = [p.name for p in result] + assert names == ["a.xlsx", "b.xlsx"] + + def test_skips_office_lock_files(self, inbox: Path): + _touch(inbox, "real.xlsx") + _touch(inbox, "~$real.xlsx") + result = list_inbox_xlsx(inbox) + assert [p.name for p in result] == ["real.xlsx"] + + def test_skips_processed_by_default(self, inbox: Path): + _touch(inbox, "new.xlsx") + processed = inbox / "processed" + processed.mkdir() + _touch(processed, "old.xlsx") + # Also duplicate the name in root to prove it gets filtered + _touch(inbox, "old.xlsx") + + result = list_inbox_xlsx(inbox) + assert [p.name for p in 
result] == ["new.xlsx"] + + def test_skip_processed_disabled(self, inbox: Path): + _touch(inbox, "new.xlsx") + processed = inbox / "processed" + processed.mkdir() + _touch(inbox, "old.xlsx") # same name as one we "processed" + + result = list_inbox_xlsx(inbox, skip_processed=False) + assert sorted(p.name for p in result) == ["new.xlsx", "old.xlsx"] + + def test_sorted_output(self, inbox: Path): + _touch(inbox, "c.xlsx") + _touch(inbox, "a.xlsx") + _touch(inbox, "b.xlsx") + result = list_inbox_xlsx(inbox) + assert [p.name for p in result] == ["a.xlsx", "b.xlsx", "c.xlsx"] + + +class TestFindXlsxForKeyword: + def test_exact_match(self, inbox: Path, llm_never): + _touch(inbox, "precision-cnc-machining.xlsx") + match = find_xlsx_for_keyword(inbox, "precision cnc machining", llm_never) + assert match is not None + assert match.filename == "precision-cnc-machining.xlsx" + assert match.stem_keyword == "precision cnc machining" + + def test_no_match(self, inbox: Path, llm_never): + _touch(inbox, "other-keyword.xlsx") + match = find_xlsx_for_keyword(inbox, "cnc machining", llm_never) + assert match is None + + def test_missing_folder(self, tmp_path: Path, llm_never): + match = find_xlsx_for_keyword( + tmp_path / "no-such-dir", "cnc machining", llm_never + ) + assert match is None + + def test_empty_keyword(self, inbox: Path, llm_never): + _touch(inbox, "anything.xlsx") + match = find_xlsx_for_keyword(inbox, "", llm_never) + assert match is None + + def test_keyword_with_hyphens(self, inbox: Path, llm_never): + # Caller may pass the keyword in hyphenated form; should still match + _touch(inbox, "precision-cnc-machining.xlsx") + match = find_xlsx_for_keyword( + inbox, "precision-cnc-machining", llm_never + ) + assert match is not None + + def test_keyword_case_insensitive(self, inbox: Path, llm_never): + _touch(inbox, "cnc-machining.xlsx") + match = find_xlsx_for_keyword(inbox, "CNC Machining", llm_never) + assert match is not None + + def test_plural_match_via_llm(self, 
inbox: Path): + _touch(inbox, "cnc-shafts.xlsx") + + def only_plural_of_shaft(a: str, b: str) -> bool: + return {a, b} == {"cnc shaft", "cnc shafts"} + + # Singular keyword should match the plural filename via LLM + match = find_xlsx_for_keyword(inbox, "cnc shaft", only_plural_of_shaft) + assert match is not None + assert match.filename == "cnc-shafts.xlsx" + + def test_first_match_returned(self, inbox: Path, llm_never): + # Two xlsx files both match; sorted order picks "a..." first + _touch(inbox, "b-cnc-machining.xlsx") + _touch(inbox, "a-cnc-machining.xlsx") + # These don't fuzzy match the keyword "cnc machining" because of + # the a-/b- prefix. So use a real collision: + _touch(inbox, "cnc-machining.xlsx") + match = find_xlsx_for_keyword(inbox, "cnc machining", llm_never) + assert match is not None + assert match.filename == "cnc-machining.xlsx" + + def test_processed_files_ignored(self, inbox: Path, llm_never): + processed = inbox / "processed" + processed.mkdir() + _touch(processed, "cnc-machining.xlsx") + _touch(inbox, "cnc-machining.xlsx") + # Inbox file with same name as processed one is also skipped by + # list_inbox_xlsx, so no match available + match = find_xlsx_for_keyword(inbox, "cnc machining", llm_never) + assert match is None + + +class TestFindAllXlsxForKeyword: + def test_returns_all_matches(self, inbox: Path): + _touch(inbox, "cnc-shaft.xlsx") + _touch(inbox, "cnc-shafts.xlsx") + _touch(inbox, "unrelated.xlsx") + + def plural_ok(a: str, b: str) -> bool: + return {a, b} == {"cnc shaft", "cnc shafts"} + + results = find_all_xlsx_for_keyword(inbox, "cnc shaft", plural_ok) + names = sorted(r.filename for r in results) + assert names == ["cnc-shaft.xlsx", "cnc-shafts.xlsx"] + + def test_empty_when_no_matches(self, inbox: Path, llm_never): + _touch(inbox, "unrelated.xlsx") + results = find_all_xlsx_for_keyword(inbox, "cnc shaft", llm_never) + assert results == [] + + def test_empty_keyword_returns_empty(self, inbox: Path, llm_never): + _touch(inbox, 
"anything.xlsx") + results = find_all_xlsx_for_keyword(inbox, "", llm_never) + assert results == [] diff --git a/tests/test_matching.py b/tests/test_matching.py new file mode 100644 index 0000000..760236a --- /dev/null +++ b/tests/test_matching.py @@ -0,0 +1,97 @@ +"""Tests for keyword normalization and fuzzy matching.""" + +from __future__ import annotations + +from link_building_workflow.matching import ( + filename_stem_to_keyword, + fuzzy_keyword_match, + normalize_for_match, +) + + +class TestNormalizeForMatch: + def test_lowercases(self): + assert normalize_for_match("Hello World") == "hello world" + + def test_strips_punctuation(self): + assert normalize_for_match("hello, world!") == "hello world" + + def test_collapses_whitespace(self): + assert normalize_for_match("hello world\n\ttest") == "hello world test" + + def test_empty_string(self): + assert normalize_for_match("") == "" + + def test_only_punctuation(self): + assert normalize_for_match("!!!...,,,") == "" + + def test_numbers_preserved(self): + assert normalize_for_match("5-axis cnc") == "5 axis cnc" + + def test_leading_trailing_whitespace(self): + assert normalize_for_match(" hello world ") == "hello world" + + +class TestFuzzyKeywordMatch: + def test_exact_match(self, llm_never): + assert fuzzy_keyword_match("cnc machining", "cnc machining", llm_never) is True + + def test_different_no_llm(self): + assert fuzzy_keyword_match("cnc", "cnc machining") is False + + def test_different_llm_says_no(self, llm_never): + assert fuzzy_keyword_match("cnc", "milling", llm_never) is False + + def test_different_llm_says_yes(self, llm_always): + # LLM callable gets to decide when exact match fails + assert fuzzy_keyword_match("shaft", "shafts", llm_always) is True + + def test_empty_a(self, llm_always): + assert fuzzy_keyword_match("", "cnc", llm_always) is False + + def test_empty_b(self, llm_always): + assert fuzzy_keyword_match("cnc", "", llm_always) is False + + def test_both_empty(self, llm_always): + # 
Even with llm_always, empty inputs short-circuit to False + assert fuzzy_keyword_match("", "", llm_always) is False + + def test_no_llm_check_fast_path_hit(self): + # When no llm_check provided, exact matches still work + assert fuzzy_keyword_match("same", "same") is True + + def test_no_llm_check_fast_path_miss(self): + # When no llm_check and not exact, returns False + assert fuzzy_keyword_match("same", "different") is False + + def test_llm_check_only_called_when_needed(self): + calls = [] + + def tracking_llm(a, b): + calls.append((a, b)) + return True + + # Exact match: LLM should not be called + fuzzy_keyword_match("cnc", "cnc", tracking_llm) + assert calls == [] + + # Different: LLM should be called once + fuzzy_keyword_match("shaft", "shafts", tracking_llm) + assert calls == [("shaft", "shafts")] + + +class TestFilenameStemToKeyword: + def test_hyphens_to_spaces(self): + assert filename_stem_to_keyword("precision-cnc-machining") == "precision cnc machining" + + def test_underscores_to_spaces(self): + assert filename_stem_to_keyword("precision_cnc_machining") == "precision cnc machining" + + def test_mixed_separators(self): + assert filename_stem_to_keyword("precision-cnc_machining") == "precision cnc machining" + + def test_uppercase(self): + assert filename_stem_to_keyword("CNC-Machining") == "cnc machining" + + def test_empty(self): + assert filename_stem_to_keyword("") == "" diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..e0f5319 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,460 @@ +"""Tests for the three pipeline entry points. + +BLM subprocess calls are mocked via `link_building_workflow.blm.run_blm_command`. +The pipeline module imports blm as `blm_mod` and calls `blm_mod.run_blm_command(...)`, +so we patch there. 
+""" + +from __future__ import annotations + +import subprocess +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from link_building_workflow import ( + BLMConfig, + Deps, + blm_generate_batch, + blm_ingest_cora, + run_cora_backlinks, +) + + +@pytest.fixture() +def xlsx_file(tmp_path: Path) -> Path: + p = tmp_path / "precision-cnc-machining.xlsx" + p.write_bytes(b"fake xlsx") + return p + + +def _mock_proc(stdout: str = "", stderr: str = "", returncode: int = 0) -> MagicMock: + m = MagicMock() + m.stdout = stdout + m.stderr = stderr + m.returncode = returncode + return m + + +# --------------------------------------------------------------------------- +# blm_ingest_cora +# --------------------------------------------------------------------------- + + +class TestBlmIngestCora: + def test_missing_xlsx_path(self, deps: Deps): + result = blm_ingest_cora(xlsx_path="", project_name="P", deps=deps) + assert result.ok is False + assert "xlsx_path is required" in result.error + + def test_missing_project_name(self, deps: Deps, xlsx_file: Path): + result = blm_ingest_cora( + xlsx_path=str(xlsx_file), project_name="", deps=deps + ) + assert result.ok is False + assert "project_name is required" in result.error + + def test_xlsx_not_found(self, deps: Deps): + result = blm_ingest_cora( + xlsx_path="/nope/missing.xlsx", project_name="P", deps=deps + ) + assert result.ok is False + assert "not found" in result.error + + def test_success( + self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str + ): + proc = _mock_proc(stdout=ingest_success_stdout) + with patch( + "link_building_workflow.blm.run_blm_command", return_value=proc + ): + result = blm_ingest_cora( + xlsx_path=str(xlsx_file), + project_name="Test Project", + deps=deps, + ) + assert result.ok is True + assert result.step == "ingest" + assert result.ingest is not None + assert result.ingest.project_id == "42" + assert result.ingest.job_file == "jobs/test-project.json" + 
assert result.job_file == "jobs/test-project.json" + assert result.project_name == "Test Project" + assert "CORA ingest complete" in result.summary + + def test_nonzero_exit_reports_failure( + self, deps: Deps, xlsx_file: Path + ): + proc = _mock_proc(stdout="", stderr="boom", returncode=1) + with patch( + "link_building_workflow.blm.run_blm_command", return_value=proc + ): + result = blm_ingest_cora( + xlsx_path=str(xlsx_file), + project_name="P", + deps=deps, + ) + assert result.ok is False + assert "exit=1" in result.error + assert "boom" in result.error + + def test_timeout(self, deps: Deps, xlsx_file: Path): + with patch( + "link_building_workflow.blm.run_blm_command", + side_effect=subprocess.TimeoutExpired(cmd="python", timeout=300), + ): + result = blm_ingest_cora( + xlsx_path=str(xlsx_file), + project_name="P", + deps=deps, + ) + assert result.ok is False + assert "timed out" in result.error + + def test_uses_config_default_branded_plus_ratio( + self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str + ): + # Caller passes None, so Deps default (0.7) should be used + proc = _mock_proc(stdout=ingest_success_stdout) + with patch( + "link_building_workflow.blm.run_blm_command", return_value=proc + ) as mock_run: + blm_ingest_cora( + xlsx_path=str(xlsx_file), + project_name="P", + deps=deps, + branded_plus_ratio=None, + ) + args = mock_run.call_args[0][0] + # 0.7 is the default, so -bp should NOT appear in args + assert "-bp" not in args + + def test_caller_override_branded_plus_ratio( + self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str + ): + proc = _mock_proc(stdout=ingest_success_stdout) + with patch( + "link_building_workflow.blm.run_blm_command", return_value=proc + ) as mock_run: + blm_ingest_cora( + xlsx_path=str(xlsx_file), + project_name="P", + deps=deps, + branded_plus_ratio=0.85, + ) + args = mock_run.call_args[0][0] + assert "-bp" in args + assert args[args.index("-bp") + 1] == "0.85" + + +# 
# ---------------------------------------------------------------------------
# blm_generate_batch
# ---------------------------------------------------------------------------

# Dotted path of the BLM subprocess entry point patched by every test below.
_RUN_BLM_TARGET = "link_building_workflow.blm.run_blm_command"

# Relative job file the pipeline tests place on disk: the ingest stdout must
# reference a job file that exists for blm_generate_batch's existence check.
_PIPELINE_JOB_REL = "jobs/test-project.json"


def _write_empty_job(blm_dir, rel_path: str) -> None:
    """Create ``<blm_dir>/jobs`` and write an empty JSON object to ``rel_path``.

    ``rel_path`` is interpreted relative to ``blm_dir``. The directory is
    created without ``exist_ok``, so calling this twice for the same
    ``blm_dir`` raises ``FileExistsError``.
    """
    root = Path(blm_dir)
    (root / "jobs").mkdir()
    (root / rel_path).write_text("{}")


def _tmp_job(tmp_path: Path) -> str:
    """Write an empty ``job.json`` under ``tmp_path`` and return its path."""
    job_path = tmp_path / "job.json"
    job_path.write_text("{}")
    return str(job_path)


class TestBlmGenerateBatch:
    """Tests for ``blm_generate_batch`` with the BLM subprocess call mocked."""

    def test_missing_job_file_arg(self, deps: Deps):
        """An empty ``job_file`` yields a failed result naming the argument."""
        outcome = blm_generate_batch(job_file="", deps=deps)

        assert outcome.ok is False
        assert "job_file is required" in outcome.error

    def test_job_file_does_not_exist(self, deps: Deps):
        """A path that is not on disk yields a "not found" failure."""
        missing = "/definitely/not/here.json"
        outcome = blm_generate_batch(job_file=missing, deps=deps)

        assert outcome.ok is False
        assert "not found" in outcome.error

    def test_relative_path_resolved_against_blm_dir(
        self, deps: Deps, generate_success_stdout: str
    ):
        """A relative ``job_file`` is handed to BLM as an absolute path."""
        # Create a relative job file under the fake BLM dir
        relative_job = "jobs/x.json"
        _write_empty_job(deps.blm.blm_dir, relative_job)

        fake_proc = _mock_proc(stdout=generate_success_stdout)
        with patch(_RUN_BLM_TARGET, return_value=fake_proc) as run_mock:
            outcome = blm_generate_batch(job_file=relative_job, deps=deps)
        assert outcome.ok is True

        # The resolved absolute path should have been passed to BLM
        cli_args = run_mock.call_args[0][0]
        job_arg = cli_args[cli_args.index("-j") + 1]
        assert job_arg.endswith("x.json")
        assert Path(job_arg).is_absolute()

    def test_continue_on_error_flag_default(
        self, deps: Deps, tmp_path: Path, generate_success_stdout: str
    ):
        """``--continue-on-error`` is passed when the caller does not opt out."""
        job_path = _tmp_job(tmp_path)
        fake_proc = _mock_proc(stdout=generate_success_stdout)

        with patch(_RUN_BLM_TARGET, return_value=fake_proc) as run_mock:
            blm_generate_batch(job_file=job_path, deps=deps)

        cli_args = run_mock.call_args[0][0]
        assert "--continue-on-error" in cli_args

    def test_continue_on_error_disabled(
        self, deps: Deps, tmp_path: Path, generate_success_stdout: str
    ):
        """``continue_on_error=False`` drops the flag from the BLM arguments."""
        job_path = _tmp_job(tmp_path)
        fake_proc = _mock_proc(stdout=generate_success_stdout)

        with patch(_RUN_BLM_TARGET, return_value=fake_proc) as run_mock:
            blm_generate_batch(
                job_file=job_path, deps=deps, continue_on_error=False
            )

        cli_args = run_mock.call_args[0][0]
        assert "--continue-on-error" not in cli_args

    def test_debug_flag(
        self, deps: Deps, tmp_path: Path, generate_success_stdout: str
    ):
        """``debug=True`` adds ``--debug`` to the BLM arguments."""
        job_path = _tmp_job(tmp_path)
        fake_proc = _mock_proc(stdout=generate_success_stdout)

        with patch(_RUN_BLM_TARGET, return_value=fake_proc) as run_mock:
            blm_generate_batch(job_file=job_path, deps=deps, debug=True)

        assert "--debug" in run_mock.call_args[0][0]

    def test_nonzero_exit(self, deps: Deps, tmp_path: Path):
        """A non-zero exit code is reported as ``exit=<code>`` in the error."""
        job_path = _tmp_job(tmp_path)
        failed_proc = _mock_proc(stdout="", stderr="fail", returncode=2)

        with patch(_RUN_BLM_TARGET, return_value=failed_proc):
            outcome = blm_generate_batch(job_file=job_path, deps=deps)

        assert outcome.ok is False
        assert "exit=2" in outcome.error

    def test_timeout(self, deps: Deps, tmp_path: Path):
        """A ``TimeoutExpired`` from the runner becomes a "timed out" failure."""
        job_path = _tmp_job(tmp_path)
        timeout_exc = subprocess.TimeoutExpired(cmd="python", timeout=300)

        with patch(_RUN_BLM_TARGET, side_effect=timeout_exc):
            outcome = blm_generate_batch(job_file=job_path, deps=deps)

        assert outcome.ok is False
        assert "timed out" in outcome.error


# ---------------------------------------------------------------------------
# run_cora_backlinks (full pipeline)
# ---------------------------------------------------------------------------


class TestRunCoraBacklinks:
    """Tests for ``run_cora_backlinks`` (ingest then generate), BLM mocked."""

    def test_missing_money_site_url(self, deps: Deps, xlsx_file: Path):
        """An empty ``money_site_url`` fails with an error mentioning IMSURL."""
        outcome = run_cora_backlinks(
            xlsx_path=str(xlsx_file),
            project_name="P",
            money_site_url="",
            deps=deps,
        )

        assert outcome.ok is False
        assert "IMSURL" in outcome.error

    def test_full_success(
        self,
        deps: Deps,
        xlsx_file: Path,
        ingest_success_stdout: str,
        generate_success_stdout: str,
    ):
        """Both steps succeed and their parsed results land on the result."""
        # ingest stdout must reference a job file that then exists on disk
        # for blm_generate_batch's existence check to pass.
        _write_empty_job(deps.blm.blm_dir, _PIPELINE_JOB_REL)

        step_procs = [
            _mock_proc(stdout=ingest_success_stdout),
            _mock_proc(stdout=generate_success_stdout),
        ]
        with patch(_RUN_BLM_TARGET, side_effect=step_procs) as run_mock:
            outcome = run_cora_backlinks(
                xlsx_path=str(xlsx_file),
                project_name="Test Project",
                money_site_url="https://example.com",
                deps=deps,
            )

        assert outcome.ok is True
        assert outcome.step == "complete"
        assert outcome.ingest is not None
        assert outcome.generate is not None
        assert outcome.ingest.project_id == "42"
        assert outcome.generate.job_moved_to == "jobs/done/test-project.json"
        assert outcome.job_file == "jobs/done/test-project.json"
        assert "Step 1" in outcome.summary
        assert "Step 2" in outcome.summary

        # BLM was invoked twice (ingest, generate)
        assert run_mock.call_count == 2
        first_call, second_call = run_mock.call_args_list[:2]
        assert "ingest-cora" in first_call[0][0]
        assert "generate-batch" in second_call[0][0]

    def test_ingest_failure_skips_generate(
        self, deps: Deps, xlsx_file: Path
    ):
        """A failed ingest stops the pipeline after a single BLM call."""
        step_procs = [_mock_proc(stdout="", stderr="fail", returncode=1)]

        with patch(_RUN_BLM_TARGET, side_effect=step_procs) as run_mock:
            outcome = run_cora_backlinks(
                xlsx_path=str(xlsx_file),
                project_name="P",
                money_site_url="https://example.com",
                deps=deps,
            )

        assert outcome.ok is False
        assert outcome.step == "ingest"
        assert run_mock.call_count == 1  # generate not called

    def test_generate_failure_preserves_ingest(
        self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str
    ):
        """A failed generate step still exposes the successful ingest data."""
        _write_empty_job(deps.blm.blm_dir, _PIPELINE_JOB_REL)

        step_procs = [
            _mock_proc(stdout=ingest_success_stdout),
            _mock_proc(stdout="", stderr="gen fail", returncode=3),
        ]
        with patch(_RUN_BLM_TARGET, side_effect=step_procs):
            outcome = run_cora_backlinks(
                xlsx_path=str(xlsx_file),
                project_name="Test Project",
                money_site_url="https://example.com",
                deps=deps,
            )

        assert outcome.ok is False
        assert outcome.step == "generate"
        # Ingest succeeded; its data is still on the result
        assert outcome.ingest is not None
        assert outcome.ingest.project_id == "42"
        assert "gen fail" in outcome.error

    def test_on_progress_callback_invoked(
        self,
        deps: Deps,
        xlsx_file: Path,
        ingest_success_stdout: str,
        generate_success_stdout: str,
    ):
        """``on_progress`` receives a message for each step."""
        _write_empty_job(deps.blm.blm_dir, _PIPELINE_JOB_REL)

        seen_messages: list[str] = []
        step_procs = [
            _mock_proc(stdout=ingest_success_stdout),
            _mock_proc(stdout=generate_success_stdout),
        ]
        with patch(_RUN_BLM_TARGET, side_effect=step_procs):
            outcome = run_cora_backlinks(
                xlsx_path=str(xlsx_file),
                project_name="Test Project",
                money_site_url="https://example.com",
                deps=deps,
                on_progress=seen_messages.append,
            )

        assert outcome.ok is True
        assert len(seen_messages) >= 2
        assert any("Step 1" in message for message in seen_messages)
        assert any("Step 2" in message for message in seen_messages)
        # log_lines mirrors progress_calls
        assert outcome.log_lines == seen_messages

    def test_on_progress_exception_does_not_break_pipeline(
        self,
        deps: Deps,
        xlsx_file: Path,
        ingest_success_stdout: str,
        generate_success_stdout: str,
    ):
        """A callback that raises does not stop the pipeline from succeeding."""
        _write_empty_job(deps.blm.blm_dir, _PIPELINE_JOB_REL)

        def raising_callback(_msg: str) -> None:
            raise RuntimeError("progress callback failed")

        step_procs = [
            _mock_proc(stdout=ingest_success_stdout),
            _mock_proc(stdout=generate_success_stdout),
        ]
        with patch(_RUN_BLM_TARGET, side_effect=step_procs):
            outcome = run_cora_backlinks(
                xlsx_path=str(xlsx_file),
                project_name="Test Project",
                money_site_url="https://example.com",
                deps=deps,
                on_progress=raising_callback,
            )

        # Pipeline still completed successfully despite broken callback
        assert outcome.ok is True

    def test_uses_config_default_ratio_when_none(
        self,
        deps: Deps,
        xlsx_file: Path,
        ingest_success_stdout: str,
        generate_success_stdout: str,
    ):
        """``branded_plus_ratio=None`` falls back to the ``BLMConfig`` default."""
        # Verify the Deps-level default flows into build_ingest_args
        base_cfg = deps.blm
        custom_cfg = BLMConfig(
            blm_dir=base_cfg.blm_dir,
            username=base_cfg.username,
            password=base_cfg.password,
            timeout_seconds=base_cfg.timeout_seconds,
            default_branded_plus_ratio=0.9,  # non-default
            python_exe=base_cfg.python_exe,
        )
        custom_deps = Deps(blm=custom_cfg, llm_check=deps.llm_check)

        _write_empty_job(custom_cfg.blm_dir, _PIPELINE_JOB_REL)

        step_procs = [
            _mock_proc(stdout=ingest_success_stdout),
            _mock_proc(stdout=generate_success_stdout),
        ]
        with patch(_RUN_BLM_TARGET, side_effect=step_procs) as run_mock:
            run_cora_backlinks(
                xlsx_path=str(xlsx_file),
                project_name="Test Project",
                money_site_url="https://example.com",
                deps=custom_deps,
                branded_plus_ratio=None,  # should pick up 0.9 default
            )

        ingest_cli = run_mock.call_args_list[0][0][0]
        assert "-bp" in ingest_cli
        assert ingest_cli[ingest_cli.index("-bp") + 1] == "0.9"