From 381d51e00188a3447dea4c551216a4ebb03cae2f Mon Sep 17 00:00:00 2001
From: bryanb <bryanb@peninsulaindustries.com>
Date: Wed, 22 Apr 2026 12:11:16 +0000
Subject: [PATCH] Initial commit: link building workflow extracted from
 CheddahBot

Standalone package wrapping Big-Link-Man (BLM) for Paperclip. Extracted
from cheddahbot/tools/linkbuilding.py and related modules, with
task-system coupling, folder watching, and AutoCora queue logic
stripped out.

Public API:
- Deps, BLMConfig, LLMCheck (injection types)
- normalize_for_match, fuzzy_keyword_match, filename_stem_to_keyword
- list_inbox_xlsx, find_xlsx_for_keyword, find_all_xlsx_for_keyword
- blm_ingest_cora, blm_generate_batch, run_cora_backlinks (pipelines)
- PipelineResult, IngestResult, GenerateResult (return types)

89 tests, 96% coverage.
---
 .gitignore                             |  14 +
 README.md                              | 211 ++++++++++++
 pyproject.toml                         |  53 +++
 src/link_building_workflow/__init__.py | 128 +++++++
 src/link_building_workflow/blm.py      | 146 ++++++++
 src/link_building_workflow/deps.py     |  63 ++++
 src/link_building_workflow/inbox.py    | 116 +++++++
 src/link_building_workflow/matching.py |  59 ++++
 src/link_building_workflow/pipeline.py | 318 +++++++++++++++++
 tests/conftest.py                      |  80 +++++
 tests/test_blm.py                      | 220 ++++++++++++
 tests/test_inbox.py                    | 170 +++++++++
 tests/test_matching.py                 |  97 ++++++
 tests/test_pipeline.py                 | 460 +++++++++++++++++++++++++
 14 files changed, 2135 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 pyproject.toml
 create mode 100644 src/link_building_workflow/__init__.py
 create mode 100644 src/link_building_workflow/blm.py
 create mode 100644 src/link_building_workflow/deps.py
 create mode 100644 src/link_building_workflow/inbox.py
 create mode 100644 src/link_building_workflow/matching.py
 create mode 100644 src/link_building_workflow/pipeline.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_blm.py
 create mode 100644 tests/test_inbox.py
 create mode 100644 tests/test_matching.py
 create mode 100644 tests/test_pipeline.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c572a56
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+__pycache__/
+*.py[cod]
+*.egg-info/
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+.ruff_cache/
+.venv/
+dist/
+build/
+*.bak
+uv.lock
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d4beaa4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,211 @@
+# Linkman-Paperclip-Wrap
+
+A standalone Python package wrapping the Big-Link-Man (BLM) CLI for use by
+Paperclip agents. Extracted from CheddahBot (`cheddahbot/tools/linkbuilding.py`)
+and simplified for consumption by external callers.
+
+## What it does
+
+Given a task keyword, the package can:
+
+1. **Find a matching CORA `.xlsx`** in an inbox folder (e.g. `Cora-For-Humans/`)
+   using fuzzy keyword matching with singular/plural awareness.
+2. **Invoke Big-Link-Man** to run `ingest-cora` and `generate-batch` on that
+   xlsx, producing the backlink content.
+3. **Return a structured result** the caller can use to update task state.
+
+No folder watching, no task-system coupling, no notifications. The caller owns
+task state and polling cadence; this package is pure work.
+
+## Package layout
+
+```
+src/link_building_workflow/
+  deps.py       -- Deps, BLMConfig, LLMCheck types
+  matching.py   -- Keyword normalization and fuzzy matching
+  inbox.py      -- Inbox folder scanning (list / find-by-keyword)
+  blm.py        -- BLM subprocess wrapper and stdout parsers
+  pipeline.py   -- run_cora_backlinks, blm_ingest_cora, blm_generate_batch
+  __init__.py   -- Public API re-exports
+```
+
+## Installation
+
+```
+uv add git+https://git.peninsulaindustries.com/bryanb/Linkman-Paperclip-Wrap.git
+```
+
+Big-Link-Man itself is a separate dependency the caller provides. Install it
+on the same host and point `BLMConfig.blm_dir` at the checkout.
+
+## Public API
+
+All imports available from the top level:
+
+```python
+from link_building_workflow import (
+    # Dependency types
+    Deps, BLMConfig, LLMCheck,
+    # Matching primitives
+    normalize_for_match, fuzzy_keyword_match, filename_stem_to_keyword,
+    # Inbox scanning
+    InboxMatch, list_inbox_xlsx, find_xlsx_for_keyword, find_all_xlsx_for_keyword,
+    # Pipeline entry points
+    PipelineResult, run_cora_backlinks, blm_ingest_cora, blm_generate_batch,
+    # Low-level BLM (if you need to run a custom BLM command)
+    IngestResult, GenerateResult, build_ingest_args,
+    parse_ingest_output, parse_generate_output, run_blm_command,
+)
+```
+
+## Typical usage (Paperclip)
+
+The caller decides when a task is eligible to run (all required task fields
+filled in, xlsx present in the inbox). This package provides the primitives
+to check the xlsx gate and to execute the work.
+
+```python
+from link_building_workflow import (
+    Deps, BLMConfig, find_xlsx_for_keyword, run_cora_backlinks,
+)
+
+deps = Deps(
+    blm=BLMConfig(
+        blm_dir="/opt/big-link-man",
+        username="your-blm-user",
+        password="your-blm-pass",
+        timeout_seconds=1800,
+    ),
+    llm_check=your_plural_checker,  # LLMCheck, i.e. Callable[[str, str], bool]
+)
+
+def try_run_link_building(task):
+    # Caller gates 1-4: task-field checks (LB Method, Keyword, IMSURL, ...)
+    if not (task.keyword and task.imsurl):
+        return "blocked: missing task fields"
+
+    # Gate 5: does a matching xlsx exist yet?
+    match = find_xlsx_for_keyword(
+        "/data/Cora-For-Humans",
+        task.keyword,
+        deps.llm_check,
+    )
+    if match is None:
+        return "blocked: no xlsx in Cora-For-Humans"
+
+    # Execute
+    result = run_cora_backlinks(
+        xlsx_path=str(match.path),
+        project_name=task.keyword,
+        money_site_url=task.imsurl,
+        custom_anchors=task.custom_anchors or "",
+        cli_flags=task.cli_flags or "",
+        branded_plus_ratio=task.branded_plus_ratio,  # None -> BLMConfig default
+        deps=deps,
+    )
+
+    if result.ok:
+        # result.summary is a multi-line human-readable string
+        # result.ingest.project_id, result.generate.job_moved_to, etc.
+        return f"done: {result.summary}"
+    else:
+        # result.step tells you where it stopped: "ingest" or "generate"
+        # result.error has the details
+        return f"failed at {result.step}: {result.error}"
+```
+
+## The `LLMCheck` callable
+
+Used when the fast-path string equality fails during fuzzy matching. Should
+return `True` iff two keywords are the same modulo plural form ("shaft" vs
+"shafts", "company" vs "companies"). Return `False` for any other kind of
+difference. Implementations should cache -- the workflow may call this
+repeatedly with the same pair while scanning an inbox.
+
+Example implementation (the one CheddahBot uses):
+
+```python
+import httpx
+
+_cache = {}
+
+def openrouter_plural_check(a: str, b: str) -> bool:
+    key = (a, b) if a <= b else (b, a)
+    if key in _cache:
+        return _cache[key]
+    resp = httpx.post(
+        "https://openrouter.ai/api/v1/chat/completions",
+        headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
+        json={
+            "model": "anthropic/claude-haiku-4.5",
+            "max_tokens": 5,
+            "messages": [
+                {"role": "system", "content":
+                 "Reply with only 'YES' or 'NO'. YES iff the two keywords "
+                 "are identical except for singular/plural form."},
+                {"role": "user", "content": f'A: "{a}"\nB: "{b}"'},
+            ],
+        },
+        timeout=15,
+    )
+    result = "YES" in resp.json()["choices"][0]["message"]["content"].upper()
+    _cache[key] = result
+    return result
+```
+
+Tests may pass `lambda a, b: False` for the fast-path-only case, or any
+deterministic fake.
+
+## The `PipelineResult` dataclass
+
+Every pipeline entry point returns the same shape:
+
+| field           | meaning                                                        |
+|-----------------|----------------------------------------------------------------|
+| `ok`            | True if the pipeline completed the phase it was asked to do    |
+| `step`          | "ingest" / "generate" / "complete" (on success) or where it failed |
+| `ingest`        | `IngestResult` if ingest ran, else None                        |
+| `generate`      | `GenerateResult` if generate ran, else None                    |
+| `error`         | Human-readable error message (empty on success)                |
+| `summary`       | Multi-line human-readable summary, safe to post as a comment   |
+| `project_name`  | The BLM project name                                           |
+| `job_file`      | Path to the final job file (post-move on success)              |
+| `log_lines`     | Progress messages captured during the run                      |
+
+## What this package does NOT do
+
+- Does not watch folders. No threads, no polling loops.
+- Does not know about ClickUp, Linear, or any task system. The caller owns
+  task state and decides what status transitions mean.
+- Does not sync with shared-folder job queues (the old AutoCora queue).
+- Does not manage the Cora tool itself. It only consumes xlsx files that
+  Cora has already produced.
+- Does not pick up where BLM leaves off. When BLM finishes `generate-batch`,
+  the job is done from this package's perspective.
+
+These were deliberate drops during extraction. CheddahBot had folder-watch
+threads, ClickUp auto-matching, AutoCora queue submission, and a multi-inbox
+distribution loop. Paperclip owns that scheduling logic in its own code.
+
+## Development
+
+Requires Python 3.11+ and [uv](https://docs.astral.sh/uv/).
+
+```
+uv sync                    # install dev + test deps
+uv run pytest              # run the test suite (89 tests, ~96% coverage)
+uv run ruff check .        # lint
+```
+
+## Provenance
+
+Extracted from the CheddahBot repo, specifically:
+
+- `cheddahbot/tools/linkbuilding.py` -- pipeline logic and fuzzy matching
+- `cheddahbot/tools/autocora.py` -- only the fuzzy-match helpers were kept;
+  the shared-folder job queue and result polling were dropped
+- `cheddahbot/scheduler.py` -- folder-watch loops were dropped; their
+  matching logic was converted to a synchronous `find_xlsx_for_keyword` call
+
+The BLM invocation parameters, stdout parsing regexes, and default ratios
+match CheddahBot's production behavior exactly.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2f8ec6e
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,53 @@
+[project]
+name = "link-building-workflow"
+version = "0.1.0"
+description = "CORA xlsx -> Big-Link-Man link building pipeline, extracted from CheddahBot for Paperclip"
+requires-python = ">=3.11"
+dependencies = []
+
+[build-system]
+requires = ["uv_build>=0.9,<1"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+module-root = "src"
+
+[dependency-groups]
+dev = [{ include-group = "lint" }, { include-group = "test" }]
+lint = ["ruff"]
+test = ["pytest", "pytest-cov"]
+
+[tool.uv]
+default-groups = ["dev", "test"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = [
+    "-ra",
+    "--strict-markers",
+    "--strict-config",
+    "--cov=link_building_workflow",
+    "--cov-report=term-missing",
+]
+
+[tool.coverage.run]
+branch = true
+source = ["link_building_workflow"]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "if TYPE_CHECKING:",
+    "if __name__ == .__main__.:",
+]
+show_missing = true
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP", "B", "SIM", "RUF"]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**/*.py" = ["S101", "PLR2004", "ANN"]
diff --git a/src/link_building_workflow/__init__.py b/src/link_building_workflow/__init__.py
new file mode 100644
index 0000000..eb2685a
--- /dev/null
+++ b/src/link_building_workflow/__init__.py
@@ -0,0 +1,128 @@
+"""Link building workflow: CORA xlsx -> Big-Link-Man pipeline.
+
+A standalone package extracted from CheddahBot for consumption by Paperclip
+(and anything else that wants to trigger link-building work). The caller
+owns task state; this package owns the work.
+
+Public API
+----------
+Matching primitives::
+
+    normalize_for_match(text) -> str
+    fuzzy_keyword_match(a, b, llm_check=None) -> bool
+    filename_stem_to_keyword(stem) -> str
+
+Inbox scanning::
+
+    list_inbox_xlsx(folder) -> list[Path]
+    find_xlsx_for_keyword(folder, keyword, llm_check=None) -> InboxMatch | None
+    find_all_xlsx_for_keyword(folder, keyword, llm_check=None) -> list[InboxMatch]
+
+Pipeline entry points::
+
+    run_cora_backlinks(xlsx_path, project_name, money_site_url, deps, ...) -> PipelineResult
+    blm_ingest_cora(xlsx_path, project_name, deps, ...) -> PipelineResult
+    blm_generate_batch(job_file, deps, ...) -> PipelineResult
+
+Dependency types::
+
+    Deps(blm, llm_check)
+    BLMConfig(blm_dir, username, password, timeout_seconds, ...)
+    LLMCheck = Callable[[str, str], bool]
+
+Typical Paperclip usage
+-----------------------
+
+    from link_building_workflow import (
+        Deps, BLMConfig,
+        find_xlsx_for_keyword,
+        run_cora_backlinks,
+    )
+
+    deps = Deps(
+        blm=BLMConfig(blm_dir="/opt/blm", username="...", password="..."),
+        llm_check=my_plural_checker,
+    )
+
+    # Gate: does a matching xlsx exist yet?
+    match = find_xlsx_for_keyword(
+        "/data/Cora-For-Humans", task.keyword, deps.llm_check
+    )
+    if match is None:
+        return "waiting for xlsx"
+
+    # All gates passed -- run the work
+    result = run_cora_backlinks(
+        xlsx_path=str(match.path),
+        project_name=task.keyword,
+        money_site_url=task.imsurl,
+        deps=deps,
+    )
+    if result.ok:
+        # success -- post result.summary as a comment, advance task state
+        ...
+    else:
+        # failure -- result.error has the reason, result.step is where it stopped
+        ...
+"""
+
+from __future__ import annotations
+
+from .blm import (
+    GenerateResult,
+    IngestResult,
+    build_ingest_args,
+    parse_generate_output,
+    parse_ingest_output,
+    run_blm_command,
+)
+from .deps import BLMConfig, Deps, LLMCheck
+from .inbox import (
+    InboxMatch,
+    find_all_xlsx_for_keyword,
+    find_xlsx_for_keyword,
+    list_inbox_xlsx,
+)
+from .matching import (
+    filename_stem_to_keyword,
+    fuzzy_keyword_match,
+    normalize_for_match,
+)
+from .pipeline import (
+    PipelineResult,
+    ProgressFn,
+    blm_generate_batch,
+    blm_ingest_cora,
+    run_cora_backlinks,
+)
+
+__all__ = [  # noqa: RUF022 -- grouped by module for readability
+    # deps
+    "Deps",
+    "BLMConfig",
+    "LLMCheck",
+    # matching
+    "normalize_for_match",
+    "fuzzy_keyword_match",
+    "filename_stem_to_keyword",
+    # inbox
+    "InboxMatch",
+    "list_inbox_xlsx",
+    "find_xlsx_for_keyword",
+    "find_all_xlsx_for_keyword",
+    # blm low-level (subprocess wrapper and stdout parsers)
+    "IngestResult",
+    "GenerateResult",
+    "build_ingest_args",
+    "parse_ingest_output",
+    "parse_generate_output",
+    "run_blm_command",
+    # pipeline
+    "PipelineResult",
+    "ProgressFn",
+    "run_cora_backlinks",
+    "blm_ingest_cora",
+    "blm_generate_batch",
+]
+
+__version__ = "0.1.0"  # duplicated in pyproject.toml ([project].version); keep in sync
diff --git a/src/link_building_workflow/blm.py b/src/link_building_workflow/blm.py
new file mode 100644
index 0000000..0bc8729
--- /dev/null
+++ b/src/link_building_workflow/blm.py
@@ -0,0 +1,146 @@
+"""Big-Link-Man CLI wrapper.
+
+BLM is an external Python tool. We invoke it via subprocess using whatever
+Python interpreter the caller configured in `BLMConfig.python_exe`. With
+BLM installed alongside the caller on the same host (the expected Paperclip
+setup), this is usually just "python".
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+from .deps import BLMConfig
+
+log = logging.getLogger(__name__)
+
+
+@dataclass
+class IngestResult:
+    """Fields parsed from `ingest-cora` stdout; each stays empty until its line is seen."""
+
+    project_id: str = ""  # digits from the "Success: Project ... (ID: N)" line
+    job_file: str = ""  # path from the "Job file created: ..." line
+    project_name: str = ""  # quoted name from the "Success: Project ..." line
+    main_keyword: str = ""  # value of the "Main Keyword: ..." line
+
+    @property
+    def success(self) -> bool:
+        return bool(self.project_id and self.job_file)  # both must be non-empty
+
+
+@dataclass
+class GenerateResult:
+    """Parsed output of a `generate-batch` run."""
+
+    job_moved_to: str = ""  # path from the last "Job file moved to: ..." line
+    success: bool = False  # True only when a "Job file moved to: ..." line was found
+    raw_output: str = ""  # the complete, unparsed stdout
+
+
+def build_ingest_args(
+    *,
+    xlsx_path: str,
+    project_name: str,
+    money_site_url: str = "",
+    branded_plus_ratio: float = 0.7,
+    custom_anchors: str = "",
+    cli_flags: str = "",
+) -> list[str]:
+    """Construct the argv tail for `main.py ingest-cora ...`.
+
+    Does not include the interpreter, `main.py`, or -u/-p credentials
+    (`run_blm_command` injects those). -bp is emitted only for a non-zero
+    ratio that differs from BLM's own default of 0.7. `cli_flags` is split
+    on whitespace; shell-style quoting is not interpreted.
+    """
+    args = ["ingest-cora", "-f", xlsx_path, "-n", project_name]
+    if money_site_url:
+        args.extend(["-m", money_site_url])
+    if branded_plus_ratio and branded_plus_ratio != 0.7:  # NOTE(review): 0.0 is falsy -> omitted
+        args.extend(["-bp", str(branded_plus_ratio)])
+    if custom_anchors:
+        args.extend(["-a", custom_anchors])
+    if cli_flags:
+        args.extend(cli_flags.strip().split())  # plain whitespace split, appended last
+    return args
+
+
+def run_blm_command(
+    args: list[str],
+    blm: BLMConfig,
+) -> subprocess.CompletedProcess:
+    """Run a BLM CLI command and return the CompletedProcess.
+
+    Appends -u/-p from `blm.username`/`blm.password` unless empty or already
+    present in `args`. cwd is `blm.blm_dir` so BLM can resolve its relative
+    paths. The logged command masks the value following -p/--password.
+
+    Raises subprocess.TimeoutExpired if the command exceeds
+    `blm.timeout_seconds`. Raises FileNotFoundError if `blm.blm_dir` or
+    `blm.python_exe` can't be found.
+    """
+    blm_path = Path(blm.blm_dir)
+    if not blm_path.exists():
+        raise FileNotFoundError(f"BLM directory not found: {blm.blm_dir}")
+
+    cmd: list[str] = [blm.python_exe, "main.py", *args]
+    if blm.username and "-u" not in args and "--username" not in args:
+        cmd.extend(["-u", blm.username])
+    if blm.password and "-p" not in args and "--password" not in args:
+        cmd.extend(["-p", blm.password])
+    # Mask the password (ours or a caller-supplied one) so it never reaches the logs.
+    shown = ["***" if i and cmd[i - 1] in ("-p", "--password") else a for i, a in enumerate(cmd)]
+    log.info("Running BLM: %s (cwd=%s)", " ".join(shown), blm.blm_dir)
+    result = subprocess.run(
+        cmd,
+        cwd=blm.blm_dir,
+        capture_output=True,
+        text=True,
+        timeout=blm.timeout_seconds,
+    )
+    log.info("BLM exit=%d", result.returncode)
+    if result.stdout:
+        log.debug("BLM stdout: %s", result.stdout[:1000])
+    if result.stderr:
+        log.debug("BLM stderr: %s", result.stderr[:1000])
+    return result
+
+
+_INGEST_PROJECT_RE = re.compile(r"^Success: Project '(.+)' created \(ID: (\d+)\)$")  # name, ID
+_INGEST_JOB_RE = re.compile(r"^Job file created: (.+)$")  # group 1: job file path
+_INGEST_KEYWORD_RE = re.compile(r"^Main Keyword: (.+)$")  # group 1: main keyword
+_GENERATE_MOVED_RE = re.compile(r"^Job file moved to: (.+)$")  # group 1: new job file path
+
+
+def parse_ingest_output(stdout: str) -> IngestResult:
+    """Extract project name/id, job file path, and main keyword from ingest stdout."""
+    result = IngestResult()
+    for raw in stdout.splitlines():  # no early exit: a repeated marker line overwrites
+        line = raw.strip()  # patterns are anchored with ^...$, so trim padding first
+        if m := _INGEST_PROJECT_RE.match(line):
+            result.project_name = m.group(1)
+            result.project_id = m.group(2)
+            continue
+        if m := _INGEST_JOB_RE.match(line):
+            result.job_file = m.group(1).strip()
+            continue
+        if m := _INGEST_KEYWORD_RE.match(line):
+            result.main_keyword = m.group(1).strip()
+            continue
+    return result  # fields with no matching line keep their "" defaults
+
+
+def parse_generate_output(stdout: str) -> GenerateResult:
+    """Extract the post-run job file path from generate-batch stdout."""
+    result = GenerateResult(raw_output=stdout)
+    for raw in stdout.splitlines():
+        line = raw.strip()
+        if m := _GENERATE_MOVED_RE.match(line):  # no break: the last such line wins
+            result.job_moved_to = m.group(1).strip()
+            result.success = True  # this line is the only success signal
+    return result
diff --git a/src/link_building_workflow/deps.py b/src/link_building_workflow/deps.py
new file mode 100644
index 0000000..7e4e9a7
--- /dev/null
+++ b/src/link_building_workflow/deps.py
@@ -0,0 +1,63 @@
+"""Dependency injection types for the link building workflow.
+
+The workflow is LLM-agnostic: the caller (Paperclip, tests, anything) implements
+the `LLMCheck` callable and passes it in, directly or inside a `Deps` instance.
+
+Nothing in this module touches the network, a task system, or an LLM directly.
+Task state (status, custom fields, comments) is owned by the caller -- this
+package only returns structured results and lets the caller decide what to do.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+
+# An LLM-backed equality check for singular/plural keyword matching.
+# Returns True iff `a` and `b` are the same SEO keyword modulo plural form
+# (e.g. "shaft" vs "shafts", "company" vs "companies"). Returns False for
+# any other kind of difference.
+#
+# Implementations should cache results: every inbox scan calls this once per
+# xlsx whose normalized filename stem is not identical to the target keyword.
+LLMCheck = Callable[[str, str], bool]
+
+
+@dataclass
+class BLMConfig:
+    """Configuration for invoking the Big-Link-Man CLI.
+
+    BLM is an external Python tool; the workflow shells out to it, running
+    `<python_exe> main.py ...` with `blm_dir` as the working directory. The
+    default `python` (resolved via PATH) only works if that interpreter has
+    BLM's dependencies installed. Override `python_exe` to point at a
+    specific interpreter when BLM has its own virtualenv.
+    """
+
+    # Path to the Big-Link-Man checkout. Must contain main.py.
+    blm_dir: str
+    # BLM auth, appended as -u / -p command-line arguments on every CLI call.
+    # Empty strings are skipped, so leave these unset if BLM doesn't need them.
+    username: str = ""
+    password: str = ""
+    # Subprocess timeout per BLM invocation, in seconds.
+    # Default covers generate-batch runs of ~25 min plus headroom.
+    timeout_seconds: int = 1800
+    # Default branded+ ratio passed to ingest-cora if the caller doesn't
+    # supply one. BLM's own default is 0.7; we match it.
+    default_branded_plus_ratio: float = 0.7
+    # Python interpreter used to run BLM. Defaults to "python" (on PATH).
+    # Set to an absolute path like "/opt/blm/.venv/bin/python" if BLM has
+    # its own venv separate from the caller.
+    python_exe: str = "python"
+
+
+@dataclass
+class Deps:
+    """Container for everything the workflow needs from the outside world.
+
+    Construct this once per run and pass it to the pipeline entry points.
+    """
+
+    blm: BLMConfig  # how to locate, authenticate and time-limit the BLM CLI
+    llm_check: LLMCheck  # singular/plural keyword equivalence check
diff --git a/src/link_building_workflow/inbox.py b/src/link_building_workflow/inbox.py
new file mode 100644
index 0000000..429f645
--- /dev/null
+++ b/src/link_building_workflow/inbox.py
@@ -0,0 +1,116 @@
+"""Locate CORA .xlsx files in an inbox folder by keyword.
+
+The caller's task state tells them which keyword they're looking for; this
+module tells them whether a matching xlsx exists and where. No folder
+watching, no threading -- it's a one-shot scan that Paperclip calls when
+evaluating whether a task can run.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+
+from .deps import LLMCheck
+from .matching import filename_stem_to_keyword, fuzzy_keyword_match, normalize_for_match
+
+log = logging.getLogger(__name__)
+
+
+@dataclass
+class InboxMatch:
+    """A single xlsx in the inbox that matched the requested keyword."""
+
+    path: Path  # the file as yielded by the inbox folder glob
+    filename: str  # path.name, stored separately for convenience
+    stem_keyword: str  # the normalized keyword derived from the filename
+
+
+def list_inbox_xlsx(
+    folder: str | Path,
+    *,
+    skip_processed: bool = True,
+) -> list[Path]:
+    """Return the .xlsx files directly inside `folder`, sorted by path.
+
+    Skips Office temp/lock files (names starting with "~$"). If
+    `skip_processed` is True (the default), also skips any file whose name
+    appears in `folder/processed/` -- that subfolder is treated as the
+    archive of already-handled files. Subfolders are not searched.
+
+    Returns an empty list (and logs a warning) if the folder doesn't exist.
+    The caller decides whether that's an error or a "not yet" gate.
+    """
+    path = Path(folder)
+    if not path.exists():
+        log.warning("Inbox folder does not exist: %s", path)
+        return []
+
+    processed_names: set[str] = set()  # stays empty when skip_processed is False
+    if skip_processed:
+        processed_dir = path / "processed"
+        if processed_dir.exists():
+            processed_names = {f.name for f in processed_dir.glob("*.xlsx")}
+
+    candidates: list[Path] = []
+    for f in sorted(path.glob("*.xlsx")):  # sorted -> deterministic result order
+        if f.name.startswith("~$"):  # Office lock/temp file
+            continue
+        if f.name in processed_names:  # a file of the same name is already archived
+            continue
+        candidates.append(f)
+    return candidates
+
+
+def find_xlsx_for_keyword(
+    folder: str | Path,
+    keyword: str,
+    llm_check: LLMCheck | None = None,
+    *,
+    skip_processed: bool = True,
+) -> InboxMatch | None:
+    """Find a single xlsx in `folder` whose filename matches `keyword`.
+
+    Returns the first filename-stem fuzzy match in sorted path order, or
+    None if nothing matches. The whole folder is scanned either way.
+
+    `keyword` is normalized internally, so the caller can pass it in any
+    form (e.g. "Precision CNC Machining" or "precision-cnc-machining").
+
+    Uses `llm_check` for singular/plural equivalence; falls back to exact
+    match if `llm_check` is None. With `skip_processed` (the default), inbox
+    files whose names also appear in `folder/processed/` are ignored.
+    """
+    matches = find_all_xlsx_for_keyword(
+        folder, keyword, llm_check, skip_processed=skip_processed
+    )
+    return matches[0] if matches else None
+
+
+def find_all_xlsx_for_keyword(
+    folder: str | Path,
+    keyword: str,
+    llm_check: LLMCheck | None = None,
+    *,
+    skip_processed: bool = True,
+) -> list[InboxMatch]:
+    """Find every xlsx in `folder` whose filename matches `keyword`.
+
+    Same matching rules as `find_xlsx_for_keyword`, but returns all matches
+    (in sorted path order) instead of just the first. Useful when a keyword
+    legitimately has several xlsx variants (for example an original and a
+    re-run) and the caller wants to pick the newest by mtime.
+    """
+    target = normalize_for_match(keyword)
+    if not target:  # nothing matchable left after normalization
+        return []
+
+    results: list[InboxMatch] = []
+    for f in list_inbox_xlsx(folder, skip_processed=skip_processed):
+        stem_kw = filename_stem_to_keyword(f.stem)
+        if fuzzy_keyword_match(target, stem_kw, llm_check):  # may invoke llm_check
+            results.append(
+                InboxMatch(path=f, filename=f.name, stem_keyword=stem_kw)
+            )
+    return results
diff --git a/src/link_building_workflow/matching.py b/src/link_building_workflow/matching.py
new file mode 100644
index 0000000..c94ffd1
--- /dev/null
+++ b/src/link_building_workflow/matching.py
@@ -0,0 +1,59 @@
+"""Keyword normalization and fuzzy matching.
+
+Pure functions with no I/O. The LLM check for singular/plural equivalence
+is injected, so tests can substitute a deterministic fake and production
+can plug in any model.
+"""
+
+from __future__ import annotations
+
+import re
+
+from .deps import LLMCheck
+
+
+def normalize_for_match(text: str) -> str:
+    """Normalize text for fuzzy matching.
+
+    Lowercases, replaces every character other than a-z, 0-9 or whitespace
+    (including non-ASCII letters) with a space, then collapses whitespace
+    runs and trims. The result is suitable for `fuzzy_keyword_match`.
+    """
+    text = text.lower().strip()
+    text = re.sub(r"[^a-z0-9\s]", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+def fuzzy_keyword_match(a: str, b: str, llm_check: LLMCheck | None = None) -> bool:
+    """Compare two normalized strings for keyword equivalence.
+
+    Fast path: identical strings return True immediately.
+    Slow path: delegate to `llm_check` to decide if the two keywords differ
+    only in singular vs plural form.
+
+    If `llm_check` is None and the fast path fails, returns False. Empty
+    inputs always return False.
+
+    No normalization happens here, so call `normalize_for_match` on both
+    inputs first; un-normalized strings that differ only in case or
+    punctuation miss the fast path and are handed to `llm_check`.
+    """
+    if not a or not b:
+        return False
+    if a == b:
+        return True
+    if llm_check is None:
+        return False
+    return llm_check(a, b)
+
+
+def filename_stem_to_keyword(stem: str) -> str:
+    """Convert a filename stem to a matchable keyword.
+
+    Example: "precision-cnc_machining" -> "precision cnc machining"
+
+    The returned value is already normalized.
+    """
+    stem = stem.lower().replace("-", " ").replace("_", " ")  # also done by normalize_for_match
+    return normalize_for_match(stem)
diff --git a/src/link_building_workflow/pipeline.py b/src/link_building_workflow/pipeline.py
new file mode 100644
index 0000000..4d698e7
--- /dev/null
+++ b/src/link_building_workflow/pipeline.py
@@ -0,0 +1,318 @@
+"""High-level link building pipelines.
+
+These functions do the work and return a structured result. They do NOT
+touch any task system. The caller (Paperclip) reads the returned
+`PipelineResult` and decides what to do: update task status, post a
+comment, move the xlsx to a processed folder, alert, etc.
+"""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from . import blm as blm_mod
+from .blm import (
+    GenerateResult,
+    IngestResult,
+    build_ingest_args,
+    parse_generate_output,
+    parse_ingest_output,
+)
+from .deps import Deps
+
+log = logging.getLogger(__name__)
+
+
+ProgressFn = Callable[[str], None]
+
+
+@dataclass
+class PipelineResult:
+    """Outcome of a full or partial link building pipeline run.
+
+    `ok` is the single boolean the caller should branch on. `step` tells
+    them which phase the result is from: on failure it's where the
+    pipeline stopped; on success it's "ingest", "generate", or "complete"
+    depending on which entry point was called.
+
+    `summary` is a human-readable string (multi-line on success) safe to
+    post as a task comment or log. `error` is populated only when `ok` is False.
+    """
+
+    ok: bool
+    step: str  # "ingest" | "generate" | "complete"
+    ingest: IngestResult | None = None  # parsed `ingest-cora` stdout, if any
+    generate: GenerateResult | None = None  # parsed `generate-batch` stdout, if any
+    error: str = ""  # failure message; stays "" on success
+    summary: str = ""
+    project_name: str = ""
+    job_file: str = ""  # job file path; the moved-to path once generation reports one
+    log_lines: list[str] = field(default_factory=list)  # progress messages, if collected
+
+
+def _err(
+    step: str,
+    message: str,
+    ingest: IngestResult | None = None,
+    generate: GenerateResult | None = None,
+    log_lines: list[str] | None = None,
+) -> PipelineResult:
+    """Build the `ok=False` result for a failure in `step`."""
+    # Partial ingest/generate data is passed through untouched.
+    failure = PipelineResult(ok=False, step=step, error=message)
+    failure.ingest = ingest
+    failure.generate = generate
+    failure.summary = f"Error during {step}: {message}"
+    # A missing or empty list is replaced by a fresh empty list.
+    failure.log_lines = log_lines if log_lines else []
+    return failure
+
+
+def blm_ingest_cora(
+    *,
+    xlsx_path: str,
+    project_name: str,
+    deps: Deps,
+    money_site_url: str = "",
+    branded_plus_ratio: float | None = None,
+    custom_anchors: str = "",
+    cli_flags: str = "",
+) -> PipelineResult:
+    """Run only BLM's `ingest-cora` command.
+
+    Parses a CORA .xlsx into a BLM project + job file without generating
+    content; pass the returned `job_file` to `blm_generate_batch` later.
+    `branded_plus_ratio=None` means `deps.blm.default_branded_plus_ratio`.
+    Bad input, a BLM timeout, `FileNotFoundError`, a non-zero exit code or
+    unparseable output all return `ok=False` (step "ingest"), not raise.
+    """
+    if not xlsx_path:
+        return _err("ingest", "xlsx_path is required")
+    if not project_name:
+        return _err("ingest", "project_name is required")
+    # is_file(), not exists(): a directory must not be passed to BLM as a report.
+    if not Path(xlsx_path).is_file():
+        return _err("ingest", f"CORA file not found: {xlsx_path}")
+
+    bp_ratio = branded_plus_ratio
+    if bp_ratio is None:
+        bp_ratio = deps.blm.default_branded_plus_ratio
+
+    args = build_ingest_args(
+        xlsx_path=xlsx_path,
+        project_name=project_name,
+        money_site_url=money_site_url,
+        branded_plus_ratio=bp_ratio,
+        custom_anchors=custom_anchors,
+        cli_flags=cli_flags,
+    )
+
+    try:
+        proc = blm_mod.run_blm_command(args, deps.blm)
+    except subprocess.TimeoutExpired:
+        # "/" with ":g" still prints "5m" for 300s but no longer "0m" for 30s.
+        minutes = deps.blm.timeout_seconds / 60
+        return _err("ingest", f"ingest-cora timed out after {minutes:g}m")
+    except FileNotFoundError as e:
+        return _err("ingest", str(e))
+
+    ingest = parse_ingest_output(proc.stdout)
+    if proc.returncode != 0 or not ingest.success:
+        return _err(
+            "ingest",
+            f"ingest-cora failed (exit={proc.returncode}). "
+            f"stdout tail: {proc.stdout[-500:]}\nstderr tail: {proc.stderr[-500:]}",
+            ingest=ingest,
+        )
+
+    summary = (
+        f"CORA ingest complete.\n"
+        f"- Project: {ingest.project_name} (ID: {ingest.project_id})\n"
+        f"- Keyword: {ingest.main_keyword}\n"
+        f"- Job file: {ingest.job_file}"
+    )
+    return PipelineResult(
+        ok=True,
+        step="ingest",
+        ingest=ingest,
+        summary=summary,
+        project_name=ingest.project_name,
+        job_file=ingest.job_file,
+    )
+
+
+def blm_generate_batch(
+    *,
+    job_file: str,
+    deps: Deps,
+    continue_on_error: bool = True,
+    debug: bool = False,
+) -> PipelineResult:
+    """Run only BLM's `generate-batch` command on an existing job file.
+
+    `job_file` may be absolute or relative; relative paths are resolved
+    against `deps.blm.blm_dir`. A zero exit code yields `ok=True` even when
+    the parsed `success` flag is False (the status then reads "Completed").
+    Timeouts and `FileNotFoundError` are returned as `ok=False`, not raised.
+    """
+    if not job_file:
+        return _err("generate", "job_file is required")
+
+    job_path = Path(job_file)
+    if not job_path.is_absolute():
+        job_path = Path(deps.blm.blm_dir) / job_file
+    # is_file(), not exists(): a directory is not a usable job file.
+    if not job_path.is_file():
+        return _err("generate", f"Job file not found: {job_path}")
+
+    args = ["generate-batch", "-j", str(job_path)]
+    if continue_on_error:
+        args.append("--continue-on-error")
+    if debug:
+        args.append("--debug")
+
+    try:
+        proc = blm_mod.run_blm_command(args, deps.blm)
+    except subprocess.TimeoutExpired:
+        # "/" with ":g" still prints "5m" for 300s but no longer "0m" for 30s.
+        minutes = deps.blm.timeout_seconds / 60
+        return _err("generate", f"generate-batch timed out after {minutes:g}m")
+    except FileNotFoundError as e:
+        return _err("generate", str(e))
+
+    gen = parse_generate_output(proc.stdout)
+    if proc.returncode != 0:
+        return _err(
+            "generate",
+            f"generate-batch failed (exit={proc.returncode}). "
+            f"stdout tail: {proc.stdout[-500:]}\nstderr tail: {proc.stderr[-500:]}",
+            generate=gen,
+        )
+
+    parts = ["Content generation complete."]
+    parts.append(f"- Status: {'Success' if gen.success else 'Completed'}")
+    if gen.job_moved_to:
+        parts.append(f"- Job moved to: {gen.job_moved_to}")
+
+    return PipelineResult(
+        ok=True,
+        step="generate",
+        generate=gen,
+        summary="\n".join(parts),
+        job_file=gen.job_moved_to or job_file,
+    )
+
+
+def run_cora_backlinks(
+    *,
+    xlsx_path: str,
+    project_name: str,
+    money_site_url: str,
+    deps: Deps,
+    branded_plus_ratio: float | None = None,
+    custom_anchors: str = "",
+    cli_flags: str = "",
+    on_progress: ProgressFn | None = None,
+) -> PipelineResult:
+    """Full Cora Backlinks pipeline: ingest-cora -> generate-batch.
+
+    Requires `money_site_url` (IMSURL) -- BLM cannot run interactively in
+    subprocess mode, so the URL must be supplied up front.
+
+    `on_progress` is an optional callback invoked with free-form status
+    strings ("Step 1/2: ..." etc.) for live updates to a UI, log, or task
+    comment. The same strings are also captured in the returned
+    `log_lines`. Exceptions raised by the callback are logged and ignored.
+
+    On failure, the returned `PipelineResult` has `ok=False`, `step` set
+    to where it stopped ("ingest" or "generate"), and `error` populated.
+    On success, `step == "complete"` and both `ingest` and `generate` are
+    populated.
+    """
+    if not xlsx_path:
+        return _err("ingest", "xlsx_path is required")
+    if not project_name:
+        return _err("ingest", "project_name is required")
+    if not money_site_url:
+        return _err(
+            "ingest",
+            "money_site_url (IMSURL) is required; BLM runs non-interactively",
+        )
+
+    log_lines: list[str] = []  # progress messages, in emission order
+
+    def _progress(msg: str) -> None:
+        log_lines.append(msg)  # recorded before the callback runs, so never lost
+        log.info("[LB Pipeline] %s", msg)
+        if on_progress is not None:
+            try:
+                on_progress(msg)
+            except Exception:  # deliberate best-effort: a bad callback must not abort the run
+                log.exception("on_progress callback raised; continuing")
+
+    _progress(f"Step 1/2: Ingesting CORA report for {project_name}...")
+
+    ingest = blm_ingest_cora(
+        xlsx_path=xlsx_path,
+        project_name=project_name,
+        deps=deps,
+        money_site_url=money_site_url,
+        branded_plus_ratio=branded_plus_ratio,
+        custom_anchors=custom_anchors,
+        cli_flags=cli_flags,
+    )
+    if not ingest.ok:
+        ingest.log_lines = log_lines  # attach the progress captured so far
+        return ingest  # NOTE(review): project_name stays "" here, unlike the generate failure below
+
+    assert ingest.ingest is not None  # narrows the Optional; an ok ingest result always sets it
+    ing = ingest.ingest
+
+    _progress(f"Step 2/2: Generating content batch for {project_name}...")
+
+    gen = blm_generate_batch(
+        job_file=ing.job_file,
+        deps=deps,
+        continue_on_error=True,
+    )
+    if not gen.ok:
+        return PipelineResult(
+            ok=False,
+            step="generate",
+            ingest=ing,  # ingest succeeded, so its data is still reported
+            generate=gen.generate,
+            error=gen.error,
+            summary=ingest.summary + "\n\n" + gen.summary,
+            project_name=project_name,
+            job_file=ing.job_file,
+            log_lines=log_lines,
+        )
+
+    assert gen.generate is not None  # narrows the Optional; an ok generate result always sets it
+    g = gen.generate
+
+    summary = (
+        f"## Step 1: Ingest CORA Report\n"
+        f"- Project: {project_name} (ID: {ing.project_id})\n"
+        f"- Keyword: {ing.main_keyword}\n"
+        f"- Job file: {ing.job_file}\n"
+        f"\n"
+        f"## Step 2: Generate Content Batch\n"
+        f"- Status: {'Success' if g.success else 'Completed'}\n"
+        + (f"- Job moved to: {g.job_moved_to}\n" if g.job_moved_to else "")
+    )
+
+    return PipelineResult(
+        ok=True,
+        step="complete",
+        ingest=ing,
+        generate=g,
+        summary=summary,
+        project_name=project_name,
+        job_file=g.job_moved_to or ing.job_file,  # prefer the post-move location
+        log_lines=log_lines,
+    )
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..ccb9b85
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,80 @@
+"""Shared test fixtures."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from link_building_workflow import BLMConfig, Deps, LLMCheck
+
+
+@pytest.fixture()
+def blm_dir(tmp_path: Path) -> Path:
+    """Create `<tmp_path>/blm` with a stub `main.py` and return the directory."""
+    fake_root = tmp_path.joinpath("blm")
+    fake_root.mkdir()
+    # The directory must exist for run_blm_command's existence check. main.py
+    # is only a safety net: subprocess.run is mocked in the tests regardless.
+    fake_root.joinpath("main.py").write_text("# fake\n")
+    return fake_root
+
+
+@pytest.fixture()
+def blm_config(blm_dir: Path) -> BLMConfig:  # config pointing at the fake BLM dir
+    return BLMConfig(
+        blm_dir=str(blm_dir),
+        username="testuser",  # test_blm checks these are injected as -u / -p
+        password="testpass",
+        timeout_seconds=300,  # 5 minutes
+        python_exe="python",
+    )
+
+
+@pytest.fixture()
+def llm_never() -> LLMCheck:
+    """Stub LLM check that rejects every pair, so only exact matches succeed."""
+    return lambda a, b: False
+
+
+@pytest.fixture()
+def llm_always() -> LLMCheck:
+    """Stub LLM check that accepts every pair (all treated as plural-equivalent)."""
+    return lambda a, b: True
+
+
+@pytest.fixture()
+def deps(blm_config: BLMConfig, llm_never: LLMCheck) -> Deps:
+    return Deps(blm=blm_config, llm_check=llm_never)  # LLM stub that never matches
+
+
+# Canonical `ingest-cora` stdout in the format `parse_ingest_output` is tuned for
+@pytest.fixture()
+def ingest_success_stdout() -> str:
+    return (
+        "Authenticated as: testuser (User)\n"
+        "\n"
+        "Parsing CORA file: /tmp/test.xlsx\n"
+        "Main Keyword: precision cnc machining\n"  # parsed into main_keyword
+        "Word Count: 1500\n"
+        "\n"
+        "Creating project: Test Project\n"
+        "Money Site URL: https://example.com\n"
+        "\n"
+        "Success: Project 'Test Project' created (ID: 42)\n"  # -> project_name / project_id
+        "Main Keyword: precision cnc machining\n"
+        "Money Site URL: https://example.com\n"
+        "Job file created: jobs/test-project.json\n"  # parsed into job_file
+    )
+
+
+@pytest.fixture()
+def generate_success_stdout() -> str:  # canonical `generate-batch` stdout
+    return (
+        "Loading job file: jobs/test-project.json\n"
+        "Generating backlink 1 of 10...\n"
+        "Generating backlink 2 of 10...\n"
+        "...\n"
+        "All backlinks generated.\n"
+        "Job file moved to: jobs/done/test-project.json\n"  # parsed into job_moved_to
+    )
diff --git a/tests/test_blm.py b/tests/test_blm.py
new file mode 100644
index 0000000..d8717d8
--- /dev/null
+++ b/tests/test_blm.py
@@ -0,0 +1,220 @@
+"""Tests for the BLM CLI subprocess wrapper and output parsers."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from link_building_workflow import BLMConfig
+from link_building_workflow.blm import (
+    build_ingest_args,
+    parse_generate_output,
+    parse_ingest_output,
+    run_blm_command,
+)
+
+
+class TestBuildIngestArgs:  # argv construction for the `ingest-cora` subcommand
+    def test_required_args_only(self):
+        args = build_ingest_args(xlsx_path="/tmp/f.xlsx", project_name="P")
+        assert args == ["ingest-cora", "-f", "/tmp/f.xlsx", "-n", "P"]
+
+    def test_with_money_site_url(self):
+        args = build_ingest_args(
+            xlsx_path="/tmp/f.xlsx",
+            project_name="P",
+            money_site_url="https://example.com",
+        )
+        assert "-m" in args
+        i = args.index("-m")
+        assert args[i + 1] == "https://example.com"  # value directly follows its flag
+
+    def test_branded_plus_ratio_default_omitted(self):
+        args = build_ingest_args(
+            xlsx_path="/tmp/f.xlsx", project_name="P", branded_plus_ratio=0.7
+        )
+        assert "-bp" not in args  # presumably 0.7 is the built-in default; confirm in blm.py
+
+    def test_branded_plus_ratio_custom_included(self):
+        args = build_ingest_args(
+            xlsx_path="/tmp/f.xlsx", project_name="P", branded_plus_ratio=0.8
+        )
+        assert "-bp" in args
+        assert args[args.index("-bp") + 1] == "0.8"  # passed as a string argument
+
+    def test_custom_anchors(self):
+        args = build_ingest_args(
+            xlsx_path="/tmp/f.xlsx",
+            project_name="P",
+            custom_anchors="a1,a2",
+        )
+        assert "-a" in args
+        assert args[args.index("-a") + 1] == "a1,a2"  # comma list kept as one argument
+
+    def test_cli_flags_split_on_whitespace(self):
+        args = build_ingest_args(
+            xlsx_path="/tmp/f.xlsx",
+            project_name="P",
+            cli_flags="--foo --bar baz",
+        )
+        assert "--foo" in args
+        assert "--bar" in args
+        assert "baz" in args
+
+    def test_cli_flags_empty_string_no_extra_args(self):
+        args = build_ingest_args(
+            xlsx_path="/tmp/f.xlsx", project_name="P", cli_flags=""
+        )
+        assert args == ["ingest-cora", "-f", "/tmp/f.xlsx", "-n", "P"]
+
+
+class TestParseIngestOutput:  # parsing of `ingest-cora` stdout into IngestResult fields
+    def test_full_success(self, ingest_success_stdout):
+        result = parse_ingest_output(ingest_success_stdout)
+        assert result.project_id == "42"  # ids are kept as strings
+        assert result.project_name == "Test Project"
+        assert result.main_keyword == "precision cnc machining"
+        assert result.job_file == "jobs/test-project.json"
+        assert result.success is True
+
+    def test_missing_project_line(self):
+        stdout = "Job file created: jobs/x.json\n"
+        result = parse_ingest_output(stdout)
+        assert result.project_id == ""
+        assert result.project_name == ""
+        assert result.success is False  # no project_id
+
+    def test_missing_job_line(self):
+        stdout = "Success: Project 'X' created (ID: 1)\n"
+        result = parse_ingest_output(stdout)
+        assert result.project_id == "1"
+        assert result.job_file == ""
+        assert result.success is False  # no job_file
+
+    def test_empty_stdout(self):
+        result = parse_ingest_output("")
+        assert result.project_id == ""
+        assert result.job_file == ""
+        assert result.success is False
+
+    def test_ignores_noise(self):
+        stdout = (
+            "Some random banner\n"
+            "DEBUG: lots of stuff\n"
+            "Success: Project 'Foo Bar' created (ID: 99)\n"
+            "WARNING: meaningless\n"
+            "Main Keyword: foo bar\n"
+            "Job file created: jobs/foo-bar.json\n"
+            "Done.\n"
+        )
+        result = parse_ingest_output(stdout)  # unrelated lines must not disturb the fields
+        assert result.project_id == "99"
+        assert result.project_name == "Foo Bar"
+        assert result.main_keyword == "foo bar"
+        assert result.job_file == "jobs/foo-bar.json"
+
+    def test_whitespace_around_job_file(self):
+        stdout = "Job file created:    jobs/x.json   \n"
+        result = parse_ingest_output(stdout)
+        assert result.job_file == "jobs/x.json"  # surrounding whitespace is trimmed
+
+
+class TestParseGenerateOutput:  # parsing of `generate-batch` stdout into GenerateResult
+    def test_success_with_move(self, generate_success_stdout):
+        result = parse_generate_output(generate_success_stdout)
+        assert result.success is True
+        assert result.job_moved_to == "jobs/done/test-project.json"
+        assert "Job file moved to" in result.raw_output  # full stdout is retained
+
+    def test_no_move_line(self):
+        stdout = "Generating backlinks...\nSome error occurred.\n"
+        result = parse_generate_output(stdout)
+        assert result.success is False  # no "Job file moved to" line in the output
+        assert result.job_moved_to == ""
+        assert result.raw_output == stdout
+
+    def test_empty_stdout(self):
+        result = parse_generate_output("")
+        assert result.success is False
+        assert result.job_moved_to == ""
+
+
+class TestRunBlmCommand:  # subprocess invocation; subprocess.run is always mocked
+    def test_passes_cwd_and_interpreter(self, blm_config: BLMConfig):
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora", "-f", "x.xlsx"], blm_config)
+        call = mock_run.call_args
+        cmd = call[0][0]  # first positional argument: the argv list
+        assert cmd[0] == "python"
+        assert cmd[1] == "main.py"
+        assert "ingest-cora" in cmd
+        assert call[1]["cwd"] == blm_config.blm_dir  # main.py is resolved relative to cwd
+
+    def test_injects_credentials(self, blm_config: BLMConfig):
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora"], blm_config)
+        cmd = mock_run.call_args[0][0]
+        assert "-u" in cmd
+        assert cmd[cmd.index("-u") + 1] == "testuser"
+        assert "-p" in cmd
+        assert cmd[cmd.index("-p") + 1] == "testpass"
+
+    def test_does_not_duplicate_user_flag(self, blm_config: BLMConfig):
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora", "-u", "other"], blm_config)
+        cmd = mock_run.call_args[0][0]
+        # -u should appear once, with the caller's value preserved
+        assert cmd.count("-u") == 1
+        assert cmd[cmd.index("-u") + 1] == "other"
+
+    def test_does_not_duplicate_password_flag(self, blm_config: BLMConfig):
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora", "-p", "otherpw"], blm_config)
+        cmd = mock_run.call_args[0][0]
+        assert cmd.count("-p") == 1  # caller-supplied -p wins over the configured password
+        assert cmd[cmd.index("-p") + 1] == "otherpw"
+
+    def test_skips_credentials_when_not_configured(self, tmp_path: Path):
+        blm_dir = tmp_path / "blm"
+        blm_dir.mkdir()
+        config = BLMConfig(blm_dir=str(blm_dir))  # no user/pass
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora"], config)
+        cmd = mock_run.call_args[0][0]
+        assert "-u" not in cmd
+        assert "-p" not in cmd
+
+    def test_raises_on_missing_blm_dir(self, tmp_path: Path):
+        config = BLMConfig(blm_dir=str(tmp_path / "nope"))  # directory is never created
+        with pytest.raises(FileNotFoundError):
+            run_blm_command(["ingest-cora"], config)
+
+    def test_passes_timeout(self, blm_config: BLMConfig):
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora"], blm_config)
+        assert mock_run.call_args[1]["timeout"] == blm_config.timeout_seconds
+
+    def test_propagates_timeout_expired(self, blm_config: BLMConfig):
+        with patch(
+            "subprocess.run",
+            side_effect=subprocess.TimeoutExpired(cmd="python", timeout=300),
+        ), pytest.raises(subprocess.TimeoutExpired):  # not swallowed by the wrapper
+            run_blm_command(["ingest-cora"], blm_config)
+
+    def test_custom_python_exe(self, tmp_path: Path):
+        blm_dir = tmp_path / "blm"
+        blm_dir.mkdir()
+        config = BLMConfig(blm_dir=str(blm_dir), python_exe="/opt/venv/bin/python")
+        mock_result = MagicMock(returncode=0, stdout="", stderr="")
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            run_blm_command(["ingest-cora"], config)
+        assert mock_run.call_args[0][0][0] == "/opt/venv/bin/python"  # argv[0] is the interpreter
diff --git a/tests/test_inbox.py b/tests/test_inbox.py
new file mode 100644
index 0000000..3e72f17
--- /dev/null
+++ b/tests/test_inbox.py
@@ -0,0 +1,170 @@
+"""Tests for inbox folder scanning and keyword-based file lookup."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from link_building_workflow.inbox import (
+    find_all_xlsx_for_keyword,
+    find_xlsx_for_keyword,
+    list_inbox_xlsx,
+)
+
+
+@pytest.fixture()
+def inbox(tmp_path: Path) -> Path:
+    """Create and return an empty `cora-inbox` directory under `tmp_path`."""
+    folder = tmp_path.joinpath("cora-inbox")
+    folder.mkdir()
+    return folder
+
+
+def _touch(folder: Path, name: str) -> Path:
+    target = folder.joinpath(name)  # file is created (or overwritten) with dummy bytes
+    target.write_bytes(b"fake xlsx")
+    return target
+
+
+class TestListInboxXlsx:  # folder scan: extension filter, lock files, processed files, ordering
+    def test_missing_folder_returns_empty(self, tmp_path: Path):
+        assert list_inbox_xlsx(tmp_path / "does-not-exist") == []
+
+    def test_empty_folder(self, inbox: Path):
+        assert list_inbox_xlsx(inbox) == []
+
+    def test_lists_xlsx_only(self, inbox: Path):
+        _touch(inbox, "a.xlsx")
+        _touch(inbox, "readme.txt")
+        _touch(inbox, "b.xlsx")
+        result = list_inbox_xlsx(inbox)
+        names = [p.name for p in result]
+        assert names == ["a.xlsx", "b.xlsx"]
+
+    def test_skips_office_lock_files(self, inbox: Path):
+        _touch(inbox, "real.xlsx")
+        _touch(inbox, "~$real.xlsx")  # "~$" prefix, as on Office lock files
+        result = list_inbox_xlsx(inbox)
+        assert [p.name for p in result] == ["real.xlsx"]
+
+    def test_skips_processed_by_default(self, inbox: Path):
+        _touch(inbox, "new.xlsx")
+        processed = inbox / "processed"
+        processed.mkdir()
+        _touch(processed, "old.xlsx")
+        # Also duplicate the name in root to prove it gets filtered
+        _touch(inbox, "old.xlsx")
+
+        result = list_inbox_xlsx(inbox)
+        assert [p.name for p in result] == ["new.xlsx"]
+
+    def test_skip_processed_disabled(self, inbox: Path):
+        _touch(inbox, "new.xlsx")
+        processed = inbox / "processed"
+        processed.mkdir()
+        _touch(processed, "old.xlsx")  # without this copy the flag has nothing to disable
+        _touch(inbox, "old.xlsx")  # same name as one we "processed"
+        result = list_inbox_xlsx(inbox, skip_processed=False)
+        assert sorted(p.name for p in result) == ["new.xlsx", "old.xlsx"]
+
+    def test_sorted_output(self, inbox: Path):
+        _touch(inbox, "c.xlsx")
+        _touch(inbox, "a.xlsx")
+        _touch(inbox, "b.xlsx")
+        result = list_inbox_xlsx(inbox)
+        assert [p.name for p in result] == ["a.xlsx", "b.xlsx", "c.xlsx"]
+
+
+class TestFindXlsxForKeyword:  # single-result lookup of an inbox file by keyword
+    def test_exact_match(self, inbox: Path, llm_never):
+        _touch(inbox, "precision-cnc-machining.xlsx")
+        match = find_xlsx_for_keyword(inbox, "precision cnc machining", llm_never)
+        assert match is not None
+        assert match.filename == "precision-cnc-machining.xlsx"
+        assert match.stem_keyword == "precision cnc machining"
+
+    def test_no_match(self, inbox: Path, llm_never):
+        _touch(inbox, "other-keyword.xlsx")
+        match = find_xlsx_for_keyword(inbox, "cnc machining", llm_never)
+        assert match is None
+
+    def test_missing_folder(self, tmp_path: Path, llm_never):
+        match = find_xlsx_for_keyword(
+            tmp_path / "no-such-dir", "cnc machining", llm_never
+        )
+        assert match is None
+
+    def test_empty_keyword(self, inbox: Path, llm_never):
+        _touch(inbox, "anything.xlsx")
+        match = find_xlsx_for_keyword(inbox, "", llm_never)
+        assert match is None
+
+    def test_keyword_with_hyphens(self, inbox: Path, llm_never):
+        # Caller may pass the keyword in hyphenated form; should still match
+        _touch(inbox, "precision-cnc-machining.xlsx")
+        match = find_xlsx_for_keyword(
+            inbox, "precision-cnc-machining", llm_never
+        )
+        assert match is not None
+
+    def test_keyword_case_insensitive(self, inbox: Path, llm_never):
+        _touch(inbox, "cnc-machining.xlsx")
+        match = find_xlsx_for_keyword(inbox, "CNC Machining", llm_never)
+        assert match is not None
+
+    def test_plural_match_via_llm(self, inbox: Path):
+        _touch(inbox, "cnc-shafts.xlsx")
+
+        def only_plural_of_shaft(a: str, b: str) -> bool:
+            return {a, b} == {"cnc shaft", "cnc shafts"}  # order-independent stub
+
+        # Singular keyword should match the plural filename via LLM
+        match = find_xlsx_for_keyword(inbox, "cnc shaft", only_plural_of_shaft)
+        assert match is not None
+        assert match.filename == "cnc-shafts.xlsx"
+
+    def test_first_match_returned(self, inbox: Path, llm_never):
+        # Decoy files that sort before the real match but must not be returned:
+        _touch(inbox, "b-cnc-machining.xlsx")
+        _touch(inbox, "a-cnc-machining.xlsx")
+        # With llm_never only an exact normalized match counts, so this is the
+        # sole candidate. NOTE(review): ordering among several matches is untested.
+        _touch(inbox, "cnc-machining.xlsx")
+        match = find_xlsx_for_keyword(inbox, "cnc machining", llm_never)
+        assert match is not None
+        assert match.filename == "cnc-machining.xlsx"
+
+    def test_processed_files_ignored(self, inbox: Path, llm_never):
+        processed = inbox / "processed"
+        processed.mkdir()
+        _touch(processed, "cnc-machining.xlsx")
+        _touch(inbox, "cnc-machining.xlsx")
+        # Inbox file with same name as processed one is also skipped by
+        # list_inbox_xlsx, so no match available
+        match = find_xlsx_for_keyword(inbox, "cnc machining", llm_never)
+        assert match is None
+
+
+class TestFindAllXlsxForKeyword:  # multi-result variant of the keyword lookup
+    def test_returns_all_matches(self, inbox: Path):
+        _touch(inbox, "cnc-shaft.xlsx")  # exact match for the keyword
+        _touch(inbox, "cnc-shafts.xlsx")  # matches only through the LLM stub
+        _touch(inbox, "unrelated.xlsx")
+
+        def plural_ok(a: str, b: str) -> bool:
+            return {a, b} == {"cnc shaft", "cnc shafts"}
+
+        results = find_all_xlsx_for_keyword(inbox, "cnc shaft", plural_ok)
+        names = sorted(r.filename for r in results)
+        assert names == ["cnc-shaft.xlsx", "cnc-shafts.xlsx"]
+
+    def test_empty_when_no_matches(self, inbox: Path, llm_never):
+        _touch(inbox, "unrelated.xlsx")
+        results = find_all_xlsx_for_keyword(inbox, "cnc shaft", llm_never)
+        assert results == []
+
+    def test_empty_keyword_returns_empty(self, inbox: Path, llm_never):
+        _touch(inbox, "anything.xlsx")
+        results = find_all_xlsx_for_keyword(inbox, "", llm_never)
+        assert results == []
diff --git a/tests/test_matching.py b/tests/test_matching.py
new file mode 100644
index 0000000..760236a
--- /dev/null
+++ b/tests/test_matching.py
@@ -0,0 +1,97 @@
+"""Tests for keyword normalization and fuzzy matching."""
+
+from __future__ import annotations
+
+from link_building_workflow.matching import (
+    filename_stem_to_keyword,
+    fuzzy_keyword_match,
+    normalize_for_match,
+)
+
+
+class TestNormalizeForMatch:  # lowercase, punctuation -> space, whitespace collapse
+    def test_lowercases(self):
+        assert normalize_for_match("Hello World") == "hello world"
+
+    def test_strips_punctuation(self):
+        assert normalize_for_match("hello, world!") == "hello world"
+
+    def test_collapses_whitespace(self):
+        assert normalize_for_match("hello    world\n\ttest") == "hello world test"
+
+    def test_empty_string(self):
+        assert normalize_for_match("") == ""
+
+    def test_only_punctuation(self):
+        assert normalize_for_match("!!!...,,,") == ""  # nothing but separators remains
+
+    def test_numbers_preserved(self):
+        assert normalize_for_match("5-axis cnc") == "5 axis cnc"  # hyphen becomes a space
+
+    def test_leading_trailing_whitespace(self):
+        assert normalize_for_match("   hello world   ") == "hello world"
+
+
+class TestFuzzyKeywordMatch:  # exact fast path, then the injected llm_check
+    def test_exact_match(self, llm_never):
+        assert fuzzy_keyword_match("cnc machining", "cnc machining", llm_never) is True
+
+    def test_different_no_llm(self):
+        assert fuzzy_keyword_match("cnc", "cnc machining") is False
+
+    def test_different_llm_says_no(self, llm_never):
+        assert fuzzy_keyword_match("cnc", "milling", llm_never) is False
+
+    def test_different_llm_says_yes(self, llm_always):
+        # LLM callable gets to decide when exact match fails
+        assert fuzzy_keyword_match("shaft", "shafts", llm_always) is True
+
+    def test_empty_a(self, llm_always):
+        assert fuzzy_keyword_match("", "cnc", llm_always) is False
+
+    def test_empty_b(self, llm_always):
+        assert fuzzy_keyword_match("cnc", "", llm_always) is False
+
+    def test_both_empty(self, llm_always):
+        # Even with llm_always, empty inputs short-circuit to False
+        assert fuzzy_keyword_match("", "", llm_always) is False
+
+    def test_no_llm_check_fast_path_hit(self):
+        # When no llm_check provided, exact matches still work
+        assert fuzzy_keyword_match("same", "same") is True
+
+    def test_no_llm_check_fast_path_miss(self):
+        # When no llm_check and not exact, returns False
+        assert fuzzy_keyword_match("same", "different") is False
+
+    def test_llm_check_only_called_when_needed(self):
+        calls = []  # (a, b) pairs the stub was invoked with
+
+        def tracking_llm(a, b):
+            calls.append((a, b))
+            return True
+
+        # Exact match: LLM should not be called
+        fuzzy_keyword_match("cnc", "cnc", tracking_llm)
+        assert calls == []
+
+        # Different: LLM should be called once
+        fuzzy_keyword_match("shaft", "shafts", tracking_llm)
+        assert calls == [("shaft", "shafts")]  # arguments are passed in call order
+
+
+class TestFilenameStemToKeyword:  # filename stem -> normalized keyword
+    def test_hyphens_to_spaces(self):
+        assert filename_stem_to_keyword("precision-cnc-machining") == "precision cnc machining"
+
+    def test_underscores_to_spaces(self):
+        assert filename_stem_to_keyword("precision_cnc_machining") == "precision cnc machining"
+
+    def test_mixed_separators(self):
+        assert filename_stem_to_keyword("precision-cnc_machining") == "precision cnc machining"
+
+    def test_uppercase(self):
+        assert filename_stem_to_keyword("CNC-Machining") == "cnc machining"  # case is folded too
+
+    def test_empty(self):
+        assert filename_stem_to_keyword("") == ""
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 0000000..e0f5319
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,460 @@
+"""Tests for the three pipeline entry points.
+
+BLM subprocess calls are mocked via `link_building_workflow.blm.run_blm_command`.
+The pipeline module imports blm as `blm_mod` and calls `blm_mod.run_blm_command(...)`,
+so we patch there.
+"""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from link_building_workflow import (
+    BLMConfig,
+    Deps,
+    blm_generate_batch,
+    blm_ingest_cora,
+    run_cora_backlinks,
+)
+
+
+@pytest.fixture()
+def xlsx_file(tmp_path: Path) -> Path:
+    p = tmp_path / "precision-cnc-machining.xlsx"
+    p.write_bytes(b"fake xlsx")
+    return p
+
+
+def _mock_proc(stdout: str = "", stderr: str = "", returncode: int = 0) -> MagicMock:
+    m = MagicMock()
+    m.stdout = stdout
+    m.stderr = stderr
+    m.returncode = returncode
+    return m
+
+
+# ---------------------------------------------------------------------------
+# blm_ingest_cora
+# ---------------------------------------------------------------------------
+
+
+class TestBlmIngestCora:
+    def test_missing_xlsx_path(self, deps: Deps):
+        result = blm_ingest_cora(xlsx_path="", project_name="P", deps=deps)
+        assert result.ok is False
+        assert "xlsx_path is required" in result.error
+
+    def test_missing_project_name(self, deps: Deps, xlsx_file: Path):
+        result = blm_ingest_cora(
+            xlsx_path=str(xlsx_file), project_name="", deps=deps
+        )
+        assert result.ok is False
+        assert "project_name is required" in result.error
+
+    def test_xlsx_not_found(self, deps: Deps):
+        result = blm_ingest_cora(
+            xlsx_path="/nope/missing.xlsx", project_name="P", deps=deps
+        )
+        assert result.ok is False
+        assert "not found" in result.error
+
+    def test_success(
+        self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str
+    ):
+        proc = _mock_proc(stdout=ingest_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ):
+            result = blm_ingest_cora(
+                xlsx_path=str(xlsx_file),
+                project_name="Test Project",
+                deps=deps,
+            )
+        assert result.ok is True
+        assert result.step == "ingest"
+        assert result.ingest is not None
+        assert result.ingest.project_id == "42"
+        assert result.ingest.job_file == "jobs/test-project.json"
+        assert result.job_file == "jobs/test-project.json"
+        assert result.project_name == "Test Project"
+        assert "CORA ingest complete" in result.summary
+
+    def test_nonzero_exit_reports_failure(
+        self, deps: Deps, xlsx_file: Path
+    ):
+        proc = _mock_proc(stdout="", stderr="boom", returncode=1)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ):
+            result = blm_ingest_cora(
+                xlsx_path=str(xlsx_file),
+                project_name="P",
+                deps=deps,
+            )
+        assert result.ok is False
+        assert "exit=1" in result.error
+        assert "boom" in result.error
+
+    def test_timeout(self, deps: Deps, xlsx_file: Path):
+        with patch(
+            "link_building_workflow.blm.run_blm_command",
+            side_effect=subprocess.TimeoutExpired(cmd="python", timeout=300),
+        ):
+            result = blm_ingest_cora(
+                xlsx_path=str(xlsx_file),
+                project_name="P",
+                deps=deps,
+            )
+        assert result.ok is False
+        assert "timed out" in result.error
+
+    def test_uses_config_default_branded_plus_ratio(
+        self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str
+    ):
+        # Caller passes None, so deps.blm.default_branded_plus_ratio (0.7) should be used
+        proc = _mock_proc(stdout=ingest_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ) as mock_run:
+            blm_ingest_cora(
+                xlsx_path=str(xlsx_file),
+                project_name="P",
+                deps=deps,
+                branded_plus_ratio=None,
+            )
+        args = mock_run.call_args[0][0]
+        # 0.7 is the default, so -bp should NOT appear in args
+        assert "-bp" not in args
+
+    def test_caller_override_branded_plus_ratio(
+        self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str
+    ):
+        proc = _mock_proc(stdout=ingest_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ) as mock_run:
+            blm_ingest_cora(
+                xlsx_path=str(xlsx_file),
+                project_name="P",
+                deps=deps,
+                branded_plus_ratio=0.85,
+            )
+        args = mock_run.call_args[0][0]
+        assert "-bp" in args
+        assert args[args.index("-bp") + 1] == "0.85"
+
+
+# ---------------------------------------------------------------------------
+# blm_generate_batch
+# ---------------------------------------------------------------------------
+
+
+class TestBlmGenerateBatch:
+    def test_missing_job_file_arg(self, deps: Deps):
+        result = blm_generate_batch(job_file="", deps=deps)
+        assert result.ok is False
+        assert "job_file is required" in result.error
+
+    def test_job_file_does_not_exist(self, deps: Deps):
+        result = blm_generate_batch(
+            job_file="/definitely/not/here.json", deps=deps
+        )
+        assert result.ok is False
+        assert "not found" in result.error
+
+    def test_relative_path_resolved_against_blm_dir(
+        self, deps: Deps, generate_success_stdout: str
+    ):
+        # Create a relative job file under the fake BLM dir
+        job_rel = "jobs/x.json"
+        (Path(deps.blm.blm_dir) / "jobs").mkdir()
+        (Path(deps.blm.blm_dir) / job_rel).write_text("{}")
+
+        proc = _mock_proc(stdout=generate_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ) as mock_run:
+            result = blm_generate_batch(job_file=job_rel, deps=deps)
+        assert result.ok is True
+
+        # The resolved absolute path should have been passed to BLM
+        args = mock_run.call_args[0][0]
+        j_index = args.index("-j")
+        passed_path = args[j_index + 1]
+        assert passed_path.endswith("x.json")
+        assert Path(passed_path).is_absolute()
+
+    def test_continue_on_error_flag_default(
+        self, deps: Deps, tmp_path: Path, generate_success_stdout: str
+    ):
+        job = tmp_path / "job.json"
+        job.write_text("{}")
+
+        proc = _mock_proc(stdout=generate_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ) as mock_run:
+            blm_generate_batch(job_file=str(job), deps=deps)
+        args = mock_run.call_args[0][0]
+        assert "--continue-on-error" in args
+
+    def test_continue_on_error_disabled(
+        self, deps: Deps, tmp_path: Path, generate_success_stdout: str
+    ):
+        job = tmp_path / "job.json"
+        job.write_text("{}")
+
+        proc = _mock_proc(stdout=generate_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ) as mock_run:
+            blm_generate_batch(
+                job_file=str(job), deps=deps, continue_on_error=False
+            )
+        args = mock_run.call_args[0][0]
+        assert "--continue-on-error" not in args
+
+    def test_debug_flag(self, deps: Deps, tmp_path: Path, generate_success_stdout: str):
+        job = tmp_path / "job.json"
+        job.write_text("{}")
+        proc = _mock_proc(stdout=generate_success_stdout)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ) as mock_run:
+            blm_generate_batch(job_file=str(job), deps=deps, debug=True)
+        assert "--debug" in mock_run.call_args[0][0]
+
+    def test_nonzero_exit(self, deps: Deps, tmp_path: Path):
+        job = tmp_path / "job.json"
+        job.write_text("{}")
+
+        proc = _mock_proc(stdout="", stderr="fail", returncode=2)
+        with patch(
+            "link_building_workflow.blm.run_blm_command", return_value=proc
+        ):
+            result = blm_generate_batch(job_file=str(job), deps=deps)
+        assert result.ok is False
+        assert "exit=2" in result.error
+
+    def test_timeout(self, deps: Deps, tmp_path: Path):
+        job = tmp_path / "job.json"
+        job.write_text("{}")
+        with patch(
+            "link_building_workflow.blm.run_blm_command",
+            side_effect=subprocess.TimeoutExpired(cmd="python", timeout=300),
+        ):
+            result = blm_generate_batch(job_file=str(job), deps=deps)
+        assert result.ok is False
+        assert "timed out" in result.error
+
+
+# ---------------------------------------------------------------------------
+# run_cora_backlinks (full pipeline)
+# ---------------------------------------------------------------------------
+
+
+class TestRunCoraBacklinks:
+    def test_missing_money_site_url(self, deps: Deps, xlsx_file: Path):
+        result = run_cora_backlinks(
+            xlsx_path=str(xlsx_file),
+            project_name="P",
+            money_site_url="",
+            deps=deps,
+        )
+        assert result.ok is False
+        assert "IMSURL" in result.error
+
+    def test_full_success(
+        self,
+        deps: Deps,
+        xlsx_file: Path,
+        ingest_success_stdout: str,
+        generate_success_stdout: str,
+    ):
+        # ingest stdout must reference a job file that then exists on disk
+        # for blm_generate_batch's existence check to pass.
+        job_rel = "jobs/test-project.json"
+        (Path(deps.blm.blm_dir) / "jobs").mkdir()
+        (Path(deps.blm.blm_dir) / job_rel).write_text("{}")
+
+        procs = [
+            _mock_proc(stdout=ingest_success_stdout),
+            _mock_proc(stdout=generate_success_stdout),
+        ]
+        with patch(
+            "link_building_workflow.blm.run_blm_command", side_effect=procs
+        ) as mock_run:
+            result = run_cora_backlinks(
+                xlsx_path=str(xlsx_file),
+                project_name="Test Project",
+                money_site_url="https://example.com",
+                deps=deps,
+            )
+
+        assert result.ok is True
+        assert result.step == "complete"
+        assert result.ingest is not None
+        assert result.generate is not None
+        assert result.ingest.project_id == "42"
+        assert result.generate.job_moved_to == "jobs/done/test-project.json"
+        assert result.job_file == "jobs/done/test-project.json"
+        assert "Step 1" in result.summary and "Step 2" in result.summary
+
+        # BLM was invoked twice (ingest, generate)
+        assert mock_run.call_count == 2
+        ingest_args = mock_run.call_args_list[0][0][0]
+        generate_args = mock_run.call_args_list[1][0][0]
+        assert "ingest-cora" in ingest_args
+        assert "generate-batch" in generate_args
+
+    def test_ingest_failure_skips_generate(
+        self, deps: Deps, xlsx_file: Path
+    ):
+        procs = [_mock_proc(stdout="", stderr="fail", returncode=1)]
+        with patch(
+            "link_building_workflow.blm.run_blm_command", side_effect=procs
+        ) as mock_run:
+            result = run_cora_backlinks(
+                xlsx_path=str(xlsx_file),
+                project_name="P",
+                money_site_url="https://example.com",
+                deps=deps,
+            )
+        assert result.ok is False
+        assert result.step == "ingest"
+        assert mock_run.call_count == 1  # generate not called
+
+    def test_generate_failure_preserves_ingest(
+        self, deps: Deps, xlsx_file: Path, ingest_success_stdout: str
+    ):
+        job_rel = "jobs/test-project.json"
+        (Path(deps.blm.blm_dir) / "jobs").mkdir()
+        (Path(deps.blm.blm_dir) / job_rel).write_text("{}")
+
+        procs = [
+            _mock_proc(stdout=ingest_success_stdout),
+            _mock_proc(stdout="", stderr="gen fail", returncode=3),
+        ]
+        with patch(
+            "link_building_workflow.blm.run_blm_command", side_effect=procs
+        ):
+            result = run_cora_backlinks(
+                xlsx_path=str(xlsx_file),
+                project_name="Test Project",
+                money_site_url="https://example.com",
+                deps=deps,
+            )
+        assert result.ok is False
+        assert result.step == "generate"
+        # Ingest succeeded; its data is still on the result
+        assert result.ingest is not None
+        assert result.ingest.project_id == "42"
+        assert "gen fail" in result.error
+
+    def test_on_progress_callback_invoked(
+        self,
+        deps: Deps,
+        xlsx_file: Path,
+        ingest_success_stdout: str,
+        generate_success_stdout: str,
+    ):
+        job_rel = "jobs/test-project.json"
+        (Path(deps.blm.blm_dir) / "jobs").mkdir()
+        (Path(deps.blm.blm_dir) / job_rel).write_text("{}")
+
+        progress_calls: list[str] = []
+        procs = [
+            _mock_proc(stdout=ingest_success_stdout),
+            _mock_proc(stdout=generate_success_stdout),
+        ]
+        with patch(
+            "link_building_workflow.blm.run_blm_command", side_effect=procs
+        ):
+            result = run_cora_backlinks(
+                xlsx_path=str(xlsx_file),
+                project_name="Test Project",
+                money_site_url="https://example.com",
+                deps=deps,
+                on_progress=progress_calls.append,
+            )
+        assert result.ok is True
+        assert len(progress_calls) >= 2
+        assert any("Step 1" in m for m in progress_calls)
+        assert any("Step 2" in m for m in progress_calls)
+        # log_lines mirrors progress_calls
+        assert result.log_lines == progress_calls
+
+    def test_on_progress_exception_does_not_break_pipeline(
+        self,
+        deps: Deps,
+        xlsx_file: Path,
+        ingest_success_stdout: str,
+        generate_success_stdout: str,
+    ):
+        job_rel = "jobs/test-project.json"
+        (Path(deps.blm.blm_dir) / "jobs").mkdir()
+        (Path(deps.blm.blm_dir) / job_rel).write_text("{}")
+
+        def broken(_msg: str) -> None:
+            raise RuntimeError("progress callback failed")
+
+        procs = [
+            _mock_proc(stdout=ingest_success_stdout),
+            _mock_proc(stdout=generate_success_stdout),
+        ]
+        with patch(
+            "link_building_workflow.blm.run_blm_command", side_effect=procs
+        ):
+            result = run_cora_backlinks(
+                xlsx_path=str(xlsx_file),
+                project_name="Test Project",
+                money_site_url="https://example.com",
+                deps=deps,
+                on_progress=broken,
+            )
+        # Pipeline still completed successfully despite broken callback
+        assert result.ok is True
+
+    def test_uses_config_default_ratio_when_none(
+        self,
+        deps: Deps,
+        xlsx_file: Path,
+        ingest_success_stdout: str,
+        generate_success_stdout: str,
+    ):
+        # Verify a non-default BLMConfig.default_branded_plus_ratio reaches the ingest args
+        blm_cfg = BLMConfig(
+            blm_dir=deps.blm.blm_dir,
+            username=deps.blm.username,
+            password=deps.blm.password,
+            timeout_seconds=deps.blm.timeout_seconds,
+            default_branded_plus_ratio=0.9,  # non-default
+            python_exe=deps.blm.python_exe,
+        )
+        new_deps = Deps(blm=blm_cfg, llm_check=deps.llm_check)
+
+        job_rel = "jobs/test-project.json"
+        (Path(blm_cfg.blm_dir) / "jobs").mkdir()
+        (Path(blm_cfg.blm_dir) / job_rel).write_text("{}")
+
+        procs = [
+            _mock_proc(stdout=ingest_success_stdout),
+            _mock_proc(stdout=generate_success_stdout),
+        ]
+        with patch(
+            "link_building_workflow.blm.run_blm_command", side_effect=procs
+        ) as mock_run:
+            run_cora_backlinks(
+                xlsx_path=str(xlsx_file),
+                project_name="Test Project",
+                money_site_url="https://example.com",
+                deps=new_deps,
+                branded_plus_ratio=None,  # should pick up 0.9 default
+            )
+        ingest_args = mock_run.call_args_list[0][0][0]
+        assert "-bp" in ingest_args
+        assert ingest_args[ingest_args.index("-bp") + 1] == "0.9"