Replace fuzzy keyword matching with exact match + LLM plural check

Substring and word-overlap matching caused cross-contamination between similar keywords (e.g. "shaft manufacturing" matching "custom shaft manufacturing"). Now only exact matches pass immediately; non-exact pairs are checked for a singular/plural-only difference via an OpenRouter LLM call, with results cached per session.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 18:53:24 -05:00 · 2026-03-18 18:53:24 -05:00 · 48d2e47835
parent b857d3cb8c
commit 48d2e47835
3 changed files with 93 additions and 29 deletions
--- a/cheddahbot/scheduler.py
+++ b/cheddahbot/scheduler.py
@ -8,6 +8,8 @@ import logging
 import re
 import shutil
 import threading
+
+import httpx
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING
@ -87,6 +89,60 @@ class Scheduler:
        }
        self._active_executions: dict[str, dict] = {}
        self._active_lock = threading.Lock()
+        self._plural_cache: dict[tuple[str, str], bool] = {}
+
+    def _llm_plural_check(self, a: str, b: str) -> bool:
+        """Ask the chat model whether two keywords differ only by plural form.
+
+        Posts to the OpenRouter chat-completions endpoint using
+        ``self.config.chat_model`` and ``self.config.openrouter_api_key``.
+        A clear YES/NO reply is cached in ``self._plural_cache`` under an
+        order-independent key, so a pair that has been answered is never
+        asked again. Failures (no API key, request/HTTP error, malformed or
+        unparseable reply) return False WITHOUT being cached, so a transient
+        outage cannot suppress a valid match for the rest of the session.
+
+        Args:
+            a: First keyword to compare.
+            b: Second keyword to compare.
+
+        Returns:
+            True only when the model's reply begins with YES; False for a NO
+            reply and for every failure path.
+        """
+        # Order-independent key: (a, b) and (b, a) share one cache entry.
+        key = (a, b) if a <= b else (b, a)
+        if key in self._plural_cache:
+            return self._plural_cache[key]
+
+        api_key = self.config.openrouter_api_key
+        model = self.config.chat_model
+        if not api_key:
+            log.warning("LLM plural check: no OpenRouter API key, returning False")
+            return False
+
+        try:
+            resp = httpx.post(
+                "https://openrouter.ai/api/v1/chat/completions",
+                headers={"Authorization": f"Bearer {api_key}"},
+                json={
+                    "model": model,
+                    "max_tokens": 5,
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": (
+                                "You compare SEO keywords. Reply with ONLY 'YES' or 'NO'. "
+                                "Answer YES only if the two keywords are identical except for "
+                                "singular vs plural word forms (e.g. 'shaft' vs 'shafts', "
+                                "'company' vs 'companies'). Answer NO if they differ in any "
+                                "other way (extra words, different words, different meaning)."
+                            ),
+                        },
+                        {
+                            "role": "user",
+                            "content": f'Keyword A: "{a}"\nKeyword B: "{b}"',
+                        },
+                    ],
+                },
+                timeout=15,
+            )
+            resp.raise_for_status()
+            answer = resp.json()["choices"][0]["message"]["content"].strip()
+        except Exception as e:
+            # Best-effort boundary: a failed lookup must not break matching.
+            # Deliberately not cached, so the pair is retried on a later call.
+            log.warning("LLM plural check failed for '%s' vs '%s': %s", a, b, e)
+            return False
+
+        # Judge by the leading token, not a substring: "NO, not YES" is a NO.
+        verdict = answer.lstrip("'\"*` ").upper()
+        if verdict.startswith("YES"):
+            result = True
+        elif verdict.startswith("NO"):
+            result = False
+        else:
+            # An empty or off-script reply is not a verdict; do not cache it.
+            log.warning(
+                "LLM plural check: unparseable reply for '%s' vs '%s': %r", a, b, answer
+            )
+            return False
+
+        log.debug("LLM plural check: '%s' vs '%s' → %s (%s)", a, b, result, answer)
+        self._plural_cache[key] = result
+        return result

    def start(self):
        """Start the scheduler, heartbeat, and ClickUp threads."""
@ -910,7 +966,7 @@ class Scheduler:
                continue

            keyword_norm = _normalize_for_match(str(keyword))
-            if _fuzzy_keyword_match(normalized_stem, keyword_norm):
+            if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
                return task

        return None
@ -1083,7 +1139,7 @@ class Scheduler:
                continue

            keyword_norm = _normalize_for_match(str(keyword))
-            if _fuzzy_keyword_match(normalized_stem, keyword_norm):
+            if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
                return task

        return None
@ -1170,7 +1226,7 @@ class Scheduler:
                continue

            keyword_norm = _normalize_for_match(str(keyword))
-            if not _fuzzy_keyword_match(stem, keyword_norm):
+            if not _fuzzy_keyword_match(stem, keyword_norm, self._llm_plural_check):
                continue

            matched_names.append(task.name)
@ -1296,7 +1352,7 @@ class Scheduler:
                if not keyword:
                    continue
                keyword_norm = _normalize_for_match(str(keyword))
-                if _fuzzy_keyword_match(normalized_stem, keyword_norm):
+                if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
                    task_ids.append(task.id)

        # Post comments
@ -1484,7 +1540,7 @@ class Scheduler:
            # Find which folders have a matching file
            matched_folders: set[str] = set()
            for stem, locs in file_locations.items():
-                if _fuzzy_keyword_match(keyword_norm, stem):
+                if _fuzzy_keyword_match(keyword_norm, stem, self._llm_plural_check):
                    matched_folders.update(locs)

            if not matched_folders:
--- a/cheddahbot/tools/linkbuilding.py
+++ b/cheddahbot/tools/linkbuilding.py
@ -10,6 +10,7 @@ import logging
 import os
 import re
 import subprocess
+from collections.abc import Callable
 from pathlib import Path

 from . import tool
@ -264,26 +265,20 @@ def _normalize_for_match(text: str) -> str:
    return text


-def _fuzzy_keyword_match(a: str, b: str) -> bool:
-    """Check if two normalized strings are a fuzzy match.
+def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool:
+    """Check if two normalized strings match, allowing singular/plural differences.

-    Matches if: exact, substring in either direction, or >80% word overlap.
+    Fast path: exact match after normalization.
+    Slow path: ask an LLM if the two keywords are the same aside from plural form.
+    Falls back to False if no llm_check is provided and strings differ.
    """
    if not a or not b:
        return False
    if a == b:
        return True
-    if a in b or b in a:
-        return True
-
-    # Word overlap check
-    words_a = set(a.split())
-    words_b = set(b.split())
-    if not words_a or not words_b:
+    if llm_check is None:
        return False
-    overlap = len(words_a & words_b)
-    min_len = min(len(words_a), len(words_b))
-    return overlap / min_len >= 0.8 if min_len > 0 else False
+    return llm_check(a, b)


 def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None:
--- a/tests/test_linkbuilding.py
+++ b/tests/test_linkbuilding.py
@ -227,23 +227,36 @@ class TestFuzzyKeywordMatch:
    def test_exact_match(self):
        assert _fuzzy_keyword_match("precision cnc", "precision cnc") is True

-    def test_substring_match_a_in_b(self):
-        assert _fuzzy_keyword_match("cnc machining", "precision cnc machining services") is True
-
-    def test_substring_match_b_in_a(self):
-        assert _fuzzy_keyword_match("precision cnc machining services", "cnc machining") is True
-
-    def test_word_overlap(self):
-        assert _fuzzy_keyword_match("precision cnc machining", "cnc machining precision") is True
-
-    def test_no_match(self):
-        assert _fuzzy_keyword_match("precision cnc", "web design agency") is False
+    def test_no_match_without_llm(self):
+        """Without an llm_check, non-exact strings return False."""
+        assert _fuzzy_keyword_match("shaft", "shafts") is False
+        assert _fuzzy_keyword_match("shaft manufacturing", "custom shaft manufacturing") is False

    def test_empty_strings(self):
        assert _fuzzy_keyword_match("", "test") is False
        assert _fuzzy_keyword_match("test", "") is False
        assert _fuzzy_keyword_match("", "") is False

+    def test_llm_check_called_on_mismatch(self):
+        """When strings differ, llm_check is called and its result is returned."""
+        llm_yes = lambda a, b: True
+        llm_no = lambda a, b: False
+
+        assert _fuzzy_keyword_match("shaft", "shafts", llm_check=llm_yes) is True
+        assert _fuzzy_keyword_match("shaft", "shafts", llm_check=llm_no) is False
+
+    def test_llm_check_not_called_on_exact(self):
+        """Exact match should not call llm_check."""
+        def boom(a, b):
+            raise AssertionError("should not be called")
+
+        assert _fuzzy_keyword_match("shaft", "shaft", llm_check=boom) is True
+
+    def test_no_substring_match_without_llm(self):
+        """Substring matching is gone — different keywords must not match."""
+        assert _fuzzy_keyword_match("shaft manufacturing", "custom shaft manufacturing") is False
+        assert _fuzzy_keyword_match("cnc machining", "precision cnc machining services") is False
+

 class TestNormalizeForMatch:
    def test_lowercase_and_strip(self):