Replace fuzzy keyword matching with exact match + LLM plural check

Substring and word-overlap matching caused cross-contamination between
similar keywords (e.g. "shaft manufacturing" matching "custom shaft
manufacturing"). Now only exact matches pass immediately; non-exact
pairs are sent to an LLM via OpenRouter, which accepts them only when they
differ solely in singular vs. plural form. Results are cached per session.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
PeninsulaInd 2026-03-18 18:53:24 -05:00
parent b857d3cb8c
commit 48d2e47835
3 changed files with 93 additions and 29 deletions

View File

@ -8,6 +8,8 @@ import logging
import re
import shutil
import threading
import httpx
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING
@ -87,6 +89,60 @@ class Scheduler:
}
self._active_executions: dict[str, dict] = {}
self._active_lock = threading.Lock()
self._plural_cache: dict[tuple[str, str], bool] = {}
def _llm_plural_check(self, a: str, b: str) -> bool:
    """Ask the chat brain if two keywords are the same aside from plural form.

    Sends one chat-completion request to OpenRouter using the configured
    CHEDDAH_CHAT_MODEL (``self.config.chat_model``). Definitive answers are
    stored in ``self._plural_cache`` under an order-independent key, so a
    given pair is asked at most once per session.

    Args:
        a: First keyword.
        b: Second keyword.

    Returns:
        True only when the model's reply begins with "YES". False when the
        reply is anything else, when no OpenRouter API key is configured,
        or when the request or response parsing fails.
    """
    # Sort the pair so (a, b) and (b, a) share one cache slot.
    key = (a, b) if a <= b else (b, a)
    cached = self._plural_cache.get(key)
    if cached is not None:
        return cached

    api_key = self.config.openrouter_api_key
    if not api_key:
        log.warning("LLM plural check: no OpenRouter API key, returning False")
        return False

    system_prompt = (
        "You compare SEO keywords. Reply with ONLY 'YES' or 'NO'. "
        "Answer YES only if the two keywords are identical except for "
        "singular vs plural word forms (e.g. 'shaft' vs 'shafts', "
        "'company' vs 'companies'). Answer NO if they differ in any "
        "other way (extra words, different words, different meaning)."
    )
    try:
        resp = httpx.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": self.config.chat_model,
                "max_tokens": 5,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": f'Keyword A: "{a}"\nKeyword B: "{b}"',
                    },
                ],
            },
            timeout=15,
        )
        resp.raise_for_status()
        answer = resp.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best-effort boundary: any transport, HTTP, or payload-shape problem
        # is treated as "no match". The failure is deliberately NOT cached,
        # so a transient outage cannot pin this pair to False for the rest
        # of the session.
        log.warning("LLM plural check failed for '%s' vs '%s': %s", a, b, e)
        return False

    # Only a reply that *starts* with YES counts (leading quotes/markdown are
    # tolerated). A plain substring test would accept "NO, not YES".
    result = re.match(r"\W*yes\b", answer, re.IGNORECASE) is not None
    log.debug("LLM plural check: '%s' vs '%s' -> %s (%s)", a, b, result, answer)
    self._plural_cache[key] = result
    return result
def start(self):
"""Start the scheduler, heartbeat, and ClickUp threads."""
@ -910,7 +966,7 @@ class Scheduler:
continue
keyword_norm = _normalize_for_match(str(keyword))
if _fuzzy_keyword_match(normalized_stem, keyword_norm):
if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
return task
return None
@ -1083,7 +1139,7 @@ class Scheduler:
continue
keyword_norm = _normalize_for_match(str(keyword))
if _fuzzy_keyword_match(normalized_stem, keyword_norm):
if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
return task
return None
@ -1170,7 +1226,7 @@ class Scheduler:
continue
keyword_norm = _normalize_for_match(str(keyword))
if not _fuzzy_keyword_match(stem, keyword_norm):
if not _fuzzy_keyword_match(stem, keyword_norm, self._llm_plural_check):
continue
matched_names.append(task.name)
@ -1296,7 +1352,7 @@ class Scheduler:
if not keyword:
continue
keyword_norm = _normalize_for_match(str(keyword))
if _fuzzy_keyword_match(normalized_stem, keyword_norm):
if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
task_ids.append(task.id)
# Post comments
@ -1484,7 +1540,7 @@ class Scheduler:
# Find which folders have a matching file
matched_folders: set[str] = set()
for stem, locs in file_locations.items():
if _fuzzy_keyword_match(keyword_norm, stem):
if _fuzzy_keyword_match(keyword_norm, stem, self._llm_plural_check):
matched_folders.update(locs)
if not matched_folders:

View File

@ -10,6 +10,7 @@ import logging
import os
import re
import subprocess
from collections.abc import Callable
from pathlib import Path
from . import tool
@ -264,26 +265,20 @@ def _normalize_for_match(text: str) -> str:
return text
def _fuzzy_keyword_match(a: str, b: str) -> bool:
"""Check if two normalized strings are a fuzzy match.
def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool:
"""Check if two normalized strings match, allowing singular/plural differences.
Matches if: exact, substring in either direction, or >80% word overlap.
Fast path: exact match after normalization.
Slow path: ask an LLM if the two keywords are the same aside from plural form.
Falls back to False if no llm_check is provided and strings differ.
"""
if not a or not b:
return False
if a == b:
return True
if a in b or b in a:
return True
# Word overlap check
words_a = set(a.split())
words_b = set(b.split())
if not words_a or not words_b:
if llm_check is None:
return False
overlap = len(words_a & words_b)
min_len = min(len(words_a), len(words_b))
return overlap / min_len >= 0.8 if min_len > 0 else False
return llm_check(a, b)
def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None:

View File

@ -227,23 +227,36 @@ class TestFuzzyKeywordMatch:
def test_exact_match(self):
    """Two identical keyword strings are reported as a match."""
    outcome = _fuzzy_keyword_match("precision cnc", "precision cnc")
    assert outcome is True
# NOTE(review): the three tests below expect substring / word-overlap pairs to
# match. That conflicts with test_no_substring_match_without_llm, which asserts
# False for the same "cnc machining" pair. They look like the pre-change
# expectations shown as removed lines in this diff; confirm before relying on them.
def test_substring_match_a_in_b(self):
assert _fuzzy_keyword_match("cnc machining", "precision cnc machining services") is True
def test_substring_match_b_in_a(self):
assert _fuzzy_keyword_match("precision cnc machining services", "cnc machining") is True
def test_word_overlap(self):
assert _fuzzy_keyword_match("precision cnc machining", "cnc machining precision") is True
def test_no_match(self):
    """Unrelated keywords are rejected when no LLM callback is supplied."""
    outcome = _fuzzy_keyword_match("precision cnc", "web design agency")
    assert outcome is False
def test_no_match_without_llm(self):
    """Without an llm_check, non-exact strings return False."""
    near_misses = (
        ("shaft", "shafts"),
        ("shaft manufacturing", "custom shaft manufacturing"),
    )
    for left, right in near_misses:
        assert _fuzzy_keyword_match(left, right) is False
def test_empty_strings(self):
    """An empty string on either side (or both) never matches."""
    empty_cases = (("", "test"), ("test", ""), ("", ""))
    for left, right in empty_cases:
        assert _fuzzy_keyword_match(left, right) is False
def test_llm_check_called_on_mismatch(self):
    """When strings differ, llm_check is called and its result is returned."""
    calls: list[tuple[str, str]] = []

    def make_check(verdict: bool):
        # Build a callback that records its arguments, so the test proves the
        # callback was really invoked rather than only checking the return.
        def check(a: str, b: str) -> bool:
            calls.append((a, b))
            return verdict

        return check

    assert _fuzzy_keyword_match("shaft", "shafts", llm_check=make_check(True)) is True
    assert _fuzzy_keyword_match("shaft", "shafts", llm_check=make_check(False)) is False
    # One invocation per mismatching comparison, with the keywords passed through.
    assert calls == [("shaft", "shafts"), ("shaft", "shafts")]
def test_llm_check_not_called_on_exact(self):
    """Identical strings must short-circuit before the callback is consulted."""

    def fail_if_called(a, b):
        raise AssertionError("should not be called")

    matched = _fuzzy_keyword_match("shaft", "shaft", llm_check=fail_if_called)
    assert matched is True
def test_no_substring_match_without_llm(self):
    """Substring matching is gone — different keywords must not match."""
    contained_pairs = (
        ("shaft manufacturing", "custom shaft manufacturing"),
        ("cnc machining", "precision cnc machining services"),
    )
    for shorter, longer in contained_pairs:
        assert _fuzzy_keyword_match(shorter, longer) is False
class TestNormalizeForMatch:
def test_lowercase_and_strip(self):