Replace fuzzy keyword matching with exact match + LLM plural check

Substring and word-overlap matching caused cross-contamination between
similar keywords (e.g. "shaft manufacturing" matching "custom shaft
manufacturing"). Now only exact matches pass immediately; non-exact
pairs are checked via an OpenRouter LLM call, with results cached for the session.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
PeninsulaInd 2026-03-18 18:53:24 -05:00
parent b857d3cb8c
commit 48d2e47835
3 changed files with 93 additions and 29 deletions

View File

@ -8,6 +8,8 @@ import logging
import re import re
import shutil import shutil
import threading import threading
import httpx
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@ -87,6 +89,60 @@ class Scheduler:
} }
self._active_executions: dict[str, dict] = {} self._active_executions: dict[str, dict] = {}
self._active_lock = threading.Lock() self._active_lock = threading.Lock()
self._plural_cache: dict[tuple[str, str], bool] = {}
def _llm_plural_check(self, a: str, b: str) -> bool:
"""Ask the chat brain if two keywords are the same aside from plural form.
Uses OpenRouter with the configured CHEDDAH_CHAT_MODEL. Results are
cached for the session to avoid repeat calls.
"""
key = (a, b) if a <= b else (b, a)
if key in self._plural_cache:
return self._plural_cache[key]
api_key = self.config.openrouter_api_key
model = self.config.chat_model
if not api_key:
log.warning("LLM plural check: no OpenRouter API key, returning False")
return False
try:
resp = httpx.post(
"https://openrouter.ai/api/v1/chat/completions",
headers={"Authorization": f"Bearer {api_key}"},
json={
"model": model,
"max_tokens": 5,
"messages": [
{
"role": "system",
"content": (
"You compare SEO keywords. Reply with ONLY 'YES' or 'NO'. "
"Answer YES only if the two keywords are identical except for "
"singular vs plural word forms (e.g. 'shaft' vs 'shafts', "
"'company' vs 'companies'). Answer NO if they differ in any "
"other way (extra words, different words, different meaning)."
),
},
{
"role": "user",
"content": f'Keyword A: "{a}"\nKeyword B: "{b}"',
},
],
},
timeout=15,
)
resp.raise_for_status()
answer = resp.json()["choices"][0]["message"]["content"].strip()
result = "YES" in answer.upper()
log.debug("LLM plural check: '%s' vs '%s'%s (%s)", a, b, result, answer)
except Exception as e:
log.warning("LLM plural check failed for '%s' vs '%s': %s", a, b, e)
result = False
self._plural_cache[key] = result
return result
def start(self): def start(self):
"""Start the scheduler, heartbeat, and ClickUp threads.""" """Start the scheduler, heartbeat, and ClickUp threads."""
@ -910,7 +966,7 @@ class Scheduler:
continue continue
keyword_norm = _normalize_for_match(str(keyword)) keyword_norm = _normalize_for_match(str(keyword))
if _fuzzy_keyword_match(normalized_stem, keyword_norm): if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
return task return task
return None return None
@ -1083,7 +1139,7 @@ class Scheduler:
continue continue
keyword_norm = _normalize_for_match(str(keyword)) keyword_norm = _normalize_for_match(str(keyword))
if _fuzzy_keyword_match(normalized_stem, keyword_norm): if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
return task return task
return None return None
@ -1170,7 +1226,7 @@ class Scheduler:
continue continue
keyword_norm = _normalize_for_match(str(keyword)) keyword_norm = _normalize_for_match(str(keyword))
if not _fuzzy_keyword_match(stem, keyword_norm): if not _fuzzy_keyword_match(stem, keyword_norm, self._llm_plural_check):
continue continue
matched_names.append(task.name) matched_names.append(task.name)
@ -1296,7 +1352,7 @@ class Scheduler:
if not keyword: if not keyword:
continue continue
keyword_norm = _normalize_for_match(str(keyword)) keyword_norm = _normalize_for_match(str(keyword))
if _fuzzy_keyword_match(normalized_stem, keyword_norm): if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
task_ids.append(task.id) task_ids.append(task.id)
# Post comments # Post comments
@ -1484,7 +1540,7 @@ class Scheduler:
# Find which folders have a matching file # Find which folders have a matching file
matched_folders: set[str] = set() matched_folders: set[str] = set()
for stem, locs in file_locations.items(): for stem, locs in file_locations.items():
if _fuzzy_keyword_match(keyword_norm, stem): if _fuzzy_keyword_match(keyword_norm, stem, self._llm_plural_check):
matched_folders.update(locs) matched_folders.update(locs)
if not matched_folders: if not matched_folders:

View File

@ -10,6 +10,7 @@ import logging
import os import os
import re import re
import subprocess import subprocess
from collections.abc import Callable
from pathlib import Path from pathlib import Path
from . import tool from . import tool
@ -264,26 +265,20 @@ def _normalize_for_match(text: str) -> str:
return text return text
def _fuzzy_keyword_match(a: str, b: str) -> bool: def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool:
"""Check if two normalized strings are a fuzzy match. """Check if two normalized strings match, allowing singular/plural differences.
Matches if: exact, substring in either direction, or >80% word overlap. Fast path: exact match after normalization.
Slow path: ask an LLM if the two keywords are the same aside from plural form.
Falls back to False if no llm_check is provided and strings differ.
""" """
if not a or not b: if not a or not b:
return False return False
if a == b: if a == b:
return True return True
if a in b or b in a: if llm_check is None:
return True
# Word overlap check
words_a = set(a.split())
words_b = set(b.split())
if not words_a or not words_b:
return False return False
overlap = len(words_a & words_b) return llm_check(a, b)
min_len = min(len(words_a), len(words_b))
return overlap / min_len >= 0.8 if min_len > 0 else False
def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None: def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None:

View File

@ -227,23 +227,36 @@ class TestFuzzyKeywordMatch:
def test_exact_match(self):
    """Identical normalized strings match without any LLM involvement."""
    assert _fuzzy_keyword_match("precision cnc", "precision cnc") is True
def test_no_match_without_llm(self):
    """Without an llm_check, non-exact strings return False."""
    assert _fuzzy_keyword_match("shaft", "shafts") is False
    assert _fuzzy_keyword_match("shaft manufacturing", "custom shaft manufacturing") is False
def test_empty_strings(self):
    """An empty string on either side (or both) is never a match."""
    assert _fuzzy_keyword_match("", "test") is False
    assert _fuzzy_keyword_match("test", "") is False
    assert _fuzzy_keyword_match("", "") is False
def test_llm_check_called_on_mismatch(self):
    """A non-exact pair defers to llm_check and returns whatever it answers."""
    def always_yes(a, b):
        return True

    def always_no(a, b):
        return False

    accepted = _fuzzy_keyword_match("shaft", "shafts", llm_check=always_yes)
    rejected = _fuzzy_keyword_match("shaft", "shafts", llm_check=always_no)
    assert accepted is True
    assert rejected is False
def test_llm_check_not_called_on_exact(self):
    """Identical strings short-circuit before llm_check is consulted."""
    def fail_if_invoked(a, b):
        raise AssertionError("should not be called")

    matched = _fuzzy_keyword_match("shaft", "shaft", llm_check=fail_if_invoked)
    assert matched is True
def test_no_substring_match_without_llm(self):
    """Containment alone is not a match: a keyword inside a longer one is a miss."""
    contained_pairs = (
        ("shaft manufacturing", "custom shaft manufacturing"),
        ("cnc machining", "precision cnc machining services"),
    )
    for shorter, longer in contained_pairs:
        assert _fuzzy_keyword_match(shorter, longer) is False
class TestNormalizeForMatch: class TestNormalizeForMatch:
def test_lowercase_and_strip(self): def test_lowercase_and_strip(self):