Replace fuzzy keyword matching with exact match + LLM plural check
Substring and word-overlap matching caused cross-contamination between similar keywords (e.g. "shaft manufacturing" matching "custom shaft manufacturing"). Now only exact matches pass immediately; non-exact pairs are checked via an OpenRouter LLM call with session-level caching, and match only if they differ solely in singular vs. plural form.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
parent
b857d3cb8c
commit
48d2e47835
|
|
@ -8,6 +8,8 @@ import logging
|
|||
import re
|
||||
import shutil
|
||||
import threading
|
||||
|
||||
import httpx
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
|
@ -87,6 +89,60 @@ class Scheduler:
|
|||
}
|
||||
self._active_executions: dict[str, dict] = {}
|
||||
self._active_lock = threading.Lock()
|
||||
self._plural_cache: dict[tuple[str, str], bool] = {}
|
||||
|
||||
def _llm_plural_check(self, a: str, b: str) -> bool:
|
||||
"""Ask the chat brain if two keywords are the same aside from plural form.
|
||||
|
||||
Uses OpenRouter with the configured CHEDDAH_CHAT_MODEL. Results are
|
||||
cached for the session to avoid repeat calls.
|
||||
"""
|
||||
key = (a, b) if a <= b else (b, a)
|
||||
if key in self._plural_cache:
|
||||
return self._plural_cache[key]
|
||||
|
||||
api_key = self.config.openrouter_api_key
|
||||
model = self.config.chat_model
|
||||
if not api_key:
|
||||
log.warning("LLM plural check: no OpenRouter API key, returning False")
|
||||
return False
|
||||
|
||||
try:
|
||||
resp = httpx.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
headers={"Authorization": f"Bearer {api_key}"},
|
||||
json={
|
||||
"model": model,
|
||||
"max_tokens": 5,
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You compare SEO keywords. Reply with ONLY 'YES' or 'NO'. "
|
||||
"Answer YES only if the two keywords are identical except for "
|
||||
"singular vs plural word forms (e.g. 'shaft' vs 'shafts', "
|
||||
"'company' vs 'companies'). Answer NO if they differ in any "
|
||||
"other way (extra words, different words, different meaning)."
|
||||
),
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f'Keyword A: "{a}"\nKeyword B: "{b}"',
|
||||
},
|
||||
],
|
||||
},
|
||||
timeout=15,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
answer = resp.json()["choices"][0]["message"]["content"].strip()
|
||||
result = "YES" in answer.upper()
|
||||
log.debug("LLM plural check: '%s' vs '%s' → %s (%s)", a, b, result, answer)
|
||||
except Exception as e:
|
||||
log.warning("LLM plural check failed for '%s' vs '%s': %s", a, b, e)
|
||||
result = False
|
||||
|
||||
self._plural_cache[key] = result
|
||||
return result
|
||||
|
||||
def start(self):
|
||||
"""Start the scheduler, heartbeat, and ClickUp threads."""
|
||||
|
|
@ -910,7 +966,7 @@ class Scheduler:
|
|||
continue
|
||||
|
||||
keyword_norm = _normalize_for_match(str(keyword))
|
||||
if _fuzzy_keyword_match(normalized_stem, keyword_norm):
|
||||
if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
|
||||
return task
|
||||
|
||||
return None
|
||||
|
|
@ -1083,7 +1139,7 @@ class Scheduler:
|
|||
continue
|
||||
|
||||
keyword_norm = _normalize_for_match(str(keyword))
|
||||
if _fuzzy_keyword_match(normalized_stem, keyword_norm):
|
||||
if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
|
||||
return task
|
||||
|
||||
return None
|
||||
|
|
@ -1170,7 +1226,7 @@ class Scheduler:
|
|||
continue
|
||||
|
||||
keyword_norm = _normalize_for_match(str(keyword))
|
||||
if not _fuzzy_keyword_match(stem, keyword_norm):
|
||||
if not _fuzzy_keyword_match(stem, keyword_norm, self._llm_plural_check):
|
||||
continue
|
||||
|
||||
matched_names.append(task.name)
|
||||
|
|
@ -1296,7 +1352,7 @@ class Scheduler:
|
|||
if not keyword:
|
||||
continue
|
||||
keyword_norm = _normalize_for_match(str(keyword))
|
||||
if _fuzzy_keyword_match(normalized_stem, keyword_norm):
|
||||
if _fuzzy_keyword_match(normalized_stem, keyword_norm, self._llm_plural_check):
|
||||
task_ids.append(task.id)
|
||||
|
||||
# Post comments
|
||||
|
|
@ -1484,7 +1540,7 @@ class Scheduler:
|
|||
# Find which folders have a matching file
|
||||
matched_folders: set[str] = set()
|
||||
for stem, locs in file_locations.items():
|
||||
if _fuzzy_keyword_match(keyword_norm, stem):
|
||||
if _fuzzy_keyword_match(keyword_norm, stem, self._llm_plural_check):
|
||||
matched_folders.update(locs)
|
||||
|
||||
if not matched_folders:
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import logging
|
|||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from . import tool
|
||||
|
|
@ -264,26 +265,20 @@ def _normalize_for_match(text: str) -> str:
|
|||
return text
|
||||
|
||||
|
||||
def _fuzzy_keyword_match(a: str, b: str) -> bool:
|
||||
"""Check if two normalized strings are a fuzzy match.
|
||||
def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool:
|
||||
"""Check if two normalized strings match, allowing singular/plural differences.
|
||||
|
||||
Matches if: exact, substring in either direction, or >80% word overlap.
|
||||
Fast path: exact match after normalization.
|
||||
Slow path: ask an LLM if the two keywords are the same aside from plural form.
|
||||
Falls back to False if no llm_check is provided and strings differ.
|
||||
"""
|
||||
if not a or not b:
|
||||
return False
|
||||
if a == b:
|
||||
return True
|
||||
if a in b or b in a:
|
||||
return True
|
||||
|
||||
# Word overlap check
|
||||
words_a = set(a.split())
|
||||
words_b = set(b.split())
|
||||
if not words_a or not words_b:
|
||||
if llm_check is None:
|
||||
return False
|
||||
overlap = len(words_a & words_b)
|
||||
min_len = min(len(words_a), len(words_b))
|
||||
return overlap / min_len >= 0.8 if min_len > 0 else False
|
||||
return llm_check(a, b)
|
||||
|
||||
|
||||
def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None:
|
||||
|
|
|
|||
|
|
@ -227,23 +227,36 @@ class TestFuzzyKeywordMatch:
|
|||
def test_exact_match(self):
    """Identical normalized strings match on the fast path."""
    matched = _fuzzy_keyword_match("precision cnc", "precision cnc")
    assert matched is True
|
||||
|
||||
def test_substring_match_a_in_b(self):
    """A keyword contained inside a longer keyword is a different keyword.

    Substring matching was removed because it cross-matched similar
    keywords, so without an llm_check this pair must not match.
    """
    assert _fuzzy_keyword_match("cnc machining", "precision cnc machining services") is False
|
||||
|
||||
def test_substring_match_b_in_a(self):
    """Same as the a-in-b case with the arguments swapped: still no match.

    Substring matching was removed in either direction, so without an
    llm_check the longer and shorter keyword must not match.
    """
    assert _fuzzy_keyword_match("precision cnc machining services", "cnc machining") is False
|
||||
|
||||
def test_word_overlap(self):
    """The same words in a different order are no longer an automatic match.

    Word-overlap scoring was removed; only exact equality passes without
    an llm_check.
    """
    assert _fuzzy_keyword_match("precision cnc machining", "cnc machining precision") is False
|
||||
|
||||
def test_no_match(self):
    """Keywords with nothing in common do not match."""
    matched = _fuzzy_keyword_match("precision cnc", "web design agency")
    assert matched is False
|
||||
def test_no_match_without_llm(self):
    """With no llm_check supplied, any non-identical pair is rejected."""
    differing_pairs = (
        ("shaft", "shafts"),
        ("shaft manufacturing", "custom shaft manufacturing"),
    )
    for left, right in differing_pairs:
        assert _fuzzy_keyword_match(left, right) is False
|
||||
|
||||
def test_empty_strings(self):
    """An empty string on either side, or on both, never matches."""
    for left, right in (("", "test"), ("test", ""), ("", "")):
        assert _fuzzy_keyword_match(left, right) is False
|
||||
|
||||
def test_llm_check_called_on_mismatch(self):
    """When strings differ, llm_check is called and its result is returned."""
    seen: list[tuple[str, str]] = []

    def llm_yes(a, b):
        seen.append((a, b))
        return True

    def llm_no(a, b):
        seen.append((a, b))
        return False

    assert _fuzzy_keyword_match("shaft", "shafts", llm_check=llm_yes) is True
    assert _fuzzy_keyword_match("shaft", "shafts", llm_check=llm_no) is False
    # Each call must have reached the callback with the keywords in
    # argument order; a matcher that ignored llm_check would leave this empty.
    assert seen == [("shaft", "shafts"), ("shaft", "shafts")]
|
||||
|
||||
def test_llm_check_not_called_on_exact(self):
    """Identical strings are decided on the fast path, never by llm_check."""

    def boom(a, b):
        # Reaching this callback means the fast path was skipped.
        raise AssertionError("should not be called")

    matched = _fuzzy_keyword_match("shaft", "shaft", llm_check=boom)
    assert matched is True
|
||||
|
||||
def test_no_substring_match_without_llm(self):
    """A keyword embedded in a longer one is a different keyword, not a match."""
    embedded_pairs = (
        ("shaft manufacturing", "custom shaft manufacturing"),
        ("cnc machining", "precision cnc machining services"),
    )
    for shorter, longer in embedded_pairs:
        assert _fuzzy_keyword_match(shorter, longer) is False
|
||||
|
||||
|
||||
class TestNormalizeForMatch:
|
||||
def test_lowercase_and_strip(self):
|
||||
|
|
|
|||
Loading…
Reference in New Issue