def _llm_plural_check(self, a: str, b: str) -> bool:
    """Ask the chat model whether two keywords differ only by plural form.

    Sends a single chat-completion request to OpenRouter using
    ``self.config.openrouter_api_key`` and ``self.config.chat_model``.
    Definitive answers are memoised in ``self._plural_cache`` under an
    order-independent key, so ``(a, b)`` and ``(b, a)`` share one entry.

    Args:
        a: First keyword to compare.
        b: Second keyword to compare.

    Returns:
        True only when the model's reply starts with the word YES.
        False when the model answers anything else, when no API key is
        configured, or when the request/response handling fails.
    """
    # Sort the pair so the cache is symmetric in its arguments.
    key = (a, b) if a <= b else (b, a)
    cached = self._plural_cache.get(key)
    if cached is not None:
        return cached

    api_key = self.config.openrouter_api_key
    model = self.config.chat_model
    if not api_key:
        log.warning("LLM plural check: no OpenRouter API key, returning False")
        return False

    try:
        resp = httpx.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model,
                "max_tokens": 5,
                "messages": [
                    {
                        "role": "system",
                        "content": (
                            "You compare SEO keywords. Reply with ONLY 'YES' or 'NO'. "
                            "Answer YES only if the two keywords are identical except for "
                            "singular vs plural word forms (e.g. 'shaft' vs 'shafts', "
                            "'company' vs 'companies'). Answer NO if they differ in any "
                            "other way (extra words, different words, different meaning)."
                        ),
                    },
                    {
                        "role": "user",
                        "content": f'Keyword A: "{a}"\nKeyword B: "{b}"',
                    },
                ],
            },
            timeout=15,
        )
        resp.raise_for_status()
        answer = resp.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberately broad: this is a best-effort call to an external
        # service and must never take the scheduler down. The failure is
        # NOT cached, so a transient outage does not pin this pair to
        # "no match" for the rest of the session.
        log.warning("LLM plural check failed for '%s' vs '%s': %s", a, b, e)
        return False

    # Decide on the leading word only; a substring test would also accept
    # replies such as "NO, not yes" or "EYES".
    first_word = answer.split(maxsplit=1)[0] if answer else ""
    result = first_word.strip("\"'`*.,:;!()[]").upper() == "YES"
    log.debug("LLM plural check: '%s' vs '%s' → %s (%s)", a, b, result, answer)

    self._plural_cache[key] = result
    return result
a/cheddahbot/tools/linkbuilding.py b/cheddahbot/tools/linkbuilding.py index 6deefab..20208ae 100644 --- a/cheddahbot/tools/linkbuilding.py +++ b/cheddahbot/tools/linkbuilding.py @@ -10,6 +10,7 @@ import logging import os import re import subprocess +from collections.abc import Callable from pathlib import Path from . import tool @@ -264,26 +265,20 @@ def _normalize_for_match(text: str) -> str: return text -def _fuzzy_keyword_match(a: str, b: str) -> bool: - """Check if two normalized strings are a fuzzy match. +def _fuzzy_keyword_match(a: str, b: str, llm_check: Callable[[str, str], bool] | None = None) -> bool: + """Check if two normalized strings match, allowing singular/plural differences. - Matches if: exact, substring in either direction, or >80% word overlap. + Fast path: exact match after normalization. + Slow path: ask an LLM if the two keywords are the same aside from plural form. + Falls back to False if no llm_check is provided and strings differ. """ if not a or not b: return False if a == b: return True - if a in b or b in a: - return True - - # Word overlap check - words_a = set(a.split()) - words_b = set(b.split()) - if not words_a or not words_b: + if llm_check is None: return False - overlap = len(words_a & words_b) - min_len = min(len(words_a), len(words_b)) - return overlap / min_len >= 0.8 if min_len > 0 else False + return llm_check(a, b) def _complete_clickup_task(ctx: dict | None, task_id: str, message: str, status: str = "") -> None: diff --git a/tests/test_linkbuilding.py b/tests/test_linkbuilding.py index 8eab673..3a994b4 100644 --- a/tests/test_linkbuilding.py +++ b/tests/test_linkbuilding.py @@ -227,23 +227,36 @@ class TestFuzzyKeywordMatch: def test_exact_match(self): assert _fuzzy_keyword_match("precision cnc", "precision cnc") is True - def test_substring_match_a_in_b(self): - assert _fuzzy_keyword_match("cnc machining", "precision cnc machining services") is True - - def test_substring_match_b_in_a(self): - assert 
_fuzzy_keyword_match("precision cnc machining services", "cnc machining") is True - - def test_word_overlap(self): - assert _fuzzy_keyword_match("precision cnc machining", "cnc machining precision") is True - - def test_no_match(self): - assert _fuzzy_keyword_match("precision cnc", "web design agency") is False + def test_no_match_without_llm(self): + """Without an llm_check, non-exact strings return False.""" + assert _fuzzy_keyword_match("shaft", "shafts") is False + assert _fuzzy_keyword_match("shaft manufacturing", "custom shaft manufacturing") is False def test_empty_strings(self): assert _fuzzy_keyword_match("", "test") is False assert _fuzzy_keyword_match("test", "") is False assert _fuzzy_keyword_match("", "") is False + def test_llm_check_called_on_mismatch(self): + """When strings differ, llm_check is called and its result is returned.""" + llm_yes = lambda a, b: True + llm_no = lambda a, b: False + + assert _fuzzy_keyword_match("shaft", "shafts", llm_check=llm_yes) is True + assert _fuzzy_keyword_match("shaft", "shafts", llm_check=llm_no) is False + + def test_llm_check_not_called_on_exact(self): + """Exact match should not call llm_check.""" + def boom(a, b): + raise AssertionError("should not be called") + + assert _fuzzy_keyword_match("shaft", "shaft", llm_check=boom) is True + + def test_no_substring_match_without_llm(self): + """Substring matching is gone — different keywords must not match.""" + assert _fuzzy_keyword_match("shaft manufacturing", "custom shaft manufacturing") is False + assert _fuzzy_keyword_match("cnc machining", "precision cnc machining services") is False + class TestNormalizeForMatch: def test_lowercase_and_strip(self):