98 lines
3.3 KiB
Python
98 lines
3.3 KiB
Python
"""Tests for keyword normalization and fuzzy matching."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from link_building_workflow.matching import (
|
|
filename_stem_to_keyword,
|
|
fuzzy_keyword_match,
|
|
normalize_for_match,
|
|
)
|
|
|
|
|
|
class TestNormalizeForMatch:
    """Checks ``normalize_for_match`` against casing, punctuation and whitespace inputs."""

    def test_lowercases(self):
        """Mixed-case input is expected back in lowercase."""
        normalized = normalize_for_match("Hello World")
        assert normalized == "hello world"

    def test_strips_punctuation(self):
        """A comma and an exclamation mark are expected to be removed."""
        normalized = normalize_for_match("hello, world!")
        assert normalized == "hello world"

    def test_collapses_whitespace(self):
        """A newline/tab run between words is expected to become one space."""
        normalized = normalize_for_match("hello world\n\ttest")
        assert normalized == "hello world test"

    def test_empty_string(self):
        """The empty string is expected to map to the empty string."""
        normalized = normalize_for_match("")
        assert normalized == ""

    def test_only_punctuation(self):
        """Input made solely of punctuation is expected to yield an empty string."""
        normalized = normalize_for_match("!!!...,,,")
        assert normalized == ""

    def test_numbers_preserved(self):
        """The digit is expected to survive while the hyphen turns into a space."""
        normalized = normalize_for_match("5-axis cnc")
        assert normalized == "5 axis cnc"

    def test_leading_trailing_whitespace(self):
        """Whitespace around the text is expected to be trimmed."""
        normalized = normalize_for_match(" hello world ")
        assert normalized == "hello world"
|
|
|
|
|
|
class TestFuzzyKeywordMatch:
    """Checks ``fuzzy_keyword_match`` with and without an LLM fallback callable.

    ``llm_never`` and ``llm_always`` are pytest fixtures defined outside this
    module (presumably callables that always answer no / yes -- confirm in
    the fixture definitions).
    """

    def test_exact_match(self, llm_never):
        """Identical keywords are expected to match even with ``llm_never``."""
        matched = fuzzy_keyword_match("cnc machining", "cnc machining", llm_never)
        assert matched is True

    def test_different_no_llm(self):
        """Differing keywords without a fallback callable are expected not to match."""
        matched = fuzzy_keyword_match("cnc", "cnc machining")
        assert matched is False

    def test_different_llm_says_no(self, llm_never):
        """Differing keywords are expected not to match when given ``llm_never``."""
        matched = fuzzy_keyword_match("cnc", "milling", llm_never)
        assert matched is False

    def test_different_llm_says_yes(self, llm_always):
        """Differing keywords are expected to match when given ``llm_always``."""
        # Once the exact comparison fails, the verdict belongs to the LLM callable.
        matched = fuzzy_keyword_match("shaft", "shafts", llm_always)
        assert matched is True

    def test_empty_a(self, llm_always):
        """An empty first keyword is expected to yield ``False``."""
        matched = fuzzy_keyword_match("", "cnc", llm_always)
        assert matched is False

    def test_empty_b(self, llm_always):
        """An empty second keyword is expected to yield ``False``."""
        matched = fuzzy_keyword_match("cnc", "", llm_always)
        assert matched is False

    def test_both_empty(self, llm_always):
        """Two empty keywords are expected to yield ``False``."""
        # Empty inputs short-circuit to False, even though llm_always is supplied.
        matched = fuzzy_keyword_match("", "", llm_always)
        assert matched is False

    def test_no_llm_check_fast_path_hit(self):
        """Equal keywords are expected to match with no ``llm_check`` supplied."""
        # Exact matches must keep working when the caller passes no llm_check.
        matched = fuzzy_keyword_match("same", "same")
        assert matched is True

    def test_no_llm_check_fast_path_miss(self):
        """Unequal keywords are expected not to match with no ``llm_check`` supplied."""
        # Without llm_check, anything short of an exact match is False.
        matched = fuzzy_keyword_match("same", "different")
        assert matched is False

    def test_llm_check_only_called_when_needed(self):
        """The fallback callable is expected to run only when the exact comparison fails."""
        recorded = []

        def recording_llm(a, b):
            # Log each consultation so the assertions can inspect the call history.
            recorded.append((a, b))
            return True

        # Identical keywords: the callable must stay untouched.
        fuzzy_keyword_match("cnc", "cnc", recording_llm)
        assert recorded == []

        # Differing keywords: exactly one consultation, with the keywords in order.
        fuzzy_keyword_match("shaft", "shafts", recording_llm)
        assert recorded == [("shaft", "shafts")]
|
|
|
|
|
|
class TestFilenameStemToKeyword:
    """Checks ``filename_stem_to_keyword`` for separator and casing handling."""

    def test_hyphens_to_spaces(self):
        """Hyphen separators are expected to become spaces."""
        keyword = filename_stem_to_keyword("precision-cnc-machining")
        assert keyword == "precision cnc machining"

    def test_underscores_to_spaces(self):
        """Underscore separators are expected to become spaces."""
        keyword = filename_stem_to_keyword("precision_cnc_machining")
        assert keyword == "precision cnc machining"

    def test_mixed_separators(self):
        """A stem mixing hyphen and underscore is expected to become space-separated."""
        keyword = filename_stem_to_keyword("precision-cnc_machining")
        assert keyword == "precision cnc machining"

    def test_uppercase(self):
        """Uppercase letters in the stem are expected to be lowercased."""
        keyword = filename_stem_to_keyword("CNC-Machining")
        assert keyword == "cnc machining"

    def test_empty(self):
        """An empty stem is expected to produce an empty keyword."""
        keyword = filename_stem_to_keyword("")
        assert keyword == ""
|