171 lines
6.1 KiB
Python
171 lines
6.1 KiB
Python
"""Tests for inbox folder scanning and keyword-based file lookup."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from link_building_workflow.inbox import (
|
|
find_all_xlsx_for_keyword,
|
|
find_xlsx_for_keyword,
|
|
list_inbox_xlsx,
|
|
)
|
|
|
|
|
|
@pytest.fixture()
def inbox(tmp_path: Path) -> Path:
    """Create an empty ``cora-inbox`` directory under ``tmp_path`` and return it."""
    folder = tmp_path.joinpath("cora-inbox")
    folder.mkdir()
    return folder
|
|
|
|
|
|
def _touch(folder: Path, name: str) -> Path:
    """Write a small placeholder file called ``name`` into ``folder``.

    The payload is not a real workbook; the file only needs to exist under
    the given name. Returns the path of the file that was written.
    """
    target = folder.joinpath(name)
    target.write_bytes(b"fake xlsx")
    return target
|
|
|
|
|
|
class TestListInboxXlsx:
    """Tests for ``list_inbox_xlsx``: filtering, processed handling, ordering."""

    def test_missing_folder_returns_empty(self, tmp_path: Path):
        """A folder that does not exist yields an empty list."""
        assert list_inbox_xlsx(tmp_path / "does-not-exist") == []

    def test_empty_folder(self, inbox: Path):
        """An existing folder with no files yields an empty list."""
        assert list_inbox_xlsx(inbox) == []

    def test_lists_xlsx_only(self, inbox: Path):
        """Files without the ``.xlsx`` extension are left out."""
        _touch(inbox, "a.xlsx")
        _touch(inbox, "readme.txt")
        _touch(inbox, "b.xlsx")
        result = list_inbox_xlsx(inbox)
        names = [p.name for p in result]
        assert names == ["a.xlsx", "b.xlsx"]

    def test_skips_office_lock_files(self, inbox: Path):
        """``~$``-prefixed Office lock files are not listed."""
        _touch(inbox, "real.xlsx")
        _touch(inbox, "~$real.xlsx")
        result = list_inbox_xlsx(inbox)
        assert [p.name for p in result] == ["real.xlsx"]

    def test_skips_processed_by_default(self, inbox: Path):
        """A root file whose name also exists in ``processed/`` is omitted."""
        _touch(inbox, "new.xlsx")
        processed = inbox / "processed"
        processed.mkdir()
        _touch(processed, "old.xlsx")
        # Also duplicate the name in root to prove it gets filtered
        _touch(inbox, "old.xlsx")

        result = list_inbox_xlsx(inbox)
        assert [p.name for p in result] == ["new.xlsx"]

    def test_skip_processed_disabled(self, inbox: Path):
        """With ``skip_processed=False`` an already-processed name is listed."""
        _touch(inbox, "new.xlsx")
        processed = inbox / "processed"
        processed.mkdir()
        # The processed copy must actually exist: with an empty processed/
        # folder this test would pass even if skip_processed=False were
        # ignored, because nothing could be filtered in the first place.
        _touch(processed, "old.xlsx")
        _touch(inbox, "old.xlsx")  # same name as the one we "processed"

        result = list_inbox_xlsx(inbox, skip_processed=False)
        assert sorted(p.name for p in result) == ["new.xlsx", "old.xlsx"]

    def test_sorted_output(self, inbox: Path):
        """Results come back in name order regardless of creation order."""
        _touch(inbox, "c.xlsx")
        _touch(inbox, "a.xlsx")
        _touch(inbox, "b.xlsx")
        result = list_inbox_xlsx(inbox)
        assert [p.name for p in result] == ["a.xlsx", "b.xlsx", "c.xlsx"]
|
|
|
|
|
|
class TestFindXlsxForKeyword:
    """Tests for ``find_xlsx_for_keyword``, which returns one match or ``None``."""

    def test_exact_match(self, inbox: Path, llm_never):
        """A hyphenated filename matches the same keyword written with spaces."""
        _touch(inbox, "precision-cnc-machining.xlsx")
        found = find_xlsx_for_keyword(inbox, "precision cnc machining", llm_never)
        assert found is not None
        assert found.filename == "precision-cnc-machining.xlsx"
        assert found.stem_keyword == "precision cnc machining"

    def test_no_match(self, inbox: Path, llm_never):
        """An unrelated filename produces no match."""
        _touch(inbox, "other-keyword.xlsx")
        found = find_xlsx_for_keyword(inbox, "cnc machining", llm_never)
        assert found is None

    def test_missing_folder(self, tmp_path: Path, llm_never):
        """A non-existent inbox folder produces no match."""
        absent = tmp_path / "no-such-dir"
        found = find_xlsx_for_keyword(absent, "cnc machining", llm_never)
        assert found is None

    def test_empty_keyword(self, inbox: Path, llm_never):
        """An empty keyword matches nothing, even when files are present."""
        _touch(inbox, "anything.xlsx")
        found = find_xlsx_for_keyword(inbox, "", llm_never)
        assert found is None

    def test_keyword_with_hyphens(self, inbox: Path, llm_never):
        """The keyword may itself be given in hyphenated form."""
        _touch(inbox, "precision-cnc-machining.xlsx")
        found = find_xlsx_for_keyword(inbox, "precision-cnc-machining", llm_never)
        assert found is not None

    def test_keyword_case_insensitive(self, inbox: Path, llm_never):
        """Keyword casing does not prevent a match."""
        _touch(inbox, "cnc-machining.xlsx")
        found = find_xlsx_for_keyword(inbox, "CNC Machining", llm_never)
        assert found is not None

    def test_plural_match_via_llm(self, inbox: Path):
        """A singular keyword finds a plural filename when the callback agrees."""
        _touch(inbox, "cnc-shafts.xlsx")

        def only_plural_of_shaft(a: str, b: str) -> bool:
            # Accept exactly the singular/plural pair, in either argument order.
            accepted = {"cnc shaft", "cnc shafts"}
            return {a, b} == accepted

        found = find_xlsx_for_keyword(inbox, "cnc shaft", only_plural_of_shaft)
        assert found is not None
        assert found.filename == "cnc-shafts.xlsx"

    def test_first_match_returned(self, inbox: Path, llm_never):
        """The exactly-matching file is returned when decoys are present."""
        # The a-/b- prefixed stems are expected not to match "cnc machining",
        # so they act as decoys that sort ahead of the real candidate.
        # NOTE(review): only one file can match here, so despite its name this
        # test does not exercise tie-breaking between several matches.
        _touch(inbox, "b-cnc-machining.xlsx")
        _touch(inbox, "a-cnc-machining.xlsx")
        _touch(inbox, "cnc-machining.xlsx")
        found = find_xlsx_for_keyword(inbox, "cnc machining", llm_never)
        assert found is not None
        assert found.filename == "cnc-machining.xlsx"

    def test_processed_files_ignored(self, inbox: Path, llm_never):
        """A file already present in ``processed/`` is not offered again."""
        done_dir = inbox / "processed"
        done_dir.mkdir()
        _touch(done_dir, "cnc-machining.xlsx")
        _touch(inbox, "cnc-machining.xlsx")
        # The root copy shares its name with the processed one, so it is
        # expected to be filtered out, leaving nothing to match.
        found = find_xlsx_for_keyword(inbox, "cnc machining", llm_never)
        assert found is None
|
|
|
|
|
|
class TestFindAllXlsxForKeyword:
    """Tests for ``find_all_xlsx_for_keyword``, which returns every match."""

    def test_returns_all_matches(self, inbox: Path):
        """Both the exact and the callback-approved plural file are returned."""
        _touch(inbox, "cnc-shaft.xlsx")
        _touch(inbox, "cnc-shafts.xlsx")
        _touch(inbox, "unrelated.xlsx")

        def plural_ok(a: str, b: str) -> bool:
            # Accept exactly the singular/plural pair, in either argument order.
            accepted = {"cnc shaft", "cnc shafts"}
            return {a, b} == accepted

        hits = find_all_xlsx_for_keyword(inbox, "cnc shaft", plural_ok)
        found_names = sorted(hit.filename for hit in hits)
        assert found_names == ["cnc-shaft.xlsx", "cnc-shafts.xlsx"]

    def test_empty_when_no_matches(self, inbox: Path, llm_never):
        """No matching file gives an empty list."""
        _touch(inbox, "unrelated.xlsx")
        hits = find_all_xlsx_for_keyword(inbox, "cnc shaft", llm_never)
        assert hits == []

    def test_empty_keyword_returns_empty(self, inbox: Path, llm_never):
        """An empty keyword gives an empty list even when files exist."""
        _touch(inbox, "anything.xlsx")
        hits = find_all_xlsx_for_keyword(inbox, "", llm_never)
        assert hits == []
|