443 lines
16 KiB
Python
443 lines
16 KiB
Python
"""
|
|
Unit tests for CORA parser module
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
from pathlib import Path
|
|
from src.ingestion.parser import CORAParser, CORAParseError
|
|
|
|
|
|
class TestCORAParserInit:
    """Tests for CORAParser initialization"""

    def test_parser_init_with_valid_file(self, tmp_path):
        """An existing file is accepted and exposed through ``file_path``"""
        test_file = tmp_path / "test.xlsx"
        test_file.touch()

        # The file on disk is empty, so workbook loading is patched out.
        with patch('openpyxl.load_workbook'):
            parser = CORAParser(str(test_file))
            assert parser.file_path == test_file

    def test_parser_init_file_not_found(self, tmp_path):
        """A path that does not exist raises CORAParseError"""
        # Build the missing path under tmp_path so the outcome never depends
        # on what happens to be in the current working directory.
        missing_file = tmp_path / "nonexistent_file.xlsx"

        with pytest.raises(CORAParseError, match="File not found"):
            CORAParser(str(missing_file))

    def test_parser_init_invalid_excel_file(self, tmp_path):
        """A file that is not a real workbook raises CORAParseError"""
        test_file = tmp_path / "invalid.xlsx"
        test_file.write_text("not an excel file")

        # No patching here: the loader is expected to reject the content.
        with pytest.raises(CORAParseError, match="Failed to open Excel file"):
            CORAParser(str(test_file))
|
|
|
|
|
|
class TestCORAParserCellValue:
    """Tests for _get_cell_value helper method"""

    @staticmethod
    def _sheet_with_cell(cell_value):
        """Return a mock sheet whose item lookup yields a cell holding ``cell_value``"""
        cell = Mock()
        cell.value = cell_value
        sheet = Mock()
        sheet.__getitem__ = Mock(return_value=cell)
        return sheet

    def test_get_cell_value_returns_value(self):
        """A populated cell's value is returned as-is"""
        sheet = self._sheet_with_cell("test value")

        with patch('openpyxl.load_workbook'):
            # __new__ gives a parser instance without running __init__
            bare_parser = CORAParser.__new__(CORAParser)
            actual = bare_parser._get_cell_value(sheet, "A1")
            assert actual == "test value"

    def test_get_cell_value_returns_default_on_none(self):
        """A cell holding None falls back to the default"""
        sheet = self._sheet_with_cell(None)

        with patch('openpyxl.load_workbook'):
            bare_parser = CORAParser.__new__(CORAParser)
            actual = bare_parser._get_cell_value(sheet, "A1", default="default")
            assert actual == "default"

    def test_get_cell_value_returns_default_on_zero(self):
        """A cell holding 0 falls back to the default"""
        sheet = self._sheet_with_cell(0)

        with patch('openpyxl.load_workbook'):
            bare_parser = CORAParser.__new__(CORAParser)
            actual = bare_parser._get_cell_value(sheet, "A1", default=100)
            assert actual == 100

    def test_get_cell_value_returns_default_on_empty_string(self):
        """A cell holding a blank string falls back to the default"""
        sheet = self._sheet_with_cell(" ")

        with patch('openpyxl.load_workbook'):
            bare_parser = CORAParser.__new__(CORAParser)
            actual = bare_parser._get_cell_value(sheet, "A1", default="default")
            assert actual == "default"

    def test_get_cell_value_returns_default_on_exception(self):
        """A failing sheet lookup falls back to the default"""
        failing_sheet = Mock()
        failing_sheet.__getitem__ = Mock(side_effect=Exception("error"))

        with patch('openpyxl.load_workbook'):
            bare_parser = CORAParser.__new__(CORAParser)
            actual = bare_parser._get_cell_value(failing_sheet, "A1", default="default")
            assert actual == "default"
|
|
|
|
|
|
class TestCORAParserGetSheet:
    """Tests for _get_sheet helper method"""

    @staticmethod
    def _parser_for(workbook):
        """Return a parser created without __init__, with ``workbook`` attached"""
        bare_parser = CORAParser.__new__(CORAParser)
        bare_parser.workbook = workbook
        return bare_parser

    def test_get_sheet_returns_sheet_when_exists(self):
        """A sheet listed in ``sheetnames`` is returned from the workbook"""
        expected_sheet = Mock()
        fake_workbook = Mock()
        fake_workbook.sheetnames = ["Sheet1", "Sheet2"]
        fake_workbook.__getitem__ = Mock(return_value=expected_sheet)

        with patch('openpyxl.load_workbook'):
            sheet = self._parser_for(fake_workbook)._get_sheet("Sheet1")
            assert sheet == expected_sheet

    def test_get_sheet_raises_error_when_required_not_found(self):
        """A missing sheet requested with required=True raises CORAParseError"""
        fake_workbook = Mock()
        fake_workbook.sheetnames = ["Sheet1"]

        with patch('openpyxl.load_workbook'):
            bare_parser = self._parser_for(fake_workbook)
            with pytest.raises(CORAParseError, match="Required sheet"):
                bare_parser._get_sheet("MissingSheet", required=True)

    def test_get_sheet_returns_none_when_optional_not_found(self):
        """A missing sheet requested with required=False yields None"""
        fake_workbook = Mock()
        fake_workbook.sheetnames = ["Sheet1"]

        with patch('openpyxl.load_workbook'):
            sheet = self._parser_for(fake_workbook)._get_sheet("MissingSheet", required=False)
            assert sheet is None
|
|
|
|
|
|
class TestCORAParserExtractMainKeyword:
    """Tests for extract_main_keyword method"""

    def test_extract_keyword_from_sheet(self, tmp_path):
        """The keyword comes from the sheet cell when the sheet is present"""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        # Every cell lookup on the sheet yields the same keyword cell.
        keyword_cell = Mock()
        keyword_cell.value = "test keyword"
        overview_sheet = Mock()
        overview_sheet.__getitem__ = Mock(return_value=keyword_cell)

        fake_workbook = Mock()
        fake_workbook.sheetnames = ["Strategic Overview"]
        fake_workbook.__getitem__ = Mock(return_value=overview_sheet)

        with patch('openpyxl.load_workbook', return_value=fake_workbook):
            keyword = CORAParser(str(workbook_path)).extract_main_keyword()
            assert keyword == "test keyword"

    def test_extract_keyword_from_filename(self, tmp_path):
        """With no sheets in the workbook the keyword is derived from the filename"""
        workbook_path = tmp_path / "shaft_machining_goog_251011_C_US_L_EN_M3P1A_GMW.xlsx"
        workbook_path.touch()

        fake_workbook = Mock()
        fake_workbook.sheetnames = []

        with patch('openpyxl.load_workbook', return_value=fake_workbook):
            keyword = CORAParser(str(workbook_path)).extract_main_keyword()
            assert keyword == "shaft machining"
|
|
|
|
class TestCORAParserExtractStrategicOverview:
    """Tests for extract_strategic_overview method"""

    def test_extract_strategic_overview_with_sheet(self, tmp_path):
        """Values planted in the mocked sheet cells come back under the expected keys"""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        cell_values = {
            "D24": 2000,
            "D31": 5,
            "D46": 0.15,
            "D47": 0.10,
            "D48": 0.05,
            "B10": "{term1|term2|term3}",
        }

        def lookup_cell(self, cell_ref):
            # Cell references missing from the table yield a cell whose value is None
            cell = Mock()
            cell.value = cell_values.get(cell_ref)
            return cell

        overview_sheet = Mock()
        overview_sheet.__getitem__ = lookup_cell

        def lookup_sheet(self, name):
            if name == "Strategic Overview":
                return overview_sheet
            return None

        fake_workbook = Mock()
        fake_workbook.sheetnames = ["Strategic Overview"]
        fake_workbook.__getitem__ = lookup_sheet

        with patch('openpyxl.load_workbook', return_value=fake_workbook):
            overview = CORAParser(str(workbook_path)).extract_strategic_overview()

        assert overview["word_count"] == 2000
        assert overview["term_frequency"] == 5
        assert overview["related_search_density"] == 0.15
        assert overview["entity_density"] == 0.10
        assert overview["lsi_density"] == 0.05
        assert overview["spintax_related_search_terms"] == "{term1|term2|term3}"

    def test_extract_strategic_overview_defaults_when_no_sheet(self, tmp_path):
        """A workbook without sheets produces the fallback values"""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        fake_workbook = Mock()
        fake_workbook.sheetnames = []

        with patch('openpyxl.load_workbook', return_value=fake_workbook):
            overview = CORAParser(str(workbook_path)).extract_strategic_overview()

        assert overview["word_count"] == 1250
        assert overview["term_frequency"] == 3
        assert overview["related_search_density"] is None
|
|
|
|
|
|
class TestCORAParserExtractStructureMetrics:
    """Tests for extract_structure_metrics method"""

    def test_extract_structure_metrics_with_sheet(self, tmp_path):
        """Metrics are read from the mocked Structure sheet"""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        def lookup_cell(self, cell_ref):
            # References starting with "D" hold 1; every other cell is empty
            cell = Mock()
            if cell_ref.startswith("D"):
                cell.value = 1
            else:
                cell.value = None
            return cell

        structure_sheet = Mock()
        structure_sheet.__getitem__ = lookup_cell

        def lookup_sheet(self, name):
            if name == "Structure":
                return structure_sheet
            return None

        fake_workbook = Mock()
        fake_workbook.sheetnames = ["Structure"]
        fake_workbook.__getitem__ = lookup_sheet

        with patch('openpyxl.load_workbook', return_value=fake_workbook):
            metrics = CORAParser(str(workbook_path)).extract_structure_metrics()

        assert metrics["title_exact_match"] == 1
        assert metrics["h1_exact"] == 1
        assert metrics["h2_total"] == 1

    def test_extract_structure_metrics_defaults_when_no_sheet(self, tmp_path):
        """A workbook without sheets yields None for the checked metrics"""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        fake_workbook = Mock()
        fake_workbook.sheetnames = []

        with patch('openpyxl.load_workbook', return_value=fake_workbook):
            metrics = CORAParser(str(workbook_path)).extract_structure_metrics()

        assert metrics["title_exact_match"] is None
        assert metrics["h1_exact"] is None
|
|
|
|
|
|
class TestCORAParserExtractEntities:
    """Tests for extract_entities method"""

    @staticmethod
    def _entity_row(entity_name, score):
        """Build a ten-cell row: name cell, eight filler cells, score cell.

        Each filler cell is its own Mock, so no cell object is shared between
        positions or between rows.
        """
        filler = [Mock() for _ in range(8)]
        return (Mock(value=entity_name), *filler, Mock(value=score))

    def test_extract_entities_with_threshold(self, tmp_path):
        """Only rows whose score passes the parser's threshold are returned"""
        test_file = tmp_path / "test.xlsx"
        test_file.touch()

        # NOTE(review): the threshold itself lives in the parser; these scores
        # only establish that -0.2 and -0.3 are kept while -0.1 is dropped.
        mock_rows = [
            self._entity_row("entity1", -0.2),
            self._entity_row("entity2", -0.3),
            self._entity_row("entity3", -0.1),
        ]

        mock_sheet = Mock()
        mock_sheet.iter_rows = Mock(return_value=mock_rows)

        mock_workbook = Mock()
        mock_workbook.sheetnames = ["Entities"]
        mock_workbook.__getitem__ = lambda self, name: mock_sheet if name == "Entities" else None

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.extract_entities()

        assert len(result) == 2
        assert "entity1" in result
        assert "entity2" in result
        assert "entity3" not in result

    def test_extract_entities_returns_empty_when_no_sheet(self, tmp_path):
        """A workbook without sheets yields an empty list"""
        test_file = tmp_path / "test.xlsx"
        test_file.touch()

        mock_workbook = Mock()
        mock_workbook.sheetnames = []

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.extract_entities()

        assert result == []
|
|
|
|
|
|
class TestCORAParserParseSpintax:
    """Tests for parse_spintax_to_list method

    The parser is created with ``__new__`` so ``__init__`` never runs; no
    workbook file is created because none of these tests pass a path to
    anything.
    """

    def test_parse_spintax_with_braces(self):
        """A brace-wrapped, pipe-separated string yields one entry per term"""
        # NOTE(review): the patch is kept from the original; it looks
        # unnecessary given __init__ is bypassed - confirm before removing.
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            result = parser.parse_spintax_to_list("{term1|term2|term3}")

        assert len(result) == 3
        assert "term1" in result
        assert "term2" in result
        assert "term3" in result

    def test_parse_spintax_without_braces(self):
        """A pipe-separated string without braces still yields three entries"""
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            result = parser.parse_spintax_to_list("term1|term2|term3")

        assert len(result) == 3

    def test_parse_spintax_returns_empty_on_none(self):
        """None input yields an empty list"""
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            result = parser.parse_spintax_to_list(None)

        assert result == []
|
|
|
|
|
|
class TestCORAParserParse:
    """Tests for full parse method"""

    def test_parse_full_file(self, tmp_path):
        """parse() combines the values from all three mocked sheets"""
        test_file = tmp_path / "shaft_machining_goog_test.xlsx"
        test_file.touch()

        mock_workbook = Mock()
        mock_workbook.sheetnames = ["Strategic Overview", "Structure", "Entities"]
        mock_workbook.close = Mock()

        # Strategic Overview: fixed cell table; unlisted cells hold None.
        so_values = {
            "B5": "shaft machining",
            "D24": 1500,
            "D31": 4,
            "D46": 0.12,
            "D47": 0.08,
            "D48": 0.06,
            "B10": "{term1|term2}",
        }

        def mock_so_getitem(self, cell_ref):
            mock_cell = Mock()
            mock_cell.value = so_values.get(cell_ref)
            return mock_cell

        mock_so_sheet = Mock()
        mock_so_sheet.__getitem__ = mock_so_getitem

        # Structure: every cell holds 1.
        def mock_structure_getitem(self, cell_ref):
            mock_cell = Mock()
            mock_cell.value = 1
            return mock_cell

        mock_structure_sheet = Mock()
        mock_structure_sheet.__getitem__ = mock_structure_getitem

        # Entities: one ten-cell row (name, eight distinct fillers, score).
        mock_filler = [Mock() for _ in range(8)]
        mock_rows = [
            (Mock(value="entity1"), *mock_filler, Mock(value=-0.2)),
        ]
        mock_entities_sheet = Mock()
        mock_entities_sheet.iter_rows = Mock(return_value=mock_rows)

        sheets_by_name = {
            "Strategic Overview": mock_so_sheet,
            "Structure": mock_structure_sheet,
            "Entities": mock_entities_sheet,
        }

        def mock_wb_getitem(self, sheet_name):
            # Unknown sheet names resolve to None, as in the if/elif original
            return sheets_by_name.get(sheet_name)

        mock_workbook.__getitem__ = mock_wb_getitem

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.parse()

        assert result["main_keyword"] == "shaft machining"
        assert result["word_count"] == 1500
        assert result["term_frequency"] == 4
        assert result["related_search_density"] == 0.12
        assert len(result["entities"]) == 1
        assert "entity1" in result["entities"]
        assert len(result["related_searches"]) == 2
        assert result["custom_anchor_text"] == []

    def test_parse_with_custom_anchors(self, tmp_path):
        """Anchor text passed to parse() is echoed back in the result"""
        test_file = tmp_path / "test_goog_test.xlsx"
        test_file.touch()

        mock_workbook = Mock()
        mock_workbook.sheetnames = []
        mock_workbook.close = Mock()

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.parse(custom_anchor_text=["anchor1", "anchor2"])

        assert result["custom_anchor_text"] == ["anchor1", "anchor2"]
|
|
|