# Big-Link-Man/tests/unit/test_cora_parser.py
#
# 463 lines
# 17 KiB
# Python

"""
Unit tests for CORA parser module
"""
import pytest
from unittest.mock import Mock, patch, MagicMock
from pathlib import Path
from src.ingestion.parser import CORAParser, CORAParseError
class TestCORAParserInit:
    """Tests for CORAParser initialization.

    Covers construction with an existing file (workbook loading patched out)
    and the two failures the tests expect as ``CORAParseError``: a missing
    file and a file that is not a readable Excel workbook.
    """

    def test_parser_init_with_valid_file(self, tmp_path):
        """The parser exposes the given path as ``file_path``."""
        test_file = tmp_path / "test.xlsx"
        test_file.touch()

        # The file is empty, so the real workbook loader is patched out.
        with patch('openpyxl.load_workbook'):
            parser = CORAParser(str(test_file))

        assert parser.file_path == test_file

    def test_parser_init_file_not_found(self, tmp_path):
        """A path that does not exist raises ``CORAParseError``."""
        # Anchor the missing path under tmp_path so the outcome cannot depend
        # on whatever happens to be in the current working directory.
        missing_file = tmp_path / "nonexistent_file.xlsx"

        with pytest.raises(CORAParseError, match="File not found"):
            CORAParser(str(missing_file))

    def test_parser_init_invalid_excel_file(self, tmp_path):
        """A file whose content is plain text raises ``CORAParseError``."""
        test_file = tmp_path / "invalid.xlsx"
        test_file.write_text("not an excel file")

        # No patching here: the real loader is expected to reject the file.
        with pytest.raises(CORAParseError, match="Failed to open Excel file"):
            CORAParser(str(test_file))
class TestCORAParserCellValue:
    """Tests for the ``_get_cell_value`` helper method.

    The parser is created with ``__new__`` so ``__init__`` never runs, and the
    sheet handed to the helper is always a mock.
    """

    @staticmethod
    def _sheet_with_cell(cell_value):
        """Return a mock sheet whose item lookup yields a cell holding ``cell_value``."""
        sheet = Mock()
        sheet.__getitem__ = Mock(return_value=Mock(value=cell_value))
        return sheet

    def test_get_cell_value_returns_value(self):
        """A populated cell's value is returned as-is."""
        sheet = self._sheet_with_cell("test value")

        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            fetched = parser._get_cell_value(sheet, "A1")

        assert fetched == "test value"

    def test_get_cell_value_returns_default_on_none(self):
        """A cell holding ``None`` yields the supplied default."""
        sheet = self._sheet_with_cell(None)

        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            fetched = parser._get_cell_value(sheet, "A1", default="default")

        assert fetched == "default"

    def test_get_cell_value_returns_default_on_zero(self):
        """A cell holding ``0`` yields the supplied default."""
        sheet = self._sheet_with_cell(0)

        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            fetched = parser._get_cell_value(sheet, "A1", default=100)

        assert fetched == 100

    def test_get_cell_value_returns_default_on_empty_string(self):
        """A cell holding a whitespace-only string yields the supplied default."""
        sheet = self._sheet_with_cell(" ")

        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            fetched = parser._get_cell_value(sheet, "A1", default="default")

        assert fetched == "default"

    def test_get_cell_value_returns_default_on_exception(self):
        """An exception raised by the sheet lookup yields the supplied default."""
        failing_sheet = Mock()
        failing_sheet.__getitem__ = Mock(side_effect=Exception("error"))

        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            fetched = parser._get_cell_value(failing_sheet, "A1", default="default")

        assert fetched == "default"
class TestCORAParserGetSheet:
    """Tests for the ``_get_sheet`` helper method.

    A bare parser (no ``__init__``) is given a mock workbook through its
    ``workbook`` attribute before ``_get_sheet`` is called.
    """

    @staticmethod
    def _bare_parser(workbook):
        """Create a parser without running ``__init__`` and attach ``workbook``."""
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
        parser.workbook = workbook
        return parser

    def test_get_sheet_returns_sheet_when_exists(self):
        """A sheet listed in ``sheetnames`` is fetched from the workbook."""
        expected_sheet = Mock()
        workbook = Mock(sheetnames=["Sheet1", "Sheet2"])
        workbook.__getitem__ = Mock(return_value=expected_sheet)

        parser = self._bare_parser(workbook)

        assert parser._get_sheet("Sheet1") == expected_sheet

    def test_get_sheet_raises_error_when_required_not_found(self):
        """A missing sheet requested with ``required=True`` raises ``CORAParseError``."""
        parser = self._bare_parser(Mock(sheetnames=["Sheet1"]))

        with pytest.raises(CORAParseError, match="Required sheet"):
            parser._get_sheet("MissingSheet", required=True)

    def test_get_sheet_returns_none_when_optional_not_found(self):
        """A missing sheet requested with ``required=False`` yields ``None``."""
        parser = self._bare_parser(Mock(sheetnames=["Sheet1"]))

        assert parser._get_sheet("MissingSheet", required=False) is None
class TestCORAParserExtractMainKeyword:
    """Tests for the ``extract_main_keyword`` method."""

    def test_extract_keyword_from_sheet(self, tmp_path):
        """The keyword comes from the "Strategic Overview" sheet when present.

        The mock sheet returns the same cell for every reference, so this test
        does not pin which cell the parser reads.
        """
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        overview_sheet = Mock()
        overview_sheet.__getitem__ = Mock(return_value=Mock(value="test keyword"))

        workbook = Mock(sheetnames=["Strategic Overview"])
        workbook.__getitem__ = Mock(return_value=overview_sheet)

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            keyword = parser.extract_main_keyword()

        assert keyword == "test keyword"

    def test_extract_keyword_from_filename(self, tmp_path):
        """With no sheets available, the keyword is derived from the file name."""
        workbook_path = tmp_path / "shaft_machining_goog_251011_C_US_L_EN_M3P1A_GMW.xlsx"
        workbook_path.touch()

        # An empty sheet list means there is no sheet to read the keyword from.
        workbook = Mock(sheetnames=[])

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            keyword = parser.extract_main_keyword()

        assert keyword == "shaft machining"
class TestCORAParserExtractStrategicOverview:
    """Tests for the ``extract_strategic_overview`` method."""

    def test_extract_strategic_overview_with_sheet(self, tmp_path):
        """Values placed in specific cells surface under the expected result keys."""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        # Cell reference -> value; any other reference yields a cell holding None.
        cell_values = {
            "D24": 2000,
            "D31": 5,
            "D46": 0.15,
            "D47": 0.10,
            "D48": 0.05,
            "B10": "{term1|term2|term3}",
        }
        overview_sheet = Mock()
        overview_sheet.__getitem__ = Mock(
            side_effect=lambda cell_ref: Mock(value=cell_values.get(cell_ref))
        )

        def sheet_by_name(name):
            if name == "Strategic Overview":
                return overview_sheet
            return None

        workbook = Mock(sheetnames=["Strategic Overview"])
        workbook.__getitem__ = Mock(side_effect=sheet_by_name)

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            overview = parser.extract_strategic_overview()

        expected = {
            "word_count": 2000,
            "term_frequency": 5,
            "related_search_density": 0.15,
            "entity_density": 0.10,
            "lsi_density": 0.05,
            "spintax_related_search_terms": "{term1|term2|term3}",
        }
        for key, value in expected.items():
            assert overview[key] == value

    def test_extract_strategic_overview_missing_sheet_raises_error(self, tmp_path):
        """A workbook without the "Strategic Overview" sheet raises ``CORAParseError``."""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        workbook = Mock(sheetnames=[])

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            with pytest.raises(
                CORAParseError,
                match="Required sheet 'Strategic Overview' not found",
            ):
                parser.extract_strategic_overview()
class TestCORAParserExtractStructureMetrics:
    """Tests for the ``extract_structure_metrics`` method."""

    def test_extract_structure_metrics_with_sheet(self, tmp_path):
        """Metrics are read from the "Structure" sheet.

        Every cell whose reference starts with "D" holds 1; all others hold
        None.
        """
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        def make_cell(cell_ref):
            if cell_ref.startswith("D"):
                return Mock(value=1)
            return Mock(value=None)

        structure_sheet = Mock()
        structure_sheet.__getitem__ = Mock(side_effect=make_cell)

        def sheet_by_name(name):
            if name == "Structure":
                return structure_sheet
            return None

        workbook = Mock(sheetnames=["Structure"])
        workbook.__getitem__ = Mock(side_effect=sheet_by_name)

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            metrics = parser.extract_structure_metrics()

        for key in ("title_exact_match", "h1_exact", "h2_total"):
            assert metrics[key] == 1

    def test_extract_structure_metrics_missing_sheet_raises_error(self, tmp_path):
        """A workbook without the "Structure" sheet raises ``CORAParseError``."""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        workbook = Mock(sheetnames=[])

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            with pytest.raises(
                CORAParseError,
                match="Required sheet 'Structure' not found",
            ):
                parser.extract_structure_metrics()
class TestCORAParserExtractEntities:
    """Tests for the ``extract_entities`` method."""

    def test_extract_entities_with_threshold(self, tmp_path):
        """Only some entities are kept, depending on the number in the last cell.

        Rows scored -0.2 and -0.3 are expected in the result and the row
        scored -0.1 is not. The exact cut-off lives in the parser and is not
        visible here.
        """
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        # One filler mock is reused for all eight in-between cells.
        filler = Mock()

        def entity_row(name, score):
            # Ten cells: the name first, the score last, filler in between.
            return (Mock(value=name), *([filler] * 8), Mock(value=score))

        rows = [
            entity_row("entity1", -0.2),
            entity_row("entity2", -0.3),
            entity_row("entity3", -0.1),
        ]
        entities_sheet = Mock()
        entities_sheet.iter_rows = Mock(return_value=rows)

        def sheet_by_name(name):
            if name == "Entities":
                return entities_sheet
            return None

        workbook = Mock(sheetnames=["Entities"])
        workbook.__getitem__ = Mock(side_effect=sheet_by_name)

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            entities = parser.extract_entities()

        assert len(entities) == 2
        assert "entity1" in entities
        assert "entity2" in entities
        assert "entity3" not in entities

    def test_extract_entities_returns_empty_when_no_sheet(self, tmp_path):
        """A workbook without an "Entities" sheet yields an empty list."""
        workbook_path = tmp_path / "test.xlsx"
        workbook_path.touch()

        workbook = Mock(sheetnames=[])

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            entities = parser.extract_entities()

        assert entities == []
class TestCORAParserParseSpintax:
    """Tests for the ``parse_spintax_to_list`` method.

    The parser is created with ``__new__``, which receives no file path, so
    these tests create no workbook file on disk.
    """

    def test_parse_spintax_with_braces(self):
        """A brace-wrapped, pipe-separated string yields its three terms."""
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            result = parser.parse_spintax_to_list("{term1|term2|term3}")

        assert len(result) == 3
        for term in ("term1", "term2", "term3"):
            assert term in result

    def test_parse_spintax_without_braces(self):
        """A pipe-separated string without braces still yields three items."""
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            result = parser.parse_spintax_to_list("term1|term2|term3")

        assert len(result) == 3

    def test_parse_spintax_returns_empty_on_none(self):
        """``None`` input yields an empty list."""
        with patch('openpyxl.load_workbook'):
            parser = CORAParser.__new__(CORAParser)
            result = parser.parse_spintax_to_list(None)

        assert result == []
class TestCORAParserParse:
    """Tests for the full ``parse`` method."""

    @staticmethod
    def _sheet_from_factory(cell_factory):
        """Return a mock sheet whose item lookup calls ``cell_factory(cell_ref)``."""
        sheet = Mock()
        sheet.__getitem__ = Mock(side_effect=cell_factory)
        return sheet

    @staticmethod
    def _workbook_with(sheets):
        """Return a mock workbook serving ``sheets`` by name (``None`` if absent)."""
        workbook = Mock(sheetnames=list(sheets), close=Mock())
        workbook.__getitem__ = Mock(side_effect=sheets.get)
        return workbook

    def test_parse_full_file(self, tmp_path):
        """A workbook with all three sheets produces the combined result dict."""
        workbook_path = tmp_path / "shaft_machining_goog_test.xlsx"
        workbook_path.touch()

        overview_values = {
            "B5": "shaft machining",
            "D24": 1500,
            "D31": 4,
            "D46": 0.12,
            "D47": 0.08,
            "D48": 0.06,
            "B10": "{term1|term2}",
        }
        overview_sheet = self._sheet_from_factory(
            lambda cell_ref: Mock(value=overview_values.get(cell_ref))
        )

        # Every cell on the structure sheet holds 1.
        structure_sheet = self._sheet_from_factory(lambda cell_ref: Mock(value=1))

        # A single ten-cell row: name first, numeric value last, filler between.
        filler = Mock()
        entities_sheet = Mock()
        entities_sheet.iter_rows = Mock(
            return_value=[
                (Mock(value="entity1"), *([filler] * 8), Mock(value=-0.2)),
            ]
        )

        workbook = self._workbook_with(
            {
                "Strategic Overview": overview_sheet,
                "Structure": structure_sheet,
                "Entities": entities_sheet,
            }
        )

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            parsed = parser.parse()

        assert parsed["main_keyword"] == "shaft machining"
        assert parsed["word_count"] == 1500
        assert parsed["term_frequency"] == 4
        assert parsed["related_search_density"] == 0.12
        assert len(parsed["entities"]) == 1
        assert "entity1" in parsed["entities"]
        assert len(parsed["related_searches"]) == 2
        assert parsed["custom_anchor_text"] == []

    def test_parse_with_custom_anchors(self, tmp_path):
        """Anchor text passed to ``parse`` is returned under ``custom_anchor_text``."""
        workbook_path = tmp_path / "test_goog_test.xlsx"
        workbook_path.touch()

        overview_values = {"B5": "test", "D24": 1000, "D31": 2}
        overview_sheet = self._sheet_from_factory(
            lambda cell_ref: Mock(value=overview_values.get(cell_ref))
        )

        # Every cell on the structure sheet is empty (None).
        structure_sheet = self._sheet_from_factory(lambda cell_ref: Mock(value=None))

        # No "Entities" sheet in this workbook.
        workbook = self._workbook_with(
            {
                "Strategic Overview": overview_sheet,
                "Structure": structure_sheet,
            }
        )

        with patch('openpyxl.load_workbook', return_value=workbook):
            parser = CORAParser(str(workbook_path))
            parsed = parser.parse(custom_anchor_text=["anchor1", "anchor2"])

        assert parsed["custom_anchor_text"] == ["anchor1", "anchor2"]
        assert parsed["main_keyword"] == "test"