"""
Unit tests for CORA parser module
"""
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch

import pytest

from src.ingestion.parser import CORAParser, CORAParseError


# ---------------------------------------------------------------------------
# Shared test helpers (underscore-prefixed so pytest does not collect them)
# ---------------------------------------------------------------------------

def _touch_workbook_file(tmp_path, name="test.xlsx"):
    """Create an empty placeholder file under ``tmp_path`` and return its path.

    The file only needs to exist on disk; its contents are never read because
    the tests that use it patch ``openpyxl.load_workbook``.
    """
    path = tmp_path / name
    path.touch()
    return path


def _bare_parser():
    """Return a CORAParser allocated via ``__new__`` so ``__init__`` is skipped.

    Because ``__init__`` does not run, no file is opened and there is nothing
    to patch; tests set only the attributes the method under test reads.
    """
    return CORAParser.__new__(CORAParser)


def _make_sheet(value_for):
    """Build a mock worksheet whose ``sheet[ref]`` returns a mock cell.

    Args:
        value_for: callable mapping a cell reference (e.g. ``"D24"``) to the
            value exposed as ``cell.value``. ``some_dict.get`` works for
            sparse sheets where unknown cells should read as ``None``.
    """
    sheet = Mock()

    # Magic methods assigned as plain functions on a Mock receive the mock
    # itself as the first argument.
    def _getitem(self, cell_ref):
        cell = Mock()
        cell.value = value_for(cell_ref)
        return cell

    sheet.__getitem__ = _getitem
    return sheet


def _make_workbook(sheets):
    """Build a mock workbook exposing ``sheets`` (a name -> sheet mapping).

    ``sheetnames`` lists the mapping's keys in order; ``workbook[name]``
    returns the mapped sheet, or ``None`` for a name that is not present.
    """
    workbook = Mock()
    workbook.sheetnames = list(sheets)
    workbook.__getitem__ = lambda self, name: sheets.get(name)
    return workbook


def _empty_workbook():
    """Build a mock workbook with no sheets and no ``__getitem__`` configured."""
    workbook = Mock()
    workbook.sheetnames = []
    return workbook


def _entity_row(name, score):
    """Build a 10-cell mock row: name in the first cell, score in the last.

    The eight cells in between are independent placeholder mocks so that no
    state is accidentally shared between columns or rows.
    """
    filler = [Mock() for _ in range(8)]
    return (Mock(value=name), *filler, Mock(value=score))


class TestCORAParserInit:
    """Tests for CORAParser initialization"""

    def test_parser_init_with_valid_file(self, tmp_path):
        """Test parser initialization with valid file"""
        test_file = _touch_workbook_file(tmp_path)

        with patch('openpyxl.load_workbook'):
            parser = CORAParser(str(test_file))
            assert parser.file_path == test_file

    def test_parser_init_file_not_found(self, tmp_path):
        """Test parser raises error for non-existent file"""
        # Anchor the path in tmp_path so the test cannot be affected by
        # whatever happens to live in the current working directory.
        missing_file = tmp_path / "nonexistent_file.xlsx"

        with pytest.raises(CORAParseError, match="File not found"):
            CORAParser(str(missing_file))

    def test_parser_init_invalid_excel_file(self, tmp_path):
        """Test parser raises error for invalid Excel file"""
        test_file = tmp_path / "invalid.xlsx"
        test_file.write_text("not an excel file")

        # No patching here: the real loader is expected to reject the file.
        with pytest.raises(CORAParseError, match="Failed to open Excel file"):
            CORAParser(str(test_file))


class TestCORAParserCellValue:
    """Tests for _get_cell_value helper method"""

    def test_get_cell_value_returns_value(self):
        """Test getting valid cell value"""
        sheet = _make_sheet(lambda _ref: "test value")

        result = _bare_parser()._get_cell_value(sheet, "A1")

        assert result == "test value"

    def test_get_cell_value_returns_default_on_none(self):
        """Test default returned when cell is None"""
        sheet = _make_sheet(lambda _ref: None)

        result = _bare_parser()._get_cell_value(sheet, "A1", default="default")

        assert result == "default"

    def test_get_cell_value_returns_default_on_zero(self):
        """Test default returned when cell is zero"""
        sheet = _make_sheet(lambda _ref: 0)

        result = _bare_parser()._get_cell_value(sheet, "A1", default=100)

        assert result == 100

    def test_get_cell_value_returns_default_on_empty_string(self):
        """Test default returned when cell is empty string"""
        sheet = _make_sheet(lambda _ref: " ")

        result = _bare_parser()._get_cell_value(sheet, "A1", default="default")

        assert result == "default"

    def test_get_cell_value_returns_default_on_exception(self):
        """Test default returned when exception occurs"""
        sheet = Mock()
        sheet.__getitem__ = Mock(side_effect=Exception("error"))

        result = _bare_parser()._get_cell_value(sheet, "A1", default="default")

        assert result == "default"


class TestCORAParserGetSheet:
    """Tests for _get_sheet helper method"""

    def test_get_sheet_returns_sheet_when_exists(self):
        """Test getting existing sheet"""
        mock_sheet = Mock()
        mock_workbook = Mock()
        mock_workbook.sheetnames = ["Sheet1", "Sheet2"]
        mock_workbook.__getitem__ = Mock(return_value=mock_sheet)

        parser = _bare_parser()
        parser.workbook = mock_workbook

        result = parser._get_sheet("Sheet1")

        assert result == mock_sheet

    def test_get_sheet_raises_error_when_required_not_found(self):
        """Test error raised for missing required sheet"""
        # __getitem__ is intentionally left unconfigured on this workbook.
        mock_workbook = Mock()
        mock_workbook.sheetnames = ["Sheet1"]

        parser = _bare_parser()
        parser.workbook = mock_workbook

        with pytest.raises(CORAParseError, match="Required sheet"):
            parser._get_sheet("MissingSheet", required=True)

    def test_get_sheet_returns_none_when_optional_not_found(self):
        """Test None returned for missing optional sheet"""
        # __getitem__ is intentionally left unconfigured on this workbook.
        mock_workbook = Mock()
        mock_workbook.sheetnames = ["Sheet1"]

        parser = _bare_parser()
        parser.workbook = mock_workbook

        result = parser._get_sheet("MissingSheet", required=False)

        assert result is None


class TestCORAParserExtractMainKeyword:
    """Tests for extract_main_keyword method"""

    def test_extract_keyword_from_sheet(self, tmp_path):
        """Test extracting keyword from Strategic Overview B5"""
        test_file = _touch_workbook_file(tmp_path)

        mock_cell = Mock()
        mock_cell.value = "test keyword"
        mock_sheet = Mock()
        mock_sheet.__getitem__ = Mock(return_value=mock_cell)

        mock_workbook = Mock()
        mock_workbook.sheetnames = ["Strategic Overview"]
        mock_workbook.__getitem__ = Mock(return_value=mock_sheet)

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.extract_main_keyword()

            assert result == "test keyword"

    def test_extract_keyword_from_filename(self, tmp_path):
        """Test extracting keyword from filename when sheet not available"""
        test_file = _touch_workbook_file(
            tmp_path, "shaft_machining_goog_251011_C_US_L_EN_M3P1A_GMW.xlsx"
        )

        with patch('openpyxl.load_workbook', return_value=_empty_workbook()):
            parser = CORAParser(str(test_file))
            result = parser.extract_main_keyword()

            assert result == "shaft machining"


class TestCORAParserExtractStrategicOverview:
    """Tests for extract_strategic_overview method"""

    def test_extract_strategic_overview_with_sheet(self, tmp_path):
        """Test extracting data from Strategic Overview sheet"""
        test_file = _touch_workbook_file(tmp_path)

        cell_values = {
            "D24": 2000,
            "D31": 5,
            "D46": 0.15,
            "D47": 0.10,
            "D48": 0.05,
            "B10": "{term1|term2|term3}",
        }
        mock_workbook = _make_workbook(
            {"Strategic Overview": _make_sheet(cell_values.get)}
        )

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.extract_strategic_overview()

            assert result["word_count"] == 2000
            assert result["term_frequency"] == 5
            assert result["related_search_density"] == 0.15
            assert result["entity_density"] == 0.10
            assert result["lsi_density"] == 0.05
            assert result["spintax_related_search_terms"] == "{term1|term2|term3}"

    def test_extract_strategic_overview_missing_sheet_raises_error(self, tmp_path):
        """Test error raised when Strategic Overview sheet not available"""
        test_file = _touch_workbook_file(tmp_path)

        with patch('openpyxl.load_workbook', return_value=_empty_workbook()):
            parser = CORAParser(str(test_file))

            with pytest.raises(
                CORAParseError,
                match="Required sheet 'Strategic Overview' not found",
            ):
                parser.extract_strategic_overview()


class TestCORAParserExtractStructureMetrics:
    """Tests for extract_structure_metrics method"""

    def test_extract_structure_metrics_with_sheet(self, tmp_path):
        """Test extracting data from Structure sheet"""
        test_file = _touch_workbook_file(tmp_path)

        # Every cell in column D reads as 1; all other cells are empty.
        structure_sheet = _make_sheet(
            lambda cell_ref: 1 if cell_ref.startswith("D") else None
        )
        mock_workbook = _make_workbook({"Structure": structure_sheet})

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.extract_structure_metrics()

            assert result["title_exact_match"] == 1
            assert result["h1_exact"] == 1
            assert result["h2_total"] == 1

    def test_extract_structure_metrics_missing_sheet_raises_error(self, tmp_path):
        """Test error raised when Structure sheet not available"""
        test_file = _touch_workbook_file(tmp_path)

        with patch('openpyxl.load_workbook', return_value=_empty_workbook()):
            parser = CORAParser(str(test_file))

            with pytest.raises(
                CORAParseError, match="Required sheet 'Structure' not found"
            ):
                parser.extract_structure_metrics()


class TestCORAParserExtractEntities:
    """Tests for extract_entities method"""

    def test_extract_entities_with_threshold(self, tmp_path):
        """Test extracting entities below threshold"""
        test_file = _touch_workbook_file(tmp_path)

        entities_sheet = Mock()
        entities_sheet.iter_rows = Mock(return_value=[
            _entity_row("entity1", -0.2),
            _entity_row("entity2", -0.3),
            _entity_row("entity3", -0.1),
        ])
        mock_workbook = _make_workbook({"Entities": entities_sheet})

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.extract_entities()

            assert len(result) == 2
            assert "entity1" in result
            assert "entity2" in result
            assert "entity3" not in result

    def test_extract_entities_returns_empty_when_no_sheet(self, tmp_path):
        """Test empty list when sheet not available"""
        test_file = _touch_workbook_file(tmp_path)

        with patch('openpyxl.load_workbook', return_value=_empty_workbook()):
            parser = CORAParser(str(test_file))
            result = parser.extract_entities()

            assert result == []


class TestCORAParserParseSpintax:
    """Tests for parse_spintax_to_list method"""

    def test_parse_spintax_with_braces(self):
        """Test parsing spintax with braces"""
        result = _bare_parser().parse_spintax_to_list("{term1|term2|term3}")

        assert len(result) == 3
        assert "term1" in result
        assert "term2" in result
        assert "term3" in result

    def test_parse_spintax_without_braces(self):
        """Test parsing spintax without braces"""
        result = _bare_parser().parse_spintax_to_list("term1|term2|term3")

        assert len(result) == 3

    def test_parse_spintax_returns_empty_on_none(self):
        """Test empty list returned for None"""
        result = _bare_parser().parse_spintax_to_list(None)

        assert result == []


class TestCORAParserParse:
    """Tests for full parse method"""

    def test_parse_full_file(self, tmp_path):
        """Test parsing complete CORA file"""
        test_file = _touch_workbook_file(tmp_path, "shaft_machining_goog_test.xlsx")

        overview_values = {
            "B5": "shaft machining",
            "D24": 1500,
            "D31": 4,
            "D46": 0.12,
            "D47": 0.08,
            "D48": 0.06,
            "B10": "{term1|term2}",
        }
        entities_sheet = Mock()
        entities_sheet.iter_rows = Mock(
            return_value=[_entity_row("entity1", -0.2)]
        )
        mock_workbook = _make_workbook({
            "Strategic Overview": _make_sheet(overview_values.get),
            "Structure": _make_sheet(lambda _ref: 1),
            "Entities": entities_sheet,
        })

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.parse()

            assert result["main_keyword"] == "shaft machining"
            assert result["word_count"] == 1500
            assert result["term_frequency"] == 4
            assert result["related_search_density"] == 0.12
            assert len(result["entities"]) == 1
            assert "entity1" in result["entities"]
            assert len(result["related_searches"]) == 2
            assert result["custom_anchor_text"] == []

    def test_parse_with_custom_anchors(self, tmp_path):
        """Test parsing with custom anchor text"""
        test_file = _touch_workbook_file(tmp_path, "test_goog_test.xlsx")

        overview_values = {"B5": "test", "D24": 1000, "D31": 2}
        mock_workbook = _make_workbook({
            "Strategic Overview": _make_sheet(overview_values.get),
            "Structure": _make_sheet(lambda _ref: None),
        })

        with patch('openpyxl.load_workbook', return_value=mock_workbook):
            parser = CORAParser(str(test_file))
            result = parser.parse(custom_anchor_text=["anchor1", "anchor2"])

            assert result["custom_anchor_text"] == ["anchor1", "anchor2"]
            assert result["main_keyword"] == "test"