# Big-Link-Man/tests/unit/test_rule_engine.py
#
# 452 lines
# 15 KiB
# Python

"""
Unit tests for content rule engine
"""
import pytest
from unittest.mock import Mock
from src.generation.rule_engine import (
ContentRuleEngine,
ContentHTMLParser,
ValidationResult,
ValidationIssue
)
from src.database.models import Project
from src.core.config import Config
@pytest.fixture
def mock_config():
    """Provide a stand-in configuration object for the rule engine tests.

    The returned ``Mock`` exposes ``get(key, default)`` and serves two
    sections: ``content_rules.universal`` and
    ``content_rules.cora_validation``. Any other key yields the supplied
    default, or a new empty dict when no default is given.
    """
    sections = {
        "content_rules.universal": {
            "min_content_length": 1000,
            "max_content_length": 5000,
            "title_exact_match_required": True,
            "h1_exact_match_required": True,
            "h2_exact_match_min": 1,
            "h3_exact_match_min": 1,
            "faq_section_required": True,
            "image_alt_text_keyword_required": True,
            "image_alt_text_entity_required": True,
        },
        "content_rules.cora_validation": {
            "enabled": True,
            "tier_1_strict": True,
            "tier_2_plus_warn_only": True,
            "round_averages_down": True,
        },
    }
    # Sentinel distinguishes "no default passed" from an explicit ``None``.
    unset = object()

    def lookup(key, default=unset):
        if key in sections:
            # Return a copy so one caller cannot alter what the next one sees.
            return dict(sections[key])
        # A fresh dict per call avoids a single mutable default being shared
        # between every lookup that falls through to it.
        return {} if default is unset else default

    config = Mock()
    config.get = Mock(side_effect=lookup)
    return config
@pytest.fixture
def sample_project():
    """Build a ``Project``-spec'd mock for the keyword "shaft machining".

    The mock is tier 1 and carries entity and related-search lists plus the
    per-heading (h1/h2/h3) count targets used by the validation tests.
    """
    attributes = {
        "id": 1,
        "main_keyword": "shaft machining",
        "tier": 1,
        "entities": ["CNC", "lathe", "precision"],
        "related_searches": ["shaft machining process", "machining techniques"],
        "h1_exact": 1,
        "h1_related_search": 0,
        "h1_entities": 1,
        "h2_total": 5,
        "h2_exact": 1,
        "h2_related_search": 2,
        "h2_entities": 2,
        "h3_total": 8,
        "h3_exact": 1,
        "h3_related_search": 3,
        "h3_entities": 3,
    }
    project = Mock(spec=Project)
    # Assign in declaration order, exactly as plain attribute assignment would.
    for field_name, value in attributes.items():
        setattr(project, field_name, value)
    return project
class TestContentHTMLParser:
    """Checks what ContentHTMLParser collects from small HTML documents."""

    @staticmethod
    def _parsed(markup):
        """Return a new ContentHTMLParser after feeding it ``markup``."""
        extractor = ContentHTMLParser()
        extractor.feed(markup)
        return extractor

    def test_parse_title(self):
        """The <title> text is available as ``title``."""
        extractor = self._parsed(
            "<html><head><title>Shaft Machining Guide</title></head></html>"
        )
        assert extractor.title == "Shaft Machining Guide"

    def test_parse_meta_description(self):
        """The description meta tag's content is available as ``meta_description``."""
        extractor = self._parsed(
            '<html><head><meta name="description" content="Complete guide to shaft machining"></head></html>'
        )
        assert extractor.meta_description == "Complete guide to shaft machining"

    def test_parse_headings(self):
        """h1/h2/h3 headings are collected into separate lists."""
        markup = """
<html><body>
<h1>Main Heading about Shaft Machining</h1>
<h2>Understanding CNC</h2>
<h2>Shaft Machining Process</h2>
<h3>What is a lathe?</h3>
<h3>Precision techniques</h3>
<h3>FAQ about shaft machining</h3>
</body></html>
"""
        extractor = self._parsed(markup)
        top_level = extractor.h1_tags
        assert len(top_level) == 1
        assert "Shaft Machining" in top_level[0]
        assert len(extractor.h2_tags) == 2
        assert len(extractor.h3_tags) == 3

    def test_parse_images(self):
        """Each <img> is recorded along with its alt text."""
        markup = """
<html><body>
<img src="image1.jpg" alt="Shaft machining with CNC lathe">
<img src="image2.jpg" alt="Precision tools">
</body></html>
"""
        pictures = self._parsed(markup).images
        assert len(pictures) == 2
        first, second = pictures[0], pictures[1]
        assert first["alt"] == "Shaft machining with CNC lathe"
        assert second["alt"] == "Precision tools"

    def test_parse_links(self):
        """Each <a> is recorded with its href and anchor text."""
        markup = """
<html><body>
<a href="/home">Home Page</a>
<a href="/article">Related Article</a>
</body></html>
"""
        anchors = self._parsed(markup).links
        assert len(anchors) == 2
        leading = anchors[0]
        assert leading["href"] == "/home"
        assert "Home Page" in leading["text"]

    def test_parse_text_content(self):
        """Visible text from the body is accumulated in ``text_content``."""
        markup = """
<html><body>
<h1>Title</h1>
<p>This is some content about shaft machining and CNC operations.</p>
<p>More content here with precision lathe work.</p>
</body></html>
"""
        body_text = self._parsed(markup).text_content
        assert "shaft machining" in body_text.lower()
        assert "CNC" in body_text
        assert len(body_text.split()) > 10
class TestValidationResult:
    """Exercises the ValidationResult container and its serialisation."""

    def test_initial_state(self):
        """A new result keeps the given ``passed`` flag and holds no issues."""
        outcome = ValidationResult(passed=True)
        assert outcome.passed is True
        assert len(outcome.errors) == 0
        assert len(outcome.warnings) == 0

    def test_add_error(self):
        """Adding an error flips ``passed`` to False and records the issue."""
        outcome = ValidationResult(passed=True)
        outcome.add_error("test_rule", "Test error", expected=5, actual=3)
        assert outcome.passed is False
        assert len(outcome.errors) == 1
        recorded = outcome.errors[0]
        assert recorded.rule_name == "test_rule"
        assert recorded.severity == "error"

    def test_add_warning(self):
        """Adding a warning records it without changing ``passed``."""
        outcome = ValidationResult(passed=True)
        outcome.add_warning("test_rule", "Test warning", expected=5, actual=4)
        assert outcome.passed is True
        assert len(outcome.warnings) == 1
        recorded = outcome.warnings[0]
        assert recorded.severity == "warning"

    def test_to_dict(self):
        """``to_dict`` exposes the flag plus errors and warnings keyed by rule."""
        outcome = ValidationResult(passed=False)
        outcome.add_error("rule1", "Error message", expected=5, actual=3)
        outcome.add_warning("rule2", "Warning message", expected=10, actual=8)
        payload = outcome.to_dict()
        assert payload["passed"] is False
        error_entries = payload["errors"]
        warning_entries = payload["warnings"]
        assert len(error_entries) == 1
        assert len(warning_entries) == 1
        assert error_entries[0]["rule"] == "rule1"
        assert warning_entries[0]["rule"] == "rule2"
class TestUniversalRules:
    """Each test feeds the engine a page that breaks one universal rule."""

    # Body text padding used to push pages past the minimum content length.
    _FILLER = "word " * 1500

    def test_content_length_validation(self, mock_config, sample_project):
        """A page with almost no text fails with a "too short" error."""
        engine = ContentRuleEngine(mock_config)
        page = "<html><body><h1>Shaft machining</h1><p>Short content.</p></body></html>"
        outcome = engine.validate(page, sample_project)
        assert not outcome.passed
        assert any("too short" in issue.message for issue in outcome.errors)

    def test_title_keyword_required(self, mock_config, sample_project):
        """A title lacking the main keyword yields a title-related error."""
        engine = ContentRuleEngine(mock_config)
        page = (
            "<html><head><title>Generic Title</title></head><body>"
            + self._FILLER
            + "</body></html>"
        )
        outcome = engine.validate(page, sample_project)
        assert any("title" in issue.rule_name.lower() for issue in outcome.errors)

    def test_h1_keyword_required(self, mock_config, sample_project):
        """An H1 lacking the main keyword yields an h1-related error."""
        engine = ContentRuleEngine(mock_config)
        page = (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Generic Heading</h1>
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )
        outcome = engine.validate(page, sample_project)
        assert any("h1" in issue.rule_name.lower() for issue in outcome.errors)

    def test_h2_keyword_minimum(self, mock_config, sample_project):
        """No H2 containing the keyword yields an h2_exact_match_min error."""
        engine = ContentRuleEngine(mock_config)
        page = (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Shaft Machining Basics</h1>
<h2>Generic Topic</h2>
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )
        outcome = engine.validate(page, sample_project)
        assert any(
            "h2_exact_match_min" in issue.rule_name for issue in outcome.errors
        )

    def test_faq_section_required(self, mock_config, sample_project):
        """A page without any FAQ heading yields a faq-related error."""
        engine = ContentRuleEngine(mock_config)
        page = (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Shaft Machining Basics</h1>
<h2>Shaft Machining Process</h2>
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )
        outcome = engine.validate(page, sample_project)
        assert any("faq" in issue.rule_name.lower() for issue in outcome.errors)

    def test_image_alt_text_validation(self, mock_config, sample_project):
        """An image with generic alt text yields an image_alt_text error."""
        engine = ContentRuleEngine(mock_config)
        page = (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Shaft Machining Basics</h1>
<h2>FAQ about shaft machining</h2>
<h2>Shaft Machining Techniques</h2>
<h3>What is shaft machining?</h3>
<img src="test.jpg" alt="Generic image">
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )
        outcome = engine.validate(page, sample_project)
        assert any("image_alt_text" in issue.rule_name for issue in outcome.errors)
class TestCORAValidation:
    """Checks how CORA heading targets are enforced for different tiers."""

    # Body text padding used to push pages past the minimum content length.
    _FILLER = "word " * 1500

    def _page_with_two_h2(self):
        """Return the page shared by the tier tests; it has exactly two H2s."""
        return (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Shaft Machining with CNC</h1>
<h2>Shaft Machining Process</h2>
<h2>Understanding CNC</h2>
<h3>What is shaft machining?</h3>
<h3>FAQ</h3>
<img src="test.jpg" alt="Shaft machining with CNC lathe">
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )

    def test_tier_1_strict_validation(self, mock_config, sample_project):
        """Tier 1: two H2s against a target of five is reported as an error."""
        engine = ContentRuleEngine(mock_config)
        sample_project.tier = 1
        sample_project.h2_total = 5
        outcome = engine.validate(self._page_with_two_h2(), sample_project)
        total_errors = [
            issue for issue in outcome.errors if "h2_total" in issue.rule_name
        ]
        assert len(total_errors) > 0
        leading = total_errors[0]
        assert leading.expected == 5
        assert leading.actual == 2

    def test_tier_2_warning_only(self, mock_config, sample_project):
        """Tier 2: the same shortfall is a warning and never an error."""
        engine = ContentRuleEngine(mock_config)
        sample_project.tier = 2
        sample_project.h2_total = 5
        outcome = engine.validate(self._page_with_two_h2(), sample_project)
        total_warnings = [
            issue for issue in outcome.warnings if "h2_total" in issue.rule_name
        ]
        assert len(total_warnings) > 0
        total_errors = [
            issue for issue in outcome.errors if "h2_total" in issue.rule_name
        ]
        assert len(total_errors) == 0

    def test_keyword_entity_counting(self, mock_config, sample_project):
        """``_count_keyword_entities`` tallies keyword/entity hits per heading level."""
        engine = ContentRuleEngine(mock_config)
        page = (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Shaft Machining Basics</h1>
<h2>Shaft Machining Process</h2>
<h2>Understanding CNC Operations</h2>
<h2>Working with Precision Lathe</h2>
<h3>What is shaft machining?</h3>
<h3>CNC Techniques</h3>
<h3>FAQ</h3>
<img src="test.jpg" alt="Shaft machining with CNC">
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )
        extractor = ContentHTMLParser()
        extractor.feed(page)
        tallies = engine._count_keyword_entities(extractor, sample_project)
        assert tallies["h1_exact"] == 1
        assert tallies["h2_exact"] == 1
        assert tallies["h2_entities"] >= 2
        assert tallies["h3_exact"] == 1

    def test_round_averages_down(self, mock_config, sample_project):
        """A fractional h2_total of 5.6 must surface as an expected value of 5."""
        engine = ContentRuleEngine(mock_config)
        sample_project.h2_total = 5.6
        page = (
            """
<html>
<head><title>Shaft Machining Guide</title></head>
<body>
<h1>Shaft Machining with CNC</h1>
<h2>Shaft Machining Process</h2>
<h2>Understanding CNC</h2>
<h2>Lathe Operations</h2>
<h2>Precision Work</h2>
<h2>Best Practices</h2>
<h3>What is shaft machining?</h3>
<h3>FAQ</h3>
<img src="test.jpg" alt="Shaft machining with CNC">
<p>"""
            + self._FILLER
            + """</p>
</body>
</html>
"""
        )
        outcome = engine.validate(page, sample_project)
        total_errors = [
            issue for issue in outcome.errors if "h2_total" in issue.rule_name
        ]
        # Only checked when an h2_total error is reported; with none reported
        # the loop body never runs and the test passes.
        for issue in total_errors[:1]:
            assert issue.expected == 5
class TestValidContent:
    """Covers a page that is expected to satisfy every rule."""

    def test_fully_compliant_content(self, mock_config, sample_project):
        """A keyword-rich page with headings, FAQ and images passes cleanly."""
        engine = ContentRuleEngine(mock_config)
        # Long keyword-bearing paragraph appended as the final <p> of the page.
        body_padding = " ".join(
            ["shaft machining process details and information"] * 250
        )
        page = (
            """
<html>
<head><title>Complete Guide to Shaft Machining</title></head>
<body>
<h1>Shaft Machining: CNC Operations</h1>
<h2>Shaft Machining Process Explained</h2>
<p>Content about the main process...</p>
<h2>Understanding CNC Technology</h2>
<p>More content...</p>
<h2>Working with Precision Lathe</h2>
<p>Additional information...</p>
<h2>Shaft Machining Techniques</h2>
<p>Techniques details...</p>
<h2>Best Practices in CNC</h2>
<p>Best practices...</p>
<h3>What is shaft machining?</h3>
<p>Definition and explanation...</p>
<h3>CNC Lathe Operations</h3>
<p>Operations details...</p>
<h3>Precision Techniques</h3>
<p>Techniques information...</p>
<h3>Shaft Machining Process Guide</h3>
<p>Process details...</p>
<h3>Understanding Machining Techniques</h3>
<p>Techniques overview...</p>
<h3>CNC Setup and Shaft Machining Process</h3>
<p>Setup instructions...</p>
<h3>Lathe Maintenance for Machining Techniques</h3>
<p>Maintenance tips...</p>
<h3>FAQ: Common Questions about Shaft Machining</h3>
<p>Frequently asked questions...</p>
<img src="image1.jpg" alt="Shaft machining with CNC lathe">
<img src="image2.jpg" alt="Precision shaft machining setup">
<p>"""
            + body_padding
            + """</p>
</body>
</html>
"""
        )
        outcome = engine.validate(page, sample_project)
        assert outcome.passed is True
        assert len(outcome.errors) == 0