# File listing metadata (from the code viewer): 411 lines, 16 KiB, Python
"""
|
|
Unit tests for content injection module
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import Mock, MagicMock, patch
|
|
from src.interlinking.content_injection import (
|
|
inject_interlinks,
|
|
_inject_tiered_links,
|
|
_inject_homepage_link,
|
|
_inject_see_also_section,
|
|
_get_anchor_texts_for_tier,
|
|
_try_inject_link,
|
|
_find_and_wrap_anchor_text,
|
|
_insert_link_into_random_paragraph,
|
|
_extract_homepage_url,
|
|
_insert_before_closing_tags
|
|
)
|
|
from src.database.models import GeneratedContent, Project
|
|
|
|
|
|
@pytest.fixture
def mock_project():
    """Provide a Mock constrained to the Project spec with keyword data filled in.

    The mock carries an id, a main keyword, related searches and entities so
    tests can pass it wherever a Project is expected.
    """
    # Attributes are configured through the Mock constructor; with ``spec``
    # set, each one must exist on Project or Mock raises AttributeError.
    return Mock(
        spec=Project,
        id=1,
        main_keyword="shaft machining",
        related_searches=["cnc shaft machining", "precision shaft machining"],
        entities=["lathe", "milling", "CNC"],
    )
|
|
|
|
|
|
@pytest.fixture
def mock_content():
    """Provide a Mock constrained to the GeneratedContent spec.

    The mock represents a tier1 article (id 1, project 1) whose HTML body is a
    single paragraph mentioning "shaft machining" twice.
    """
    article_html = (
        "<p>Shaft machining is an important process. "
        "Learn about shaft machining here.</p>"
    )
    return Mock(
        spec=GeneratedContent,
        id=1,
        project_id=1,
        tier="tier1",
        title="Guide to Shaft Machining",
        content=article_html,
    )
|
|
|
|
|
|
@pytest.fixture
def mock_content_repo():
    """Provide a stand-in content repository whose ``update`` returns None."""
    update_stub = Mock(return_value=None)
    return Mock(update=update_stub)
|
|
|
|
|
|
@pytest.fixture
def mock_link_repo():
    """Provide a stand-in link repository whose ``create`` returns None."""
    create_stub = Mock(return_value=None)
    return Mock(create=create_stub)
|
|
|
|
|
|
class TestExtractHomepageUrl:
    """Checks that _extract_homepage_url maps an article URL to its site root."""

    def test_extract_from_https_url(self):
        """An https article URL yields the https root with a trailing slash."""
        homepage = _extract_homepage_url("https://example.com/article-slug.html")
        assert homepage == "https://example.com/"

    def test_extract_from_http_url(self):
        """The http scheme is kept in the extracted root."""
        homepage = _extract_homepage_url("http://example.com/article.html")
        assert homepage == "http://example.com/"

    def test_extract_from_cdn_url(self):
        """A CDN-style hostname is kept intact."""
        homepage = _extract_homepage_url("https://site.b-cdn.net/my-article.html")
        assert homepage == "https://site.b-cdn.net/"

    def test_extract_from_custom_domain(self):
        """A nested path is dropped and the www host is kept."""
        homepage = _extract_homepage_url(
            "https://www.custom.com/path/to/article.html"
        )
        assert homepage == "https://www.custom.com/"

    def test_extract_with_port(self):
        """An explicit port stays part of the extracted root."""
        homepage = _extract_homepage_url("https://example.com:8080/article.html")
        assert homepage == "https://example.com:8080/"
|
|
|
|
|
|
class TestInsertBeforeClosingTags:
    """Checks that _insert_before_closing_tags places a snippet into the HTML."""

    def test_insert_after_last_paragraph(self):
        """The snippet is present and appears after the last paragraph's text."""
        snippet = "<h3>New Section</h3>"
        page = "<p>First paragraph</p><p>Last paragraph</p>"

        merged = _insert_before_closing_tags(page, snippet)

        assert snippet in merged
        assert merged.index("Last paragraph") < merged.index(snippet)

    def test_insert_with_body_tag(self):
        """The snippet is present when the markup is wrapped in a body tag."""
        snippet = "<h3>See Also</h3>"
        merged = _insert_before_closing_tags("<body><p>Content</p></body>", snippet)
        assert snippet in merged

    def test_insert_with_no_paragraphs(self):
        """The snippet is present even when the markup has no <p> elements."""
        snippet = "<h3>Section</h3>"
        merged = _insert_before_closing_tags("<div>Some content</div>", snippet)
        assert snippet in merged
|
|
|
|
|
|
class TestFindAndWrapAnchorText:
    """Checks how _find_and_wrap_anchor_text locates a phrase and links it.

    Each test unpacks the function's return value as ``(html, found)``.
    """

    def test_find_exact_match(self):
        """A phrase in the same case as the anchor gets wrapped in a link."""
        url = "https://example.com"
        wrapped, matched = _find_and_wrap_anchor_text(
            "<p>This is about shaft machining processes.</p>",
            "shaft machining",
            url,
        )
        assert matched
        assert f'<a href="{url}">' in wrapped
        assert "shaft machining</a>" in wrapped

    def test_case_insensitive_match(self):
        """A capitalised phrase in the text still matches a lowercase anchor."""
        url = "https://example.com"
        wrapped, matched = _find_and_wrap_anchor_text(
            "<p>This is about Shaft Machining processes.</p>",
            "shaft machining",
            url,
        )
        assert matched
        assert f'<a href="{url}">' in wrapped

    def test_match_within_phrase(self):
        """The anchor matches when it sits inside a longer phrase."""
        url = "https://example.com"
        wrapped, matched = _find_and_wrap_anchor_text(
            "<p>The shaft machining process is complex.</p>",
            "shaft machining",
            url,
        )
        assert matched
        assert f'<a href="{url}">' in wrapped

    def test_no_match(self):
        """With no occurrence the HTML comes back unchanged and found is falsy."""
        page = "<p>This is about something else.</p>"
        wrapped, matched = _find_and_wrap_anchor_text(
            page, "shaft machining", "https://example.com"
        )
        assert not matched
        assert wrapped == page

    def test_skip_existing_links(self):
        """Exactly one new link is added when one occurrence is already linked."""
        url = "https://example.com"
        page = '<p>Read about <a href="other.html">shaft machining</a> here. Also shaft machining is important.</p>'

        wrapped, matched = _find_and_wrap_anchor_text(page, "shaft machining", url)

        assert matched
        # The already-linked first occurrence must be skipped, so only the
        # second occurrence may pick up the new href.
        assert wrapped.count(f'<a href="{url}">') == 1
|
|
|
|
|
|
class TestInsertLinkIntoRandomParagraph:
    """Checks the fallback that drops a ready-made link into a paragraph."""

    def test_insert_into_paragraph(self):
        """A single long paragraph receives the complete anchor element."""
        anchor = "shaft machining"
        url = "https://example.com"
        page = "<p>This is a long paragraph with many words and sentences. It has enough content.</p>"

        updated = _insert_link_into_random_paragraph(page, anchor, url)

        assert f'<a href="{url}">{anchor}</a>' in updated

    def test_insert_with_multiple_paragraphs(self):
        """With several paragraphs the anchor element ends up in the output."""
        anchor = "test link"
        url = "https://example.com"
        page = "<p>First paragraph.</p><p>Second paragraph with more text.</p><p>Third paragraph.</p>"

        updated = _insert_link_into_random_paragraph(page, anchor, url)

        assert f'<a href="{url}">{anchor}</a>' in updated

    def test_no_valid_paragraphs(self):
        """Very short paragraphs: output is either untouched or contains the link."""
        url = "https://example.com"
        page = "<p>Hi</p><p>Ok</p>"

        updated = _insert_link_into_random_paragraph(page, "test", url)

        # NOTE(review): the intent is "return the original HTML when no
        # paragraph qualifies", but this assertion also accepts an inserted
        # link. Tighten once the function's actual contract is confirmed.
        assert updated == page or f'<a href="{url}">' in updated
|
|
|
|
|
|
class TestGetAnchorTextsForTier:
    """Checks how _get_anchor_texts_for_tier combines defaults with job config.

    The default anchor generator is patched inside the content_injection
    module so each test controls what the "default" anchors are.
    """

    def test_default_mode(self, mock_project):
        """Mode "default" returns exactly what the default generator yields."""
        target = 'src.interlinking.content_injection.get_anchor_text_for_tier'
        config = {"anchor_text_config": {"mode": "default"}}

        with patch(target, return_value=["anchor1", "anchor2"]):
            anchors = _get_anchor_texts_for_tier("tier1", mock_project, config)

        assert anchors == ["anchor1", "anchor2"]

    def test_override_mode(self, mock_project):
        """Mode "override" returns the custom list as-is."""
        custom = ["custom anchor 1", "custom anchor 2"]
        config = {"anchor_text_config": {"mode": "override", "custom_text": custom}}

        anchors = _get_anchor_texts_for_tier("tier1", mock_project, config)

        assert anchors == custom

    def test_append_mode(self, mock_project):
        """Mode "append" puts the custom anchors after the default ones."""
        target = 'src.interlinking.content_injection.get_anchor_text_for_tier'
        config = {
            "anchor_text_config": {"mode": "append", "custom_text": ["custom anchor"]}
        }

        with patch(target, return_value=["default1", "default2"]):
            anchors = _get_anchor_texts_for_tier("tier1", mock_project, config)

        assert anchors == ["default1", "default2", "custom anchor"]

    def test_no_config(self, mock_project):
        """A job config of None falls back to the default generator's output."""
        target = 'src.interlinking.content_injection.get_anchor_text_for_tier'

        with patch(target, return_value=["default"]):
            anchors = _get_anchor_texts_for_tier("tier1", mock_project, None)

        assert anchors == ["default"]
|
|
|
|
|
|
class TestTryInjectLink:
    """Checks _try_inject_link, which returns ``(html, injected)``."""

    def test_inject_with_found_anchor(self):
        """A candidate anchor that occurs in the text produces a link."""
        url = "https://example.com"
        candidates = ["shaft machining", "other anchor"]

        updated, did_inject = _try_inject_link(
            "<p>This is about shaft machining here.</p>", candidates, url
        )

        assert did_inject
        assert f'<a href="{url}">' in updated

    def test_inject_with_fallback(self):
        """A link is still injected when no candidate occurs in the text."""
        url = "https://example.com"

        updated, did_inject = _try_inject_link(
            "<p>This is a paragraph about something else entirely.</p>",
            ["shaft machining"],
            url,
        )

        assert did_inject
        assert f'<a href="{url}">' in updated

    def test_no_anchors(self):
        """With no candidates nothing is injected and the HTML is unchanged."""
        page = "<p>Content</p>"

        updated, did_inject = _try_inject_link(page, [], "https://example.com")

        assert not did_inject
        assert updated == page
|
|
|
|
|
|
class TestInjectSeeAlsoSection:
    """Checks the "See Also" list built by _inject_see_also_section."""

    def test_inject_see_also_with_multiple_articles(self, mock_content, mock_link_repo):
        """Sibling articles are listed, the current one is not, links are recorded."""
        mock_content.id = 1
        siblings = [
            {"content_id": 1, "title": "Article 1", "url": "https://example.com/article1.html"},
            {"content_id": 2, "title": "Article 2", "url": "https://example.com/article2.html"},
            {"content_id": 3, "title": "Article 3", "url": "https://example.com/article3.html"},
        ]

        rendered = _inject_see_also_section(
            "<p>Article content here.</p>", mock_content, siblings, mock_link_repo
        )

        assert "<h3>See Also</h3>" in rendered
        assert "<ul>" in rendered
        for expected_title in ("Article 2", "Article 3"):
            assert expected_title in rendered
        # The article being processed (content_id 1) must not link to itself.
        assert "Article 1" not in rendered
        assert mock_link_repo.create.call_count == 2

    def test_inject_see_also_with_single_article(self, mock_content, mock_link_repo):
        """When the only known article is the current one, no section is added."""
        mock_content.id = 1
        page = "<p>Content</p>"
        only_self = [
            {"content_id": 1, "title": "Only Article", "url": "https://example.com/article.html"},
        ]

        rendered = _inject_see_also_section(page, mock_content, only_self, mock_link_repo)

        # With no other articles the heading must be absent; an untouched
        # page satisfies this trivially.
        assert rendered == page or "<h3>See Also</h3>" not in rendered
|
|
|
|
|
|
class TestInjectHomepageLink:
    """Checks _inject_homepage_link, which links "Home" to the site's index page."""

    def test_inject_homepage_link(self, mock_content, mock_project, mock_link_repo):
        """A "Home" word in the text is linked and one homepage link is recorded."""
        page = "<p>This is about content and going Home is great.</p>"

        updated = _inject_homepage_link(
            page,
            mock_content,
            "https://example.com/article.html",
            mock_project,
            mock_link_repo,
        )

        assert '<a href="https://example.com/index.html">' in updated
        assert 'Home</a>' in updated

        mock_link_repo.create.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, recorded_kwargs = mock_link_repo.create.call_args
        assert recorded_kwargs['link_type'] == 'homepage'

    def test_inject_homepage_link_not_found_in_content(self, mock_content, mock_project, mock_link_repo):
        """Without a "Home" word in the text a "Home" link is still inserted."""
        page = "<p>This is about something totally different and unrelated content here.</p>"

        updated = _inject_homepage_link(
            page,
            mock_content,
            "https://www.example.com/article.html",
            mock_project,
            mock_link_repo,
        )

        # The fallback path is expected to insert the "Home" anchor text.
        assert '<a href="https://www.example.com/index.html">' in updated
        assert 'Home</a>' in updated
|
|
|
|
|
|
class TestInjectTieredLinks:
    """Checks _inject_tiered_links for tier-1 and tier-2 configurations."""

    def test_tier1_money_site_link(self, mock_content, mock_project, mock_link_repo):
        """Tier 1 links to the money site and records one 'tiered' link."""
        target = 'src.interlinking.content_injection.get_anchor_text_for_tier'
        link_plan = {"tier": 1, "money_site_url": "https://moneysite.com"}

        with patch(target, return_value=["shaft machining", "machining"]):
            updated = _inject_tiered_links(
                "<p>Learn about shaft machining processes.</p>",
                mock_content,
                link_plan,
                mock_project,
                None,
                mock_link_repo,
            )

        assert '<a href="https://moneysite.com">' in updated

        mock_link_repo.create.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _, recorded_kwargs = mock_link_repo.create.call_args
        assert recorded_kwargs['link_type'] == 'tiered'
        assert recorded_kwargs['to_url'] == 'https://moneysite.com'

    def test_tier2_lower_tier_links(self, mock_content, mock_project, mock_link_repo):
        """Tier 2 records one link per lower-tier URL."""
        target = 'src.interlinking.content_injection.get_anchor_text_for_tier'
        mock_content.tier = "tier2"
        page = "<p>This article discusses shaft machining and CNC processes and precision work.</p>"
        link_plan = {
            "tier": 2,
            "lower_tier": 1,
            "lower_tier_urls": [
                "https://site1.com/article1.html",
                "https://site2.com/article2.html",
            ],
        }

        with patch(target, return_value=["shaft machining", "CNC processes"]):
            # Only the recorded links are checked, so the returned HTML is
            # not kept.
            _inject_tiered_links(
                page, mock_content, link_plan, mock_project, None, mock_link_repo
            )

        # Two lower-tier URLs were supplied, so two link records are expected.
        assert mock_link_repo.create.call_count == 2

    def test_tier1_no_money_site(self, mock_content, mock_project, mock_link_repo):
        """Tier 1 without a money site URL leaves the HTML alone and records nothing."""
        page = "<p>Content</p>"

        updated = _inject_tiered_links(
            page, mock_content, {"tier": 1}, mock_project, None, mock_link_repo
        )

        assert updated == page
        mock_link_repo.create.assert_not_called()
|
|
|
|
|
|
class TestInjectInterlinks:
    """Checks the top-level inject_interlinks orchestration."""

    def test_empty_content_records(self, mock_project, mock_content_repo, mock_link_repo):
        """With no content records the call completes and nothing is updated."""
        inject_interlinks([], [], {}, mock_project, None, mock_content_repo, mock_link_repo)
        mock_content_repo.update.assert_not_called()

    def test_successful_injection(self, mock_content, mock_project, mock_content_repo, mock_link_repo):
        """One content record with a known URL produces exactly one update."""
        module = 'src.interlinking.content_injection'
        stub_html = "<p>Updated content</p>"
        url_rows = [
            {"content_id": 1, "title": "Article 1", "url": "https://example.com/article1.html"},
            {"content_id": 2, "title": "Article 2", "url": "https://example.com/article2.html"},
        ]
        link_plan = {"tier": 1, "money_site_url": "https://moneysite.com"}

        # The three injection steps are stubbed so only the orchestration
        # (and the repository update) is exercised.
        with patch(f'{module}._inject_tiered_links', return_value=stub_html):
            with patch(f'{module}._inject_homepage_link', return_value=stub_html):
                with patch(f'{module}._inject_see_also_section', return_value=stub_html):
                    inject_interlinks(
                        [mock_content],
                        url_rows,
                        link_plan,
                        mock_project,
                        None,
                        mock_content_repo,
                        mock_link_repo,
                    )

        mock_content_repo.update.assert_called_once()

    def test_missing_url_for_content(self, mock_content, mock_project, mock_content_repo, mock_link_repo):
        """A content record with no entry in article_urls is not updated."""
        mock_content.id = 1  # deliberately absent from url_rows below
        url_rows = [
            {"content_id": 2, "title": "Article 2", "url": "https://example.com/article2.html"},
        ]
        link_plan = {"tier": 1, "money_site_url": "https://moneysite.com"}

        inject_interlinks(
            [mock_content],
            url_rows,
            link_plan,
            mock_project,
            None,
            mock_content_repo,
            mock_link_repo,
        )

        mock_content_repo.update.assert_not_called()
|
|
|