# Big-Link-Man/tests/unit/test_content_injection.py
#
# 411 lines
# 16 KiB
# Python

"""
Unit tests for content injection module
"""
import pytest
from unittest.mock import Mock, MagicMock, patch
from src.interlinking.content_injection import (
inject_interlinks,
_inject_tiered_links,
_inject_homepage_link,
_inject_see_also_section,
_get_anchor_texts_for_tier,
_try_inject_link,
_find_and_wrap_anchor_text,
_insert_link_into_random_paragraph,
_extract_homepage_url,
_insert_before_closing_tags
)
from src.database.models import GeneratedContent, Project
@pytest.fixture
def mock_project():
    """Return a ``Mock`` restricted to the ``Project`` spec, pre-filled with sample keyword data."""
    fake_project = Mock(spec=Project)
    # configure_mock sets each attribute exactly as a plain assignment would.
    fake_project.configure_mock(
        id=1,
        main_keyword="shaft machining",
        related_searches=["cnc shaft machining", "precision shaft machining"],
        entities=["lathe", "milling", "CNC"],
    )
    return fake_project
@pytest.fixture
def mock_content():
    """Return a ``Mock`` restricted to the ``GeneratedContent`` spec describing one tier1 article."""
    fake_article = Mock(spec=GeneratedContent)
    # The body mentions "shaft machining" twice so anchor-matching tests have text to hit.
    fake_article.configure_mock(
        id=1,
        project_id=1,
        tier="tier1",
        title="Guide to Shaft Machining",
        content="<p>Shaft machining is an important process. Learn about shaft machining here.</p>",
    )
    return fake_article
@pytest.fixture
def mock_content_repo():
    """Return a stand-in content repository whose ``update`` is a ``Mock`` returning ``None``."""
    content_repo = Mock()
    update_stub = Mock(return_value=None)
    content_repo.update = update_stub
    return content_repo
@pytest.fixture
def mock_link_repo():
    """Return a stand-in link repository whose ``create`` is a ``Mock`` returning ``None``."""
    link_repo = Mock()
    create_stub = Mock(return_value=None)
    link_repo.create = create_stub
    return link_repo
class TestExtractHomepageUrl:
    """Expectations for ``_extract_homepage_url``: an article URL maps to its site root."""

    def test_extract_from_https_url(self):
        """An https article URL is expected to yield the https root with a trailing slash."""
        homepage = _extract_homepage_url("https://example.com/article-slug.html")
        assert homepage == "https://example.com/"

    def test_extract_from_http_url(self):
        """The http scheme is expected to be kept in the returned root."""
        homepage = _extract_homepage_url("http://example.com/article.html")
        assert homepage == "http://example.com/"

    def test_extract_from_cdn_url(self):
        """A CDN-style host is expected to be returned unchanged as the root."""
        homepage = _extract_homepage_url("https://site.b-cdn.net/my-article.html")
        assert homepage == "https://site.b-cdn.net/"

    def test_extract_from_custom_domain(self):
        """A nested path is expected to be dropped while the ``www`` host is kept."""
        homepage = _extract_homepage_url("https://www.custom.com/path/to/article.html")
        assert homepage == "https://www.custom.com/"

    def test_extract_with_port(self):
        """An explicit port is expected to survive in the returned root."""
        homepage = _extract_homepage_url("https://example.com:8080/article.html")
        assert homepage == "https://example.com:8080/"
class TestInsertBeforeClosingTags:
    """Expectations for ``_insert_before_closing_tags`` on several HTML shapes."""

    def test_insert_after_last_paragraph(self):
        """The new section must be present and must appear after the last paragraph's text."""
        section = "<h3>New Section</h3>"
        merged = _insert_before_closing_tags(
            "<p>First paragraph</p><p>Last paragraph</p>", section
        )
        assert section in merged
        assert merged.index("Last paragraph") < merged.index(section)

    def test_insert_with_body_tag(self):
        """The new section must be present when the HTML is wrapped in ``<body>``."""
        section = "<h3>See Also</h3>"
        merged = _insert_before_closing_tags("<body><p>Content</p></body>", section)
        assert section in merged

    def test_insert_with_no_paragraphs(self):
        """The new section must be present even when the HTML has no ``<p>`` elements."""
        section = "<h3>Section</h3>"
        merged = _insert_before_closing_tags("<div>Some content</div>", section)
        assert section in merged
class TestFindAndWrapAnchorText:
    """Expectations for ``_find_and_wrap_anchor_text``, which returns ``(html, found)``."""

    def test_find_exact_match(self):
        """An exact occurrence of the anchor is reported found and wrapped in a link."""
        target_url = "https://example.com"
        wrapped, was_found = _find_and_wrap_anchor_text(
            "<p>This is about shaft machining processes.</p>",
            "shaft machining",
            target_url,
        )
        assert was_found
        assert f'<a href="{target_url}">' in wrapped
        assert "shaft machining</a>" in wrapped

    def test_case_insensitive_match(self):
        """A differently-cased occurrence in the HTML still counts as a match."""
        target_url = "https://example.com"
        wrapped, was_found = _find_and_wrap_anchor_text(
            "<p>This is about Shaft Machining processes.</p>",
            "shaft machining",
            target_url,
        )
        assert was_found
        assert f'<a href="{target_url}">' in wrapped

    def test_match_within_phrase(self):
        """The anchor is matched when it sits in the middle of a longer phrase."""
        target_url = "https://example.com"
        wrapped, was_found = _find_and_wrap_anchor_text(
            "<p>The shaft machining process is complex.</p>",
            "shaft machining",
            target_url,
        )
        assert was_found
        assert f'<a href="{target_url}">' in wrapped

    def test_no_match(self):
        """With no occurrence, ``found`` is falsy and the HTML comes back untouched."""
        source_html = "<p>This is about something else.</p>"
        wrapped, was_found = _find_and_wrap_anchor_text(
            source_html, "shaft machining", "https://example.com"
        )
        assert not was_found
        assert wrapped == source_html

    def test_skip_existing_links(self):
        """Only one new link to the target URL is added when one occurrence is already linked."""
        target_url = "https://example.com"
        source_html = (
            '<p>Read about <a href="other.html">shaft machining</a> here. '
            'Also shaft machining is important.</p>'
        )
        wrapped, was_found = _find_and_wrap_anchor_text(
            source_html, "shaft machining", target_url
        )
        assert was_found
        # Intent: the second, unlinked occurrence gets wrapped rather than the
        # one already inside an <a>; the assertion only pins the link count.
        new_link_count = wrapped.count(f'<a href="{target_url}">')
        assert new_link_count == 1
class TestInsertLinkIntoRandomParagraph:
    """Expectations for ``_insert_link_into_random_paragraph``."""

    def test_insert_into_paragraph(self):
        """A single long paragraph must end up containing the full anchor element."""
        anchor, url = "shaft machining", "https://example.com"
        updated = _insert_link_into_random_paragraph(
            "<p>This is a long paragraph with many words and sentences. It has enough content.</p>",
            anchor,
            url,
        )
        assert f'<a href="{url}">{anchor}</a>' in updated

    def test_insert_with_multiple_paragraphs(self):
        """With several paragraphs, the anchor element must appear somewhere in the output."""
        anchor, url = "test link", "https://example.com"
        updated = _insert_link_into_random_paragraph(
            "<p>First paragraph.</p><p>Second paragraph with more text.</p><p>Third paragraph.</p>",
            anchor,
            url,
        )
        assert f'<a href="{url}">{anchor}</a>' in updated

    def test_no_valid_paragraphs(self):
        """Very short paragraphs: either the HTML is unchanged or a link was still inserted."""
        source_html = "<p>Hi</p><p>Ok</p>"
        url = "https://example.com"
        updated = _insert_link_into_random_paragraph(source_html, "test", url)
        # Deliberately permissive: both outcomes are accepted by this test.
        left_untouched = updated == source_html
        assert left_untouched or f'<a href="{url}">' in updated
class TestGetAnchorTextsForTier:
    """Expectations for ``_get_anchor_texts_for_tier`` under each ``anchor_text_config`` mode."""

    def test_default_mode(self, mock_project):
        """Mode ``default`` returns whatever the patched tier lookup provides."""
        config = {"anchor_text_config": {"mode": "default"}}
        with patch(
            'src.interlinking.content_injection.get_anchor_text_for_tier',
            return_value=["anchor1", "anchor2"],
        ):
            anchors = _get_anchor_texts_for_tier("tier1", mock_project, config)
        assert anchors == ["anchor1", "anchor2"]

    def test_override_mode(self, mock_project):
        """Mode ``override`` returns exactly the custom list from the job config."""
        custom_anchors = ["custom anchor 1", "custom anchor 2"]
        config = {
            "anchor_text_config": {"mode": "override", "custom_text": custom_anchors}
        }
        anchors = _get_anchor_texts_for_tier("tier1", mock_project, config)
        assert anchors == custom_anchors

    def test_append_mode(self, mock_project):
        """Mode ``append`` returns the defaults followed by the custom entries."""
        config = {
            "anchor_text_config": {"mode": "append", "custom_text": ["custom anchor"]}
        }
        with patch(
            'src.interlinking.content_injection.get_anchor_text_for_tier',
            return_value=["default1", "default2"],
        ):
            anchors = _get_anchor_texts_for_tier("tier1", mock_project, config)
        assert anchors == ["default1", "default2", "custom anchor"]

    def test_no_config(self, mock_project):
        """A ``None`` job config falls back to the patched tier lookup's result."""
        with patch(
            'src.interlinking.content_injection.get_anchor_text_for_tier',
            return_value=["default"],
        ):
            anchors = _get_anchor_texts_for_tier("tier1", mock_project, None)
        assert anchors == ["default"]
class TestTryInjectLink:
    """Expectations for ``_try_inject_link``, which returns ``(html, injected)``."""

    def test_inject_with_found_anchor(self):
        """When a candidate anchor occurs in the text, a link to the URL is injected."""
        url = "https://example.com"
        candidates = ["shaft machining", "other anchor"]
        updated, did_inject = _try_inject_link(
            "<p>This is about shaft machining here.</p>", candidates, url
        )
        assert did_inject
        assert f'<a href="{url}">' in updated

    def test_inject_with_fallback(self):
        """When no candidate occurs in the text, a link to the URL is still injected."""
        url = "https://example.com"
        updated, did_inject = _try_inject_link(
            "<p>This is a paragraph about something else entirely.</p>",
            ["shaft machining"],
            url,
        )
        assert did_inject
        assert f'<a href="{url}">' in updated

    def test_no_anchors(self):
        """An empty candidate list injects nothing and returns the HTML unchanged."""
        source_html = "<p>Content</p>"
        updated, did_inject = _try_inject_link(source_html, [], "https://example.com")
        assert not did_inject
        assert updated == source_html
class TestInjectSeeAlsoSection:
    """Expectations for ``_inject_see_also_section``."""

    def test_inject_see_also_with_multiple_articles(self, mock_content, mock_link_repo):
        """Three articles, one of them current: two list entries and two link records."""
        # Produces content_id 1..3 with titles "Article N" and matching URLs.
        siblings = [
            {
                "content_id": n,
                "title": f"Article {n}",
                "url": f"https://example.com/article{n}.html",
            }
            for n in (1, 2, 3)
        ]
        mock_content.id = 1
        rendered = _inject_see_also_section(
            "<p>Article content here.</p>", mock_content, siblings, mock_link_repo
        )
        for expected_fragment in ("<h3>See Also</h3>", "<ul>", "Article 2", "Article 3"):
            assert expected_fragment in rendered
        assert "Article 1" not in rendered  # Current article excluded
        assert mock_link_repo.create.call_count == 2

    def test_inject_see_also_with_single_article(self, mock_content, mock_link_repo):
        """When the only article is the current one, no See Also heading is expected."""
        source_html = "<p>Content</p>"
        only_self = [
            {"content_id": 1, "title": "Only Article", "url": "https://example.com/article.html"}
        ]
        mock_content.id = 1
        rendered = _inject_see_also_section(
            source_html, mock_content, only_self, mock_link_repo
        )
        # No other articles, should return original HTML
        unchanged = rendered == source_html
        assert unchanged or "<h3>See Also</h3>" not in rendered
class TestInjectHomepageLink:
    """Expectations for ``_inject_homepage_link``."""

    def test_inject_homepage_link(self, mock_content, mock_project, mock_link_repo):
        """Text containing "Home" gets an ``index.html`` link and one 'homepage' link record."""
        rendered = _inject_homepage_link(
            "<p>This is about content and going Home is great.</p>",
            mock_content,
            "https://example.com/article.html",
            mock_project,
            mock_link_repo,
        )
        assert '<a href="https://example.com/index.html">' in rendered
        assert 'Home</a>' in rendered
        mock_link_repo.create.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _positional, keywords = mock_link_repo.create.call_args
        assert keywords['link_type'] == 'homepage'

    def test_inject_homepage_link_not_found_in_content(self, mock_content, mock_project, mock_link_repo):
        """Text without "Home" is still expected to receive a "Home" link to ``index.html``."""
        rendered = _inject_homepage_link(
            "<p>This is about something totally different and unrelated content here.</p>",
            mock_content,
            "https://www.example.com/article.html",
            mock_project,
            mock_link_repo,
        )
        # Should still inject via fallback (using "Home" anchor text)
        assert '<a href="https://www.example.com/index.html">' in rendered
        assert 'Home</a>' in rendered
class TestInjectTieredLinks:
    """Expectations for ``_inject_tiered_links``."""

    def test_tier1_money_site_link(self, mock_content, mock_project, mock_link_repo):
        """Tier 1 with a money-site URL: the link appears and one 'tiered' record is created."""
        link_plan = {"tier": 1, "money_site_url": "https://moneysite.com"}
        with patch(
            'src.interlinking.content_injection.get_anchor_text_for_tier',
            return_value=["shaft machining", "machining"],
        ):
            rendered = _inject_tiered_links(
                "<p>Learn about shaft machining processes.</p>",
                mock_content,
                link_plan,
                mock_project,
                None,
                mock_link_repo,
            )
        assert '<a href="https://moneysite.com">' in rendered
        mock_link_repo.create.assert_called_once()
        # call_args unpacks to (positional args, keyword args).
        _positional, keywords = mock_link_repo.create.call_args
        assert keywords['link_type'] == 'tiered'
        assert keywords['to_url'] == 'https://moneysite.com'

    def test_tier2_lower_tier_links(self, mock_content, mock_project, mock_link_repo):
        """Tier 2 with two lower-tier URLs: one link record is expected per URL."""
        mock_content.tier = "tier2"
        lower_urls = [
            "https://site1.com/article1.html",
            "https://site2.com/article2.html",
        ]
        link_plan = {"tier": 2, "lower_tier": 1, "lower_tier_urls": lower_urls}
        with patch(
            'src.interlinking.content_injection.get_anchor_text_for_tier',
            return_value=["shaft machining", "CNC processes"],
        ):
            # Only the repository side effect is checked; the returned HTML is not inspected.
            _inject_tiered_links(
                "<p>This article discusses shaft machining and CNC processes and precision work.</p>",
                mock_content,
                link_plan,
                mock_project,
                None,
                mock_link_repo,
            )
        # Should create links for both URLs
        assert mock_link_repo.create.call_count == 2

    def test_tier1_no_money_site(self, mock_content, mock_project, mock_link_repo):
        """Tier 1 without a money-site URL: HTML is returned unchanged and nothing is recorded."""
        source_html = "<p>Content</p>"
        rendered = _inject_tiered_links(
            source_html, mock_content, {"tier": 1}, mock_project, None, mock_link_repo
        )
        # Should return original HTML with warning
        assert rendered == source_html
        mock_link_repo.create.assert_not_called()
class TestInjectInterlinks:
    """Expectations for the top-level ``inject_interlinks`` entry point."""

    def test_empty_content_records(self, mock_project, mock_content_repo, mock_link_repo):
        """With no content records the call must complete without updating anything."""
        inject_interlinks([], [], {}, mock_project, None, mock_content_repo, mock_link_repo)
        # Should not crash, just log warning
        mock_content_repo.update.assert_not_called()

    def test_successful_injection(self, mock_content, mock_project, mock_content_repo, mock_link_repo):
        """With all three injection helpers stubbed, the content repo is updated exactly once."""
        published = [
            {"content_id": 1, "title": "Article 1", "url": "https://example.com/article1.html"},
            {"content_id": 2, "title": "Article 2", "url": "https://example.com/article2.html"},
        ]
        link_plan = {"tier": 1, "money_site_url": "https://moneysite.com"}
        stubbed_html = "<p>Updated content</p>"
        # Each helper is replaced by a stub returning the same HTML, isolating the orchestration.
        with patch('src.interlinking.content_injection._inject_tiered_links', return_value=stubbed_html), \
                patch('src.interlinking.content_injection._inject_homepage_link', return_value=stubbed_html), \
                patch('src.interlinking.content_injection._inject_see_also_section', return_value=stubbed_html):
            inject_interlinks(
                [mock_content],
                published,
                link_plan,
                mock_project,
                None,
                mock_content_repo,
                mock_link_repo,
            )
        mock_content_repo.update.assert_called_once()

    def test_missing_url_for_content(self, mock_content, mock_project, mock_content_repo, mock_link_repo):
        """Content whose id has no entry in the URL list must not trigger an update."""
        published = [
            {"content_id": 2, "title": "Article 2", "url": "https://example.com/article2.html"}
        ]
        link_plan = {"tier": 1, "money_site_url": "https://moneysite.com"}
        mock_content.id = 1  # ID not in article_urls
        inject_interlinks(
            [mock_content],
            published,
            link_plan,
            mock_project,
            None,
            mock_content_repo,
            mock_link_repo,
        )
        # Should skip this content
        mock_content_repo.update.assert_not_called()