"""
Test script to verify image reinsertion after interlink injection

Tests the new flow:
1. Get existing articles (2 T1, 2 T2) from project 30
2. Simulate interlink injection (already done, just read current content)
3. Re-insert images using _reinsert_images logic
4. Apply templates
5. Save formatted HTML locally to verify images display

Usage:
    uv run python scripts/test_image_reinsertion.py [project_id]

The optional project_id argument defaults to 30.
"""
|
|
|
|
import re
import sys
import traceback
from html import unescape
from pathlib import Path

# The project root must be on sys.path before the src.* imports below.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.database.session import db_manager
from src.database.repositories import GeneratedContentRepository, ProjectRepository, SiteDeploymentRepository
from src.generation.image_injection import insert_hero_after_h1, insert_content_images_after_h2s, generate_alt_text
from src.templating.service import TemplateService
|
|
|
|
|
|
# Compiled once at import time instead of re-importing `re` per article.
_IMG_TAG_RE = re.compile(r'<img[^>]*>')
_ANY_TAG_RE = re.compile(r'<[^>]+>')


def _reinsert_article_images(html, article, project):
    """Drop existing <img> tags from ``html`` and re-insert the article's stored images.

    Returns the updated HTML. When the article has neither a hero image URL
    nor content images, ``html`` is returned unchanged.
    """
    if not (article.hero_image_url or article.content_images):
        print(" No images to insert (hero_image_url and content_images both empty)")
        return html

    print(" Re-inserting images...")

    # Remove existing images first (to avoid duplicates)
    existing_count = html.count("<img")
    if existing_count > 0:
        print(f" Removing {existing_count} existing image(s)...")
        html = _IMG_TAG_RE.sub('', html)

    if article.hero_image_url:
        alt_text = generate_alt_text(project)
        html = insert_hero_after_h1(html, article.hero_image_url, alt_text)
        print(f" Hero image inserted: {article.hero_image_url}")
    else:
        print(" No hero image URL in database")

    if article.content_images:
        # One alt text per stored content image.
        alt_texts = [generate_alt_text(project) for _ in article.content_images]
        html = insert_content_images_after_h2s(html, article.content_images, alt_texts)
        print(f" {len(article.content_images)} content images inserted")

    return html


def _build_meta_description(html, max_words=25):
    """Return the first ``max_words`` words of ``html`` (tags stripped, entities unescaped) plus '...'."""
    text = unescape(_ANY_TAG_RE.sub('', html))
    return ' '.join(text.split()[:max_words]) + '...'


def _safe_title(title, max_len=50):
    """Keep only alphanumerics, spaces, '-' and '_' from ``title`` for use in a filename."""
    kept = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_'))
    return kept.rstrip()[:max_len]


def _render_and_save(html, article, template_service, site_repo, output_dir):
    """Apply the selected template to ``html``, write the result under ``output_dir``, and report image counts."""
    # Get template name from site or use default
    template_name = template_service.select_template_for_content(
        site_deployment_id=article.site_deployment_id,
        site_deployment_repo=site_repo,
    )

    formatted_html = template_service.format_content(
        content=html,
        title=article.title,
        meta_description=_build_meta_description(html),
        template_name=template_name,
        canonical_url=article.deployed_url,
    )
    print(f" Template '{template_name}' applied")

    filepath = output_dir / f"{article.tier}_{article.id}_{_safe_title(article.title)}.html"
    filepath.write_text(formatted_html, encoding='utf-8')
    print(f" Saved to: {filepath}")

    # Count occurrences of each stored image URL in the final HTML.
    hero_count = formatted_html.count(article.hero_image_url) if article.hero_image_url else 0
    content_count = sum(formatted_html.count(url) for url in (article.content_images or []))
    print(f" Image check: Hero={hero_count}, Content={content_count}")


def test_image_reinsertion(project_id: int = 30) -> None:
    """Re-insert stored images into four existing articles and save templated HTML.

    Loads the project, takes the first two "tier1" and first two "tier2"
    articles, strips any existing <img> tags from each article's content,
    re-inserts the hero/content images recorded on the article, applies the
    selected template, and writes one HTML file per article into
    ``test_output/`` (relative to the current working directory).

    Progress is printed to stdout. A failure while templating or saving one
    article is printed with its traceback and does not stop the remaining
    articles; the number of failed articles is reported at the end.

    Args:
        project_id: ID of the project whose articles are processed.

    Returns:
        None. Returns early (after printing a message) when the project is
        missing or fewer than two articles exist for either tier.
    """
    session = db_manager.get_session()
    try:
        content_repo = GeneratedContentRepository(session)
        project_repo = ProjectRepository(session)
        site_repo = SiteDeploymentRepository(session)

        project = project_repo.get_by_id(project_id)
        if not project:
            print(f"Project {project_id} not found")
            return

        # Get 2 T1 and 2 T2 articles
        t1_articles = content_repo.get_by_project_and_tier(project_id, "tier1", require_site=False)
        t2_articles = content_repo.get_by_project_and_tier(project_id, "tier2", require_site=False)

        if len(t1_articles) < 2:
            print(f"Not enough T1 articles (found {len(t1_articles)}, need 2)")
            return
        if len(t2_articles) < 2:
            print(f"Not enough T2 articles (found {len(t2_articles)}, need 2)")
            return

        test_articles = t1_articles[:2] + t2_articles[:2]

        print(f"\nTesting image reinsertion for project {project_id}: {project.name}")
        print(f"Selected {len(test_articles)} articles:")
        for article in test_articles:
            has_hero = article.hero_image_url or "None"
            has_content = f"{len(article.content_images) if article.content_images else 0} images"
            # Treat missing content as empty so the summary cannot crash on None.
            existing_imgs = (article.content or "").count("<img")
            print(f" - {article.tier}: {article.title[:50]}")
            print(f" Hero URL in DB: {has_hero}")
            print(f" Content images in DB: {has_content}")
            print(f" Existing <img> tags in content: {existing_imgs}")

        output_dir = Path("test_output")
        output_dir.mkdir(exist_ok=True)

        template_service = TemplateService()

        failures = 0
        for article in test_articles:
            print(f"\nProcessing: {article.title[:50]}...")

            # Step 1: Get current content (after interlink injection)
            html = article.content or ""
            print(f" Content length: {len(html)} chars")

            # Step 2: Re-insert images (simulating _reinsert_images)
            html = _reinsert_article_images(html, article, project)

            # Steps 3-4: Apply template and save to file
            print(" Applying template...")
            try:
                _render_and_save(html, article, template_service, site_repo, output_dir)
            except Exception as e:
                # Best-effort per article: report and continue with the rest.
                failures += 1
                print(f" ERROR applying template: {e}")
                traceback.print_exc()

        print(f"\n✓ Test complete! Check files in {output_dir}/")
        print(" Open the HTML files in a browser to verify images display correctly.")
        if failures:
            print(f" {failures} of {len(test_articles)} article(s) failed; see errors above.")
    finally:
        session.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Optional first CLI argument overrides the default project ID.
    cli_args = sys.argv[1:]
    project_id = 30
    if cli_args:
        raw_id = cli_args[0]
        try:
            project_id = int(raw_id)
        except ValueError:
            # A non-integer argument is reported and the default is kept.
            print(f"Invalid project_id: {raw_id}. Using default: 30")

    test_image_reinsertion(project_id)
|
|
|