Fix: Assign sites before generating images
- Move site assignment to occur immediately after article creation
- Generate images after site assignment so they can be uploaded
- Add assign_site_to_single_article() helper function
- Fixes issue where images were generated with site_deployment_id=None
main
parent
be03594fc7
commit
62074cd995
|
|
@ -17,7 +17,7 @@ from src.database.repositories import GeneratedContentRepository, ProjectReposit
|
|||
from src.generation.url_generator import generate_urls_for_batch
|
||||
from src.interlinking.tiered_links import find_tiered_links
|
||||
from src.interlinking.content_injection import inject_interlinks
|
||||
from src.generation.site_assignment import assign_sites_to_batch
|
||||
from src.generation.site_assignment import assign_sites_to_batch, assign_site_to_single_article
|
||||
from src.deployment.bunny_storage import BunnyStorageClient
|
||||
from src.deployment.deployment_service import DeploymentService
|
||||
from src.deployment.url_logger import URLLogger
|
||||
|
|
@ -265,6 +265,8 @@ class BatchProcessor:
|
|||
'title': titles[article_index],
|
||||
'keyword': keyword,
|
||||
'resolved_targets': targets_for_tier,
|
||||
'job': job,
|
||||
'project_keyword': keyword,
|
||||
'debug': debug,
|
||||
'models': models
|
||||
})
|
||||
|
|
@ -292,6 +294,8 @@ class BatchProcessor:
|
|||
title: str,
|
||||
keyword: str,
|
||||
resolved_targets: Dict[str, int],
|
||||
job: Job,
|
||||
project_keyword: str,
|
||||
debug: bool,
|
||||
models = None
|
||||
):
|
||||
|
|
@ -357,17 +361,7 @@ class BatchProcessor:
|
|||
status = "augmented"
|
||||
self.stats["augmented_articles"] += 1
|
||||
|
||||
# Generate and insert images
|
||||
content, hero_url, content_image_urls = self._generate_and_insert_images(
|
||||
project_id=project_id,
|
||||
tier_name=tier_name,
|
||||
tier_config=tier_config,
|
||||
title=title,
|
||||
content=content,
|
||||
site_deployment_id=site_deployment_id,
|
||||
prefix=prefix
|
||||
)
|
||||
|
||||
# Create minimal article record first so we can assign a site
|
||||
saved_content = self.content_repo.create(
|
||||
project_id=project_id,
|
||||
tier=tier_name,
|
||||
|
|
@ -378,10 +372,44 @@ class BatchProcessor:
|
|||
word_count=word_count,
|
||||
status=status,
|
||||
site_deployment_id=site_deployment_id,
|
||||
hero_image_url=hero_url,
|
||||
content_images=content_image_urls if content_image_urls else None
|
||||
hero_image_url=None,
|
||||
content_images=None
|
||||
)
|
||||
|
||||
# Assign site if not explicitly assigned
|
||||
if not site_deployment_id and self.site_deployment_repo:
|
||||
assigned_site = assign_site_to_single_article(
|
||||
content=saved_content,
|
||||
job=job,
|
||||
site_repo=self.site_deployment_repo,
|
||||
content_repo=self.content_repo,
|
||||
project_keyword=project_keyword
|
||||
)
|
||||
if assigned_site:
|
||||
site_deployment_id = assigned_site.id
|
||||
hostname = assigned_site.custom_hostname or assigned_site.pull_zone_bcdn_hostname
|
||||
click.echo(f"{prefix} Assigned to site: {hostname} (ID: {site_deployment_id})")
|
||||
# Update the article with the assigned site
|
||||
saved_content.site_deployment_id = site_deployment_id
|
||||
self.content_repo.session.add(saved_content)
|
||||
self.content_repo.session.commit()
|
||||
|
||||
# Generate images (now with assigned site_deployment_id)
|
||||
hero_url, content_image_urls = self._generate_images_only(
|
||||
project_id=project_id,
|
||||
tier_name=tier_name,
|
||||
tier_config=tier_config,
|
||||
title=title,
|
||||
site_deployment_id=site_deployment_id,
|
||||
prefix=prefix
|
||||
)
|
||||
|
||||
# Update article with image URLs
|
||||
saved_content.hero_image_url = hero_url
|
||||
saved_content.content_images = content_image_urls if content_image_urls else None
|
||||
self.content_repo.session.add(saved_content)
|
||||
self.content_repo.session.commit()
|
||||
|
||||
click.echo(f"{prefix} Saved (ID: {saved_content.id}, Status: {status})")
|
||||
|
||||
def _generate_and_insert_images(
|
||||
|
|
@ -397,9 +425,14 @@ class BatchProcessor:
|
|||
"""
|
||||
Generate images and insert into HTML content
|
||||
|
||||
WARNING: This method inserts images before interlink injection, which may cause
|
||||
images to be lost during BeautifulSoup parsing. Consider using _generate_images_only()
|
||||
and _reinsert_images() instead.
|
||||
|
||||
Note: image_config is always created by job config parser (with defaults if not in JSON).
|
||||
Defaults: hero images for all tiers (1280x720), content images for T1 only (1-3 images).
|
||||
"""
|
||||
click.echo(f"{prefix} WARNING: DO YOU REALLY WANT TO GEN AND INSERT THE IMAGE? This may cause images to be lost during interlink injection!")
|
||||
if not tier_config.image_config:
|
||||
return content, None, []
|
||||
|
||||
|
|
@ -499,6 +532,144 @@ class BatchProcessor:
|
|||
|
||||
return content, hero_url, content_image_urls
|
||||
|
||||
def _generate_images_only(
    self,
    project_id: int,
    tier_name: str,
    tier_config: TierConfig,
    title: str,
    site_deployment_id: Optional[int],
    prefix: str
) -> tuple[Optional[str], List[str]]:
    """
    Generate images and upload them to storage, but don't insert into HTML.

    Note: image_config is always created by job config parser (with defaults if not in JSON).
    Defaults: hero images for all tiers (1280x720), content images for T1 only (1-3 images).

    Args:
        project_id: ID of the project the article belongs to.
        tier_name: Tier label of the article (not used by this method).
        tier_config: Tier configuration; its ``image_config`` controls which
            images are produced and at what size.
        title: Article title, passed to the hero image generator.
        site_deployment_id: ID of the site the images are uploaded to.
        prefix: Prefix prepended to every progress message.

    Returns:
        ``(hero_url, content_image_urls)`` for later insertion. ``hero_url``
        is None and the list is empty when nothing was uploaded.
    """
    image_config = tier_config.image_config
    if not image_config:
        return None, []

    project = self.project_repo.get_by_id(project_id)
    if not project:
        return None, []

    # Resolve the upload target once. Without a site nothing can be uploaded
    # and the result is always (None, []), so bail out before paying for any
    # image generation call.
    site = None
    if site_deployment_id and self.site_deployment_repo:
        try:
            site = self.site_deployment_repo.get_by_id(site_deployment_id)
        except Exception as e:
            click.echo(f"{prefix} Site lookup failed: {e}")
    if not site:
        click.echo(f"{prefix} Skipping image generation (no site to upload images to)")
        return None, []

    # Initialize image generator
    image_generator = ImageGenerator(
        ai_client=self.generator.ai_client,
        prompt_manager=self.generator.prompt_manager,
        project_repo=self.project_repo
    )

    storage_client = BunnyStorageClient()
    hero_url = None
    content_image_urls = []

    # Generate hero image (all tiers if enabled)
    hero_config = image_config.hero
    if hero_config:
        # Best effort: a failed hero image must not block the article.
        try:
            click.echo(f"{prefix} Generating hero image...")
            hero_image = image_generator.generate_hero_image(
                project_id=project_id,
                title=title,
                width=hero_config.width,
                height=hero_config.height
            )

            if hero_image:
                main_keyword_slug = slugify(project.main_keyword)
                file_path = f"images/{main_keyword_slug}.jpg"
                hero_url = upload_image_to_storage(storage_client, site, hero_image, file_path)
                if hero_url:
                    click.echo(f"{prefix} Hero image uploaded: {hero_url}")
                else:
                    click.echo(f"{prefix} Hero image upload failed")
        except Exception as e:
            click.echo(f"{prefix} Hero image generation failed: {e}")

    # Generate content images (T1 only, if enabled)
    content_config = image_config.content
    if content_config and content_config.max_num_images > 0:
        try:
            num_images = random.randint(
                content_config.min_num_images,
                content_config.max_num_images
            )

            if num_images > 0:
                click.echo(f"{prefix} Generating {num_images} content image(s)...")

                entities = project.entities or []
                related_searches = project.related_searches or []

                if not entities or not related_searches:
                    click.echo(f"{prefix} Skipping content images (no entities/related_searches)")
                else:
                    for i in range(num_images):
                        # Each image is independent: one failure only loses that image.
                        try:
                            entity = random.choice(entities)
                            related_search = random.choice(related_searches)

                            content_image = image_generator.generate_content_image(
                                project_id=project_id,
                                entity=entity,
                                related_search=related_search,
                                width=content_config.width,
                                height=content_config.height
                            )
                            if not content_image:
                                continue

                            main_keyword_slug = slugify(project.main_keyword)
                            entity_slug = slugify(entity)
                            related_slug = slugify(related_search)
                            file_path = f"images/{main_keyword_slug}-{entity_slug}-{related_slug}.jpg"
                            img_url = upload_image_to_storage(storage_client, site, content_image, file_path)
                            if img_url:
                                content_image_urls.append(img_url)
                                click.echo(f"{prefix} Content image {i+1}/{num_images} uploaded: {img_url}")
                            else:
                                click.echo(f"{prefix} Content image {i+1}/{num_images} upload failed")
                        except Exception as e:
                            click.echo(f"{prefix} Content image {i+1} generation failed: {e}")
        except Exception as e:
            click.echo(f"{prefix} Content image generation failed: {e}")

    return hero_url, content_image_urls
|
||||
|
||||
def _reinsert_images(
|
||||
self,
|
||||
content_records: List,
|
||||
project
|
||||
) -> None:
|
||||
"""Re-insert images into content after interlink injection"""
|
||||
import re
|
||||
|
||||
for content in content_records:
|
||||
if not content.hero_image_url and not content.content_images:
|
||||
continue
|
||||
|
||||
html = content.content
|
||||
|
||||
# Remove existing images first (to avoid duplicates)
|
||||
# Remove all img tags
|
||||
html = re.sub(r'<img[^>]*>', '', html)
|
||||
|
||||
# Insert hero image if exists
|
||||
if content.hero_image_url:
|
||||
alt_text = generate_alt_text(project)
|
||||
html = insert_hero_after_h1(html, content.hero_image_url, alt_text)
|
||||
|
||||
# Insert content images if exist
|
||||
if content.content_images:
|
||||
alt_texts = [generate_alt_text(project) for _ in content.content_images]
|
||||
html = insert_content_images_after_h2s(html, content.content_images, alt_texts)
|
||||
|
||||
# Update content
|
||||
content.content = html
|
||||
self.content_repo.update(content)
|
||||
|
||||
def _process_articles_concurrent(
|
||||
self,
|
||||
article_tasks: List[Dict[str, Any]],
|
||||
|
|
@ -590,6 +761,8 @@ class BatchProcessor:
|
|||
title: str,
|
||||
keyword: str,
|
||||
resolved_targets: Dict[str, int],
|
||||
job: Job,
|
||||
project_keyword: str,
|
||||
debug: bool,
|
||||
models = None
|
||||
):
|
||||
|
|
@ -680,17 +853,7 @@ class BatchProcessor:
|
|||
with self.stats_lock:
|
||||
self.stats["augmented_articles"] += 1
|
||||
|
||||
# Generate and insert images
|
||||
content, hero_url, content_image_urls = self._generate_and_insert_images(
|
||||
project_id=project_id,
|
||||
tier_name=tier_name,
|
||||
tier_config=tier_config,
|
||||
title=title,
|
||||
content=content,
|
||||
site_deployment_id=site_deployment_id,
|
||||
prefix=prefix
|
||||
)
|
||||
|
||||
# Create article first so we can assign a site
|
||||
saved_content = thread_content_repo.create(
|
||||
project_id=project_id,
|
||||
tier=tier_name,
|
||||
|
|
@ -701,10 +864,121 @@ class BatchProcessor:
|
|||
word_count=word_count,
|
||||
status=status,
|
||||
site_deployment_id=site_deployment_id,
|
||||
hero_image_url=hero_url,
|
||||
content_images=content_image_urls if content_image_urls else None
|
||||
hero_image_url=None,
|
||||
content_images=None
|
||||
)
|
||||
|
||||
# Assign site if not explicitly assigned
|
||||
if not site_deployment_id:
|
||||
from src.database.repositories import SiteDeploymentRepository
|
||||
thread_site_repo = SiteDeploymentRepository(thread_session)
|
||||
assigned_site = assign_site_to_single_article(
|
||||
content=saved_content,
|
||||
job=job,
|
||||
site_repo=thread_site_repo,
|
||||
content_repo=thread_content_repo,
|
||||
project_keyword=project_keyword
|
||||
)
|
||||
if assigned_site:
|
||||
site_deployment_id = assigned_site.id
|
||||
hostname = assigned_site.custom_hostname or assigned_site.pull_zone_bcdn_hostname
|
||||
click.echo(f"{prefix} Assigned to site: {hostname} (ID: {site_deployment_id})")
|
||||
# Update the article with the assigned site
|
||||
saved_content.site_deployment_id = site_deployment_id
|
||||
thread_session.add(saved_content)
|
||||
thread_session.commit()
|
||||
|
||||
# Generate images (now with assigned site_deployment_id)
|
||||
from src.generation.image_generator import ImageGenerator
|
||||
from src.generation.image_upload import upload_image_to_storage
|
||||
from src.deployment.bunny_storage import BunnyStorageClient
|
||||
|
||||
thread_image_generator = ImageGenerator(
|
||||
ai_client=thread_generator.ai_client,
|
||||
prompt_manager=thread_generator.prompt_manager,
|
||||
project_repo=thread_project_repo
|
||||
)
|
||||
|
||||
hero_url = None
|
||||
content_image_urls = []
|
||||
|
||||
if tier_config.image_config:
|
||||
project = thread_project_repo.get_by_id(project_id)
|
||||
if project:
|
||||
storage_client = BunnyStorageClient()
|
||||
from src.database.repositories import SiteDeploymentRepository
|
||||
thread_site_repo = SiteDeploymentRepository(thread_session)
|
||||
|
||||
# Generate hero image
|
||||
if tier_config.image_config.hero:
|
||||
try:
|
||||
click.echo(f"{prefix} Generating hero image...")
|
||||
hero_image = thread_image_generator.generate_hero_image(
|
||||
project_id=project_id,
|
||||
title=title,
|
||||
width=tier_config.image_config.hero.width,
|
||||
height=tier_config.image_config.hero.height
|
||||
)
|
||||
|
||||
if hero_image and site_deployment_id:
|
||||
site = thread_site_repo.get_by_id(site_deployment_id)
|
||||
if site:
|
||||
main_keyword_slug = slugify(project.main_keyword)
|
||||
file_path = f"images/{main_keyword_slug}.jpg"
|
||||
hero_url = upload_image_to_storage(storage_client, site, hero_image, file_path)
|
||||
if hero_url:
|
||||
click.echo(f"{prefix} Hero image uploaded: {hero_url}")
|
||||
except Exception as e:
|
||||
click.echo(f"{prefix} Hero image generation failed: {e}")
|
||||
|
||||
# Generate content images
|
||||
if tier_config.image_config.content and tier_config.image_config.content.max_num_images > 0:
|
||||
try:
|
||||
num_images = random.randint(
|
||||
tier_config.image_config.content.min_num_images,
|
||||
tier_config.image_config.content.max_num_images
|
||||
)
|
||||
|
||||
if num_images > 0:
|
||||
click.echo(f"{prefix} Generating {num_images} content image(s)...")
|
||||
|
||||
entities = project.entities or []
|
||||
related_searches = project.related_searches or []
|
||||
|
||||
if entities and related_searches:
|
||||
for i in range(num_images):
|
||||
try:
|
||||
entity = random.choice(entities)
|
||||
related_search = random.choice(related_searches)
|
||||
|
||||
content_image = thread_image_generator.generate_content_image(
|
||||
project_id=project_id,
|
||||
entity=entity,
|
||||
related_search=related_search,
|
||||
width=tier_config.image_config.content.width,
|
||||
height=tier_config.image_config.content.height
|
||||
)
|
||||
|
||||
if content_image and site_deployment_id:
|
||||
site = thread_site_repo.get_by_id(site_deployment_id)
|
||||
if site:
|
||||
main_keyword_slug = slugify(project.main_keyword)
|
||||
entity_slug = slugify(entity)
|
||||
related_slug = slugify(related_search)
|
||||
file_path = f"images/{main_keyword_slug}-{entity_slug}-{related_slug}.jpg"
|
||||
img_url = upload_image_to_storage(storage_client, site, content_image, file_path)
|
||||
if img_url:
|
||||
content_image_urls.append(img_url)
|
||||
click.echo(f"{prefix} Content image {i+1}/{num_images} uploaded: {img_url}")
|
||||
except Exception as e:
|
||||
click.echo(f"{prefix} Content image {i+1} generation failed: {e}")
|
||||
except Exception as e:
|
||||
click.echo(f"{prefix} Content image generation failed: {e}")
|
||||
|
||||
# Update article with image URLs
|
||||
saved_content.hero_image_url = hero_url
|
||||
saved_content.content_images = content_image_urls if content_image_urls else None
|
||||
thread_session.add(saved_content)
|
||||
thread_session.commit()
|
||||
click.echo(f"{prefix} Saved (ID: {saved_content.id}, Status: {status})")
|
||||
|
||||
|
|
@ -825,6 +1099,16 @@ class BatchProcessor:
|
|||
)
|
||||
click.echo(f" Interlinks injected successfully")
|
||||
|
||||
# Step 3.5: Re-insert images after interlink injection
|
||||
click.echo(f" Re-inserting images...")
|
||||
self._reinsert_images(content_records, project)
|
||||
click.echo(f" Images re-inserted successfully")
|
||||
|
||||
# Refresh content records to ensure we have latest content with images
|
||||
self.content_repo.session.expire_all()
|
||||
for content in content_records:
|
||||
self.content_repo.session.refresh(content)
|
||||
|
||||
# Step 4: Apply templates
|
||||
click.echo(f" Applying templates...")
|
||||
url_map = {url_info["content_id"]: url_info["url"] for url_info in article_urls}
|
||||
|
|
@ -836,6 +1120,8 @@ class BatchProcessor:
|
|||
template_count += 1
|
||||
except Exception as e:
|
||||
click.echo(f" Warning: Failed to apply template to content {content.id}: {e}")
|
||||
import traceback
|
||||
click.echo(f" Traceback: {traceback.format_exc()}")
|
||||
|
||||
click.echo(f" Applied templates to {template_count}/{len(content_records)} articles")
|
||||
click.echo(f" {tier_name}: Post-processing complete")
|
||||
|
|
|
|||
|
|
@ -44,6 +44,89 @@ def _get_keyword_sites(
|
|||
return matching
|
||||
|
||||
|
||||
def assign_site_to_single_article(
    content: GeneratedContent,
    job: Job,
    site_repo: SiteDeploymentRepository,
    content_repo,
    project_keyword: str
) -> Optional[SiteDeployment]:
    """
    Assign a site to a single article if it doesn't already have one.

    Uses the same priority logic as assign_sites_to_batch:
    - Tier1: preferred sites → keyword sites → random
    - Tier2+: keyword sites → random

    Sites already used by another article of the same project and tier are
    never chosen.

    Args:
        content: GeneratedContent record to assign site to
        job: Job configuration with site assignment settings
        site_repo: SiteDeploymentRepository for querying sites
        content_repo: GeneratedContentRepository to query already-assigned
            sites; its session is used to persist the assignment
        project_keyword: Main keyword from project (currently unused; keyword
            matching is done against ``content.keyword``)

    Returns:
        Assigned SiteDeployment if successful, None if already assigned or no sites available
    """
    if content.site_deployment_id:
        return None

    all_sites = site_repo.get_all()
    if not all_sites:
        logger.warning(f"No sites available for content_id={content.id}")
        return None

    tier_articles = content_repo.get_by_project_and_tier(
        content.project_id, content.tier, require_site=False
    )
    used_site_ids: Set[int] = {
        a.site_deployment_id for a in tier_articles if a.site_deployment_id
    }

    available_pool = [s for s in all_sites if s.id not in used_site_ids]
    if not available_pool:
        logger.warning(f"No available sites for content_id={content.id} (all sites already used in this tier)")
        return None

    assigned_site = None

    # Priority 1 (tier1 only): first preferred hostname that resolves to an unused site.
    if content.tier.lower() == "tier1" and job.tier1_preferred_sites:
        for hostname in job.tier1_preferred_sites:
            site = site_repo.get_by_hostname(hostname) or site_repo.get_by_bcdn_hostname(hostname)
            if site and site.id not in used_site_ids:
                assigned_site = site
                logger.info(f"Assigned content_id={content.id} to preferred site: {site.custom_hostname or site.pull_zone_bcdn_hostname}")
                break

    # Priority 2: first unused site matching the article keyword.
    if not assigned_site and content.keyword:
        keyword_matches = _get_keyword_sites(available_pool, content.keyword)
        assigned_site = next(
            (s for s in keyword_matches if s.id not in used_site_ids), None
        )
        if assigned_site:
            logger.info(f"Assigned content_id={content.id} to keyword site: {assigned_site.site_name}")

    # Priority 3: any unused site. available_pool is non-empty and already
    # excludes used sites, so no further filtering is needed.
    if not assigned_site:
        assigned_site = random.choice(available_pool)
        logger.info(f"Assigned content_id={content.id} to random site: {assigned_site.custom_hostname or assigned_site.pull_zone_bcdn_hostname}")

    # Persist through the content repository's session: the record belongs to
    # it, and adding it to a different session (site_repo's) fails when the
    # two repositories do not share one.
    content.site_deployment_id = assigned_site.id
    content_repo.session.add(content)
    content_repo.session.commit()

    return assigned_site
|
||||
|
||||
|
||||
def assign_sites_to_batch(
|
||||
content_records: List[GeneratedContent],
|
||||
job: Job,
|
||||
|
|
|
|||
Loading…
Reference in New Issue