diff --git a/src/generation/batch_processor.py b/src/generation/batch_processor.py index 2c72878..4a6be14 100644 --- a/src/generation/batch_processor.py +++ b/src/generation/batch_processor.py @@ -17,7 +17,7 @@ from src.database.repositories import GeneratedContentRepository, ProjectReposit from src.generation.url_generator import generate_urls_for_batch from src.interlinking.tiered_links import find_tiered_links from src.interlinking.content_injection import inject_interlinks -from src.generation.site_assignment import assign_sites_to_batch +from src.generation.site_assignment import assign_sites_to_batch, assign_site_to_single_article from src.deployment.bunny_storage import BunnyStorageClient from src.deployment.deployment_service import DeploymentService from src.deployment.url_logger import URLLogger @@ -265,6 +265,8 @@ class BatchProcessor: 'title': titles[article_index], 'keyword': keyword, 'resolved_targets': targets_for_tier, + 'job': job, + 'project_keyword': keyword, 'debug': debug, 'models': models }) @@ -292,6 +294,8 @@ class BatchProcessor: title: str, keyword: str, resolved_targets: Dict[str, int], + job: Job, + project_keyword: str, debug: bool, models = None ): @@ -357,17 +361,7 @@ class BatchProcessor: status = "augmented" self.stats["augmented_articles"] += 1 - # Generate and insert images - content, hero_url, content_image_urls = self._generate_and_insert_images( - project_id=project_id, - tier_name=tier_name, - tier_config=tier_config, - title=title, - content=content, - site_deployment_id=site_deployment_id, - prefix=prefix - ) - + # Create minimal article record first so we can assign a site saved_content = self.content_repo.create( project_id=project_id, tier=tier_name, @@ -378,10 +372,44 @@ class BatchProcessor: word_count=word_count, status=status, site_deployment_id=site_deployment_id, - hero_image_url=hero_url, - content_images=content_image_urls if content_image_urls else None + hero_image_url=None, + content_images=None ) + # Assign 
site if not explicitly assigned + if not site_deployment_id and self.site_deployment_repo: + assigned_site = assign_site_to_single_article( + content=saved_content, + job=job, + site_repo=self.site_deployment_repo, + content_repo=self.content_repo, + project_keyword=project_keyword + ) + if assigned_site: + site_deployment_id = assigned_site.id + hostname = assigned_site.custom_hostname or assigned_site.pull_zone_bcdn_hostname + click.echo(f"{prefix} Assigned to site: {hostname} (ID: {site_deployment_id})") + # Update the article with the assigned site + saved_content.site_deployment_id = site_deployment_id + self.content_repo.session.add(saved_content) + self.content_repo.session.commit() + + # Generate images (now with assigned site_deployment_id) + hero_url, content_image_urls = self._generate_images_only( + project_id=project_id, + tier_name=tier_name, + tier_config=tier_config, + title=title, + site_deployment_id=site_deployment_id, + prefix=prefix + ) + + # Update article with image URLs + saved_content.hero_image_url = hero_url + saved_content.content_images = content_image_urls if content_image_urls else None + self.content_repo.session.add(saved_content) + self.content_repo.session.commit() + click.echo(f"{prefix} Saved (ID: {saved_content.id}, Status: {status})") def _generate_and_insert_images( @@ -397,9 +425,14 @@ class BatchProcessor: """ Generate images and insert into HTML content + WARNING: This method inserts images before interlink injection, which may cause + images to be lost during BeautifulSoup parsing. Consider using _generate_images_only() + and _reinsert_images() instead. + Note: image_config is always created by job config parser (with defaults if not in JSON). Defaults: hero images for all tiers (1280x720), content images for T1 only (1-3 images). """ + click.echo(f"{prefix} WARNING: DO YOU REALLY WANT TO GEN AND INSERT THE IMAGE? 
This may cause images to be lost during interlink injection!") if not tier_config.image_config: return content, None, [] @@ -499,6 +532,144 @@ class BatchProcessor: return content, hero_url, content_image_urls + def _generate_images_only( + self, + project_id: int, + tier_name: str, + tier_config: TierConfig, + title: str, + site_deployment_id: Optional[int], + prefix: str + ) -> tuple[Optional[str], List[str]]: + """ + Generate images and upload to storage, but don't insert into HTML. + Returns (hero_url, content_image_urls) for later insertion. + + Note: image_config is always created by job config parser (with defaults if not in JSON). + Defaults: hero images for all tiers (1280x720), content images for T1 only (1-3 images). + """ + if not tier_config.image_config: + return None, [] + + project = self.project_repo.get_by_id(project_id) + if not project: + return None, [] + + # Initialize image generator + image_generator = ImageGenerator( + ai_client=self.generator.ai_client, + prompt_manager=self.generator.prompt_manager, + project_repo=self.project_repo + ) + + storage_client = BunnyStorageClient() + hero_url = None + content_image_urls = [] + + # Generate hero image (all tiers if enabled) + if tier_config.image_config.hero: + try: + click.echo(f"{prefix} Generating hero image...") + hero_image = image_generator.generate_hero_image( + project_id=project_id, + title=title, + width=tier_config.image_config.hero.width, + height=tier_config.image_config.hero.height + ) + + if hero_image and site_deployment_id: + site = self.site_deployment_repo.get_by_id(site_deployment_id) if self.site_deployment_repo else None + if site: + main_keyword_slug = slugify(project.main_keyword) + file_path = f"images/{main_keyword_slug}.jpg" + hero_url = upload_image_to_storage(storage_client, site, hero_image, file_path) + if hero_url: + click.echo(f"{prefix} Hero image uploaded: {hero_url}") + else: + click.echo(f"{prefix} Hero image upload failed") + except Exception as e: + 
click.echo(f"{prefix} Hero image generation failed: {e}") + + # Generate content images (T1 only, if enabled) + if tier_config.image_config.content and tier_config.image_config.content.max_num_images > 0: + try: + num_images = random.randint( + tier_config.image_config.content.min_num_images, + tier_config.image_config.content.max_num_images + ) + + if num_images > 0: + click.echo(f"{prefix} Generating {num_images} content image(s)...") + + entities = project.entities or [] + related_searches = project.related_searches or [] + + if not entities or not related_searches: + click.echo(f"{prefix} Skipping content images (no entities/related_searches)") + else: + for i in range(num_images): + try: + entity = random.choice(entities) + related_search = random.choice(related_searches) + + content_image = image_generator.generate_content_image( + project_id=project_id, + entity=entity, + related_search=related_search, + width=tier_config.image_config.content.width, + height=tier_config.image_config.content.height + ) + + if content_image and site_deployment_id: + site = self.site_deployment_repo.get_by_id(site_deployment_id) if self.site_deployment_repo else None + if site: + main_keyword_slug = slugify(project.main_keyword) + entity_slug = slugify(entity) + related_slug = slugify(related_search) + file_path = f"images/{main_keyword_slug}-{entity_slug}-{related_slug}.jpg" + img_url = upload_image_to_storage(storage_client, site, content_image, file_path) + if img_url: + content_image_urls.append(img_url) + click.echo(f"{prefix} Content image {i+1}/{num_images} uploaded: {img_url}") + except Exception as e: + click.echo(f"{prefix} Content image {i+1} generation failed: {e}") + except Exception as e: + click.echo(f"{prefix} Content image generation failed: {e}") + + return hero_url, content_image_urls + + def _reinsert_images( + self, + content_records: List, + project + ) -> None: + """Re-insert images into content after interlink injection""" + import re + + for content in 
content_records: + if not content.hero_image_url and not content.content_images: + continue + + html = content.content + + # Remove existing images first (to avoid duplicates) + # Remove all img tags + html = re.sub(r'<img[^>]*>', '', html) + + # Insert hero image if exists + if content.hero_image_url: + alt_text = generate_alt_text(project) + html = insert_hero_after_h1(html, content.hero_image_url, alt_text) + + # Insert content images if exist + if content.content_images: + alt_texts = [generate_alt_text(project) for _ in content.content_images] + html = insert_content_images_after_h2s(html, content.content_images, alt_texts) + + # Update content + content.content = html + self.content_repo.update(content) + def _process_articles_concurrent( self, article_tasks: List[Dict[str, Any]], @@ -590,6 +761,8 @@ class BatchProcessor: title: str, keyword: str, resolved_targets: Dict[str, int], + job: Job, + project_keyword: str, debug: bool, models = None ): @@ -680,17 +853,7 @@ class BatchProcessor: with self.stats_lock: self.stats["augmented_articles"] += 1 - # Generate and insert images - content, hero_url, content_image_urls = self._generate_and_insert_images( - project_id=project_id, - tier_name=tier_name, - tier_config=tier_config, - title=title, - content=content, - site_deployment_id=site_deployment_id, - prefix=prefix - ) - + # Create article first so we can assign a site saved_content = thread_content_repo.create( project_id=project_id, tier=tier_name, @@ -701,10 +864,121 @@ class BatchProcessor: word_count=word_count, status=status, site_deployment_id=site_deployment_id, - hero_image_url=hero_url, - content_images=content_image_urls if content_image_urls else None + hero_image_url=None, + content_images=None ) + + # Assign site if not explicitly assigned + if not site_deployment_id: + from src.database.repositories import SiteDeploymentRepository + thread_site_repo = SiteDeploymentRepository(thread_session) + assigned_site = assign_site_to_single_article( +
content=saved_content, + job=job, + site_repo=thread_site_repo, + content_repo=thread_content_repo, + project_keyword=project_keyword + ) + if assigned_site: + site_deployment_id = assigned_site.id + hostname = assigned_site.custom_hostname or assigned_site.pull_zone_bcdn_hostname + click.echo(f"{prefix} Assigned to site: {hostname} (ID: {site_deployment_id})") + # Update the article with the assigned site + saved_content.site_deployment_id = site_deployment_id + thread_session.add(saved_content) + thread_session.commit() + + # Generate images (now with assigned site_deployment_id) + from src.generation.image_generator import ImageGenerator + from src.generation.image_upload import upload_image_to_storage + from src.deployment.bunny_storage import BunnyStorageClient + + thread_image_generator = ImageGenerator( + ai_client=thread_generator.ai_client, + prompt_manager=thread_generator.prompt_manager, + project_repo=thread_project_repo + ) + + hero_url = None + content_image_urls = [] + + if tier_config.image_config: + project = thread_project_repo.get_by_id(project_id) + if project: + storage_client = BunnyStorageClient() + from src.database.repositories import SiteDeploymentRepository + thread_site_repo = SiteDeploymentRepository(thread_session) + + # Generate hero image + if tier_config.image_config.hero: + try: + click.echo(f"{prefix} Generating hero image...") + hero_image = thread_image_generator.generate_hero_image( + project_id=project_id, + title=title, + width=tier_config.image_config.hero.width, + height=tier_config.image_config.hero.height + ) + + if hero_image and site_deployment_id: + site = thread_site_repo.get_by_id(site_deployment_id) + if site: + main_keyword_slug = slugify(project.main_keyword) + file_path = f"images/{main_keyword_slug}.jpg" + hero_url = upload_image_to_storage(storage_client, site, hero_image, file_path) + if hero_url: + click.echo(f"{prefix} Hero image uploaded: {hero_url}") + except Exception as e: + click.echo(f"{prefix} Hero 
image generation failed: {e}") + + # Generate content images + if tier_config.image_config.content and tier_config.image_config.content.max_num_images > 0: + try: + num_images = random.randint( + tier_config.image_config.content.min_num_images, + tier_config.image_config.content.max_num_images + ) + + if num_images > 0: + click.echo(f"{prefix} Generating {num_images} content image(s)...") + + entities = project.entities or [] + related_searches = project.related_searches or [] + + if entities and related_searches: + for i in range(num_images): + try: + entity = random.choice(entities) + related_search = random.choice(related_searches) + + content_image = thread_image_generator.generate_content_image( + project_id=project_id, + entity=entity, + related_search=related_search, + width=tier_config.image_config.content.width, + height=tier_config.image_config.content.height + ) + + if content_image and site_deployment_id: + site = thread_site_repo.get_by_id(site_deployment_id) + if site: + main_keyword_slug = slugify(project.main_keyword) + entity_slug = slugify(entity) + related_slug = slugify(related_search) + file_path = f"images/{main_keyword_slug}-{entity_slug}-{related_slug}.jpg" + img_url = upload_image_to_storage(storage_client, site, content_image, file_path) + if img_url: + content_image_urls.append(img_url) + click.echo(f"{prefix} Content image {i+1}/{num_images} uploaded: {img_url}") + except Exception as e: + click.echo(f"{prefix} Content image {i+1} generation failed: {e}") + except Exception as e: + click.echo(f"{prefix} Content image generation failed: {e}") + + # Update article with image URLs + saved_content.hero_image_url = hero_url + saved_content.content_images = content_image_urls if content_image_urls else None + thread_session.add(saved_content) thread_session.commit() click.echo(f"{prefix} Saved (ID: {saved_content.id}, Status: {status})") @@ -825,6 +1099,16 @@ class BatchProcessor: ) click.echo(f" Interlinks injected successfully") + # Step 
3.5: Re-insert images after interlink injection + click.echo(f" Re-inserting images...") + self._reinsert_images(content_records, project) + click.echo(f" Images re-inserted successfully") + + # Refresh content records to ensure we have latest content with images + self.content_repo.session.expire_all() + for content in content_records: + self.content_repo.session.refresh(content) + # Step 4: Apply templates click.echo(f" Applying templates...") url_map = {url_info["content_id"]: url_info["url"] for url_info in article_urls} @@ -836,6 +1120,8 @@ class BatchProcessor: template_count += 1 except Exception as e: click.echo(f" Warning: Failed to apply template to content {content.id}: {e}") + import traceback + click.echo(f" Traceback: {traceback.format_exc()}") click.echo(f" Applied templates to {template_count}/{len(content_records)} articles") click.echo(f" {tier_name}: Post-processing complete") diff --git a/src/generation/site_assignment.py b/src/generation/site_assignment.py index 2126c28..816daf7 100644 --- a/src/generation/site_assignment.py +++ b/src/generation/site_assignment.py @@ -44,6 +44,89 @@ def _get_keyword_sites( return matching +def assign_site_to_single_article( + content: GeneratedContent, + job: Job, + site_repo: SiteDeploymentRepository, + content_repo, + project_keyword: str +) -> Optional[SiteDeployment]: + """ + Assign a site to a single article if it doesn't already have one. 
+ + Uses the same priority logic as assign_sites_to_batch: + - Tier1: preferred sites → keyword sites → random + - Tier2+: keyword sites → random + + Args: + content: GeneratedContent record to assign site to + job: Job configuration with site assignment settings + site_repo: SiteDeploymentRepository for querying/updating + content_repo: GeneratedContentRepository to query already-assigned sites + project_keyword: Main keyword from project + + Returns: + Assigned SiteDeployment if successful, None if already assigned or no sites available + """ + if content.site_deployment_id: + return None + + all_sites = site_repo.get_all() + if not all_sites: + logger.warning(f"No sites available for content_id={content.id}") + return None + + already_assigned_articles = content_repo.get_by_project_and_tier( + content.project_id, content.tier, require_site=False + ) + already_assigned_site_ids: Set[int] = { + a.site_deployment_id for a in already_assigned_articles if a.site_deployment_id + } + + available_pool = [s for s in all_sites if s.id not in already_assigned_site_ids] + + if not available_pool: + logger.warning(f"No available sites for content_id={content.id} (all sites already used in this tier)") + return None + + is_tier1 = content.tier.lower() == "tier1" + assigned_site = None + + preferred_sites_map = {} + if is_tier1 and job.tier1_preferred_sites: + for hostname in job.tier1_preferred_sites: + site = site_repo.get_by_hostname(hostname) or site_repo.get_by_bcdn_hostname(hostname) + if site and site.id not in already_assigned_site_ids: + preferred_sites_map[site.id] = site + + if is_tier1 and preferred_sites_map: + for site_id, site in preferred_sites_map.items(): + assigned_site = site + logger.info(f"Assigned content_id={content.id} to preferred site: {site.custom_hostname or site.pull_zone_bcdn_hostname}") + break + + if not assigned_site and content.keyword: + keyword_matches = _get_keyword_sites(available_pool, content.keyword) + for site in keyword_matches: + if 
site.id not in already_assigned_site_ids: + assigned_site = site + logger.info(f"Assigned content_id={content.id} to keyword site: {site.site_name}") + break + + if not assigned_site: + remaining_pool = [s for s in available_pool if s.id not in already_assigned_site_ids] + if remaining_pool: + assigned_site = random.choice(remaining_pool) + logger.info(f"Assigned content_id={content.id} to random site: {assigned_site.custom_hostname or assigned_site.pull_zone_bcdn_hostname}") + + if assigned_site: + content.site_deployment_id = assigned_site.id + site_repo.session.add(content) + site_repo.session.commit() + + return assigned_site + + def assign_sites_to_batch( content_records: List[GeneratedContent], job: Job,