From 4d3a78d255cde42175e2b71bc74b2daa68bd50eb Mon Sep 17 00:00:00 2001 From: PeninsulaInd Date: Wed, 21 Jan 2026 16:23:29 -0600 Subject: [PATCH] added branded + anchor text with -bp flag --- .cursor/plans/branded-plus-anchor-text.md | 116 ++++++++++++++++++ src/cli/commands.py | 143 +++++++++++++++++++--- 2 files changed, 242 insertions(+), 17 deletions(-) create mode 100644 .cursor/plans/branded-plus-anchor-text.md diff --git a/.cursor/plans/branded-plus-anchor-text.md b/.cursor/plans/branded-plus-anchor-text.md new file mode 100644 index 0000000..336e0a6 --- /dev/null +++ b/.cursor/plans/branded-plus-anchor-text.md @@ -0,0 +1,116 @@ +# Branded+ Anchor Text Implementation Plan + +## Overview + +Enhance the `ingest-cora` command to support "branded+" anchor text generation, which combines brand names with related searches. Add a brand mapping system to store company URLs and their associated brand names, and update the anchor text calculation logic to handle branded, branded+, and regular terms sequentially. + +## Components + +### 1. Brand Mapping Storage + +- **File**: `brands.json` (root directory) +- **Format**: JSON mapping normalized domains to brand name arrays + ```json + { + "gullco.com": ["Gullco", "Gullco International"] + } + ``` +- **Location**: Project root for easy editing +- **Normalization**: Store only normalized domains (no www., no scheme) + +### 2. Brand Lookup Helper (Inline) + +- **File**: `src/cli/commands.py` (add helper function) +- **Function**: `_get_brands_for_url(url: str) -> List[str]` + - Extract domain from URL (remove scheme, www., trailing slash) + - Load brands.json from project root + - Lookup normalized domain + - Return brand names list or empty list if not found/file missing + +### 3. 
Branded+ Anchor Text Generation + +- **File**: `src/cli/commands.py` (modify `create_job_file_for_project`) +- **Patterns**: Generate two variations per related search: + - `"{brand} {term}"` (e.g., "Gullco welder") + - `"{term} by {brand}"` (e.g., "welder by Gullco") +- **Logic**: For each brand name and each related search, generate both patterns + +### 4. CLI Command Updates + +- **File**: `src/cli/commands.py` (modify `ingest_cora`) +- **New flag**: `--tier1-branded-plus-ratio` / `-bp` (float, optional) + - Branded+ is only configured if this flag is provided (no separate prompt) + - Value is the fraction (greater than 0.0, up to 1.0) of remaining slots after branded +- **Brand text prompt update**: + - Show default brands from brand mapping if URL found + - Allow Enter to accept defaults + - Format: "Enter branded anchor text (company name) for tier1 [default: 'Gullco, Gullco International'] (press Enter for default):" + +### 5. Anchor Text Calculation Logic + +- **File**: `src/cli/commands.py` (modify `create_job_file_for_project`) +- **Calculation order**: + 1. Get available terms (custom_anchor_text or related_searches) + 2. Calculate branded count: `total * tier1_branded_ratio` + 3. Calculate remaining: `total - branded_count` + 4. Calculate branded+ count: `remaining * branded_plus_ratio` (if enabled) + 5. Calculate regular count: `remaining - branded_plus_count` +- **Generation**: + - Branded terms: Use provided brand names (cycled) + - Branded+ terms: Generate from brands + related_searches (both patterns) + - Regular terms: Use remaining related_searches/keyword variations + +### 6. 
Function Signature Updates + +- **File**: `src/cli/commands.py` +- **`create_job_file_for_project`**: + - Add `tier1_branded_plus_ratio: Optional[float] = None` + - Add `brand_names: Optional[List[str]] = None` (for branded+ generation) +- **`ingest_cora`**: + - Add `tier1_branded_plus_ratio: Optional[float] = None` parameter + - Pass brand names to `create_job_file_for_project` + +## Implementation Details + +### Brand Lookup Flow + +1. Normalize `money_site_url`: remove scheme (http://, https://), remove www. prefix, remove trailing slash +2. Look up normalized domain in brands.json +3. Return list of brand names or empty list if not found + +### Branded+ Generation Example + +- Brands: ["Gullco", "Gullco International"] +- Related searches: ["welder", "automatic welder"] +- Generated terms: + - "Gullco welder" + - "welder by Gullco" + - "Gullco automatic welder" + - "automatic welder by Gullco" + - "Gullco International welder" + - "welder by Gullco International" + - "Gullco International automatic welder" + - "automatic welder by Gullco International" + +### Anchor Text Distribution Example + +- Total available terms: 10 +- `tier1_branded_ratio`: 0.4 → 4 branded terms +- Remaining: 6 +- `tier1_branded_plus_ratio`: 0.67 → 4 branded+ terms +- Regular: 2 terms +- Final list: [4 branded, 4 branded+, 2 regular] + +## Files to Modify + +1. `src/cli/commands.py` - Add branded+ logic, brand lookup helper, update prompts, calculation +2. 
`brands.json` - New file for brand mappings (create with example entry) + +## Testing Considerations + +- Test with brand mapping present and absent +- Test with Enter (default) and custom brand input +- Test branded+ calculation with various ratios +- Test URL normalization (with/without www., http/https) +- Test with multiple brand names per URL +- Test with no related searches (fallback behavior) diff --git a/src/cli/commands.py b/src/cli/commands.py index aee8684..181ee00 100644 --- a/src/cli/commands.py +++ b/src/cli/commands.py @@ -3,7 +3,7 @@ CLI command definitions using Click """ import random import click -from typing import Optional +from typing import Optional, List from src.core.config import get_config, get_bunny_account_api_key, get_concurrent_workers from src.auth.service import AuthService from src.database.session import db_manager @@ -37,12 +37,49 @@ from datetime import datetime load_dotenv() +def _get_brands_for_url(url: str) -> List[str]: + """ + Look up brand names for a given URL from brands.json + + Args: + url: Money site URL (e.g., "https://www.gullco.com") + + Returns: + List of brand names, or empty list if not found or file missing + """ + try: + from urllib.parse import urlparse + + # Normalize URL: remove scheme, www., trailing slash + parsed = urlparse(url) + domain = parsed.netloc + + # Remove www. 
prefix if present + if domain.startswith('www.'): + domain = domain[4:] + + # Load brands.json from project root + brands_file = Path("brands.json") + if not brands_file.exists(): + return [] + + with open(brands_file, 'r', encoding='utf-8') as f: + brands_data = json.load(f) + + # Look up normalized domain + return brands_data.get(domain, []) + except Exception: + return [] + + def create_job_file_for_project( project_id: int, project_name: str, session, tier1_branded_ratio: Optional[float] = None, tier1_branded_text: Optional[str] = None, + tier1_branded_plus_ratio: Optional[float] = None, + brand_names: Optional[List[str]] = None, random_deployment_targets: Optional[int] = None ) -> Optional[str]: """ @@ -54,6 +91,8 @@ def create_job_file_for_project( session: Database session tier1_branded_ratio: Optional ratio of branded anchor text for tier1 (0.0-1.0) tier1_branded_text: Optional branded anchor text (company name) for tier1 + tier1_branded_plus_ratio: Optional ratio of branded+ anchor text for tier1 (0.0-1.0, applied to remaining slots after branded) + brand_names: Optional list of brand names for branded+ generation random_deployment_targets: Optional number of random deployment targets to select (default: random 2-3) Returns: @@ -107,8 +146,8 @@ def create_job_file_for_project( } } - # Add anchor_text_config if branded ratio and text are provided - if tier1_branded_ratio is not None and tier1_branded_text: + # Add anchor_text_config if branded ratio/text or branded+ ratio is provided + if (tier1_branded_ratio is not None and tier1_branded_text) or (tier1_branded_plus_ratio is not None and brand_names): # Get project to retrieve main_keyword for non-branded terms project_repo = ProjectRepository(session) project = project_repo.get_by_id(project_id) @@ -128,24 +167,58 @@ def create_job_file_for_project( # Use the ACTUAL count of available terms actual_count = len(keyword_variations) - # Calculate branded and keyword counts based on actual available terms - 
branded_count = int(actual_count * tier1_branded_ratio) - keyword_count = actual_count - branded_count + # Calculate branded and remaining counts based on actual available terms + branded_count = 0 + if tier1_branded_ratio is not None and tier1_branded_text: + branded_count = int(actual_count * tier1_branded_ratio) + remaining_count = actual_count - branded_count # Parse comma-separated branded anchor texts - branded_texts = [text.strip() for text in tier1_branded_text.split(',') if text.strip()] + branded_texts = [] + if tier1_branded_text: + branded_texts = [text.strip() for text in tier1_branded_text.split(',') if text.strip()] - # Create anchor text list with branded terms (cycling through multiple if provided) and custom anchor text from CORA + # Create anchor text list starting with branded terms anchor_terms = [] for i in range(branded_count): branded_text = branded_texts[i % len(branded_texts)] # Cycle through branded texts anchor_terms.append(branded_text) - # Randomize keyword selection if we're not using all available terms - if keyword_count < actual_count: - selected_keywords = random.sample(keyword_variations, keyword_count) - else: - selected_keywords = keyword_variations - anchor_terms.extend(selected_keywords) + + # Generate branded+ terms if enabled + branded_plus_count = 0 + if tier1_branded_plus_ratio is not None and brand_names and len(brand_names) > 0: + branded_plus_count = int(remaining_count * tier1_branded_plus_ratio) + + # Generate branded+ terms from brands + related_searches + # Use related_searches from project, or fallback to keyword_variations + related_searches = project.related_searches if project.related_searches else keyword_variations + + branded_plus_terms = [] + for brand in brand_names: + for term in related_searches: + branded_plus_terms.append(f"{brand} {term}") + branded_plus_terms.append(f"{term} by {brand}") + + # Randomly select the needed number of branded+ terms + if len(branded_plus_terms) > 0: + if 
branded_plus_count > len(branded_plus_terms): + selected_branded_plus = branded_plus_terms + else: + selected_branded_plus = random.sample(branded_plus_terms, branded_plus_count) + anchor_terms.extend(selected_branded_plus) + + # Calculate regular count from remaining slots + regular_count = remaining_count - branded_plus_count + + # Add regular terms + if regular_count > 0: + # Randomize keyword selection if we're not using all available terms + if regular_count < len(keyword_variations): + selected_keywords = random.sample(keyword_variations, regular_count) + else: + selected_keywords = keyword_variations[:regular_count] + anchor_terms.extend(selected_keywords) + tier1_config["anchor_text_config"] = { "mode": "explicit", "terms": anchor_terms @@ -1005,10 +1078,11 @@ def sync_sites(admin_user: Optional[str], admin_password: Optional[str], dry_run @click.option('--money-site-url', '-m', help='Money site URL (e.g., https://example.com)') @click.option('--custom-anchors', '-a', help='Comma-separated list of custom anchor text (optional)') @click.option('--tier1-branded-ratio', '-t', default=None, type=float, help='Ratio of branded anchor text for tier1 (optional, only prompts if provided)') +@click.option('--tier1-branded-plus-ratio', '-bp', default=None, type=float, help='Ratio of branded+ anchor text for tier1 (optional, applied to remaining slots after branded)') @click.option('--random-deployment-targets', '-r', type=int, help='Number of random deployment targets to select (default: random 2-3)') @click.option('--username', '-u', help='Username for authentication') @click.option('--password', '-p', help='Password for authentication') -def ingest_cora(file_path: str, name: str, money_site_url: Optional[str], custom_anchors: Optional[str], tier1_branded_ratio: float, random_deployment_targets: Optional[int], username: Optional[str], password: Optional[str]): +def ingest_cora(file_path: str, name: str, money_site_url: Optional[str], custom_anchors: Optional[str], 
tier1_branded_ratio: float, tier1_branded_plus_ratio: Optional[float], random_deployment_targets: Optional[int], username: Optional[str], password: Optional[str]): """Ingest a CORA .xlsx report and create a new project""" try: if not username or not password: @@ -1079,15 +1153,48 @@ def ingest_cora(file_path: str, name: str, money_site_url: Optional[str], custom # Handle tier1 branded anchor text if ratio is specified tier1_branded_text = None + brand_names = None if tier1_branded_ratio is not None and tier1_branded_ratio > 0: + # Look up default brands from brand mapping + default_brands = _get_brands_for_url(money_site_url) + default_prompt = "" + if default_brands: + default_prompt = f" [default: '{', '.join(default_brands)}'] (press Enter for default)" + tier1_branded_text = click.prompt( - "\nEnter branded anchor text (company name) for tier1 (comma-separated for multiple, e.g., 'AGI Fabricators, AGI')", - type=str + f"\nEnter branded anchor text (company name) for tier1 (comma-separated for multiple, e.g., 'AGI Fabricators, AGI'){default_prompt}", + type=str, + default="" ).strip() + + # Use defaults if Enter was pressed and defaults exist + if not tier1_branded_text and default_brands: + tier1_branded_text = ", ".join(default_brands) + click.echo(f"Using default brands: {tier1_branded_text}") + if not tier1_branded_text: click.echo("Warning: Empty branded anchor text provided, skipping tier1 branded anchor text configuration.", err=True) tier1_branded_text = None tier1_branded_ratio = None + else: + # Parse brand names for branded+ generation + brand_names = [text.strip() for text in tier1_branded_text.split(',') if text.strip()] + + # Handle branded+ ratio if flag is provided + if tier1_branded_plus_ratio is not None: + # Validate the provided ratio + if tier1_branded_plus_ratio <= 0 or tier1_branded_plus_ratio > 1: + click.echo("Warning: Invalid branded+ ratio provided, skipping branded+ configuration.", err=True) + tier1_branded_plus_ratio = None + elif 
not brand_names: + # If brand names weren't set from branded prompt, try to get them from brand lookup + default_brands = _get_brands_for_url(money_site_url) + if default_brands: + brand_names = default_brands + click.echo(f"Using brand names from mapping for branded+: {', '.join(brand_names)}") + else: + click.echo("Warning: No brand names available for branded+ (set --tier1-branded-ratio or add to brands.json). Skipping branded+ configuration.", err=True) + tier1_branded_plus_ratio = None job_file = create_job_file_for_project( project.id, @@ -1095,6 +1202,8 @@ def ingest_cora(file_path: str, name: str, money_site_url: Optional[str], custom session, tier1_branded_ratio=tier1_branded_ratio, tier1_branded_text=tier1_branded_text, + tier1_branded_plus_ratio=tier1_branded_plus_ratio, + brand_names=brand_names, random_deployment_targets=random_deployment_targets ) if job_file: