# CheddahBot/.claude/skills/content-researcher/scripts/cora_parser.py

"""
Cora SEO Report Parser
Reads a Cora XLSX file and extracts structured data from relevant sheets.
Used as a foundation module by entity_optimizer, lsi_optimizer, and seo_optimizer.
Usage:
uv run --with openpyxl python cora_parser.py <xlsx_path> [--sheet SHEET] [--format FORMAT]
Options:
--sheet Which data to extract: entities, lsi, variations, results, tunings,
structure, densities, targets, summary, all (default: summary)
--format Output format: json, text (default: text)
"""
# Standard-library imports.
import argparse
import json
import math
import re
import sys
from pathlib import Path

# openpyxl is the only third-party dependency; fail fast with an actionable
# install hint instead of a bare ImportError traceback.
try:
    import openpyxl
except ImportError:
    print("Error: openpyxl is required. Install with: uv add openpyxl", file=sys.stderr)
    sys.exit(1)
# =============================================================================
# Optimization Rules
#
# Hard-wired overrides that apply regardless of what Cora data says.
# These encode expert SEO knowledge and practical constraints.
# =============================================================================
OPTIMIZATION_RULES = {
    # Heading rules
    "h1_max": 1,  # Never more than 1 H1
    "h1_min": 1,  # Always have exactly 1 H1
    "optimize_headings": ["h1", "h2", "h3"],  # Primary optimization targets
    "low_priority_headings": ["h4"],  # Only add if most competitors have them
    "ignore_headings": ["h5", "h6"],  # Skip entirely
    # Keyword density
    "exact_match_density_min": 0.02,  # 2% minimum for exact match keyword
    "no_keyword_stuffing_limit": True,  # Do NOT flag for keyword stuffing
    # Variations capture exact match, so hitting variation density covers it
    # Word count strategy
    "word_count_strategy": "cluster",  # "cluster" = nearest competitive cluster, not raw average
    "word_count_acceptable_max": 1500,  # Up to 1500 is always acceptable even if target is lower
    # Density awareness
    "density_interdependent": True,  # Adding content changes all density calculations
    # Entity / LSI filtering
    "exclude_competitor_entities": True,  # Never use competitor company names as entities or LSI
    "exclude_measurement_entities": True,  # Ignore measurements (dimensions, tolerances) as entities
    "allow_organization_entities": True,  # Organizations like ISO, ANSI, etc. are OK
    "never_mention_competitors": True,  # Never mention competitors by name in content
    # Entity correlation threshold
    # Best of Both = lower of Spearman's or Pearson's correlation.
    # Measures correlation to ranking position (1=top, 100=bottom), so negative = better ranking.
    # Only include entities with Best of Both <= this value.
    # Set to None to disable filtering.
    "entity_correlation_threshold": -0.19,
}
class CoraReport:
    """Parses a Cora SEO XLSX report and provides structured access to its data."""

    def __init__(self, xlsx_path: str):
        # Validate the path up front so callers get a clear error before
        # openpyxl is ever invoked.
        self.path = Path(xlsx_path)
        if not self.path.exists():
            raise FileNotFoundError(f"XLSX file not found: {xlsx_path}")
        # data_only=True reads cached formula results rather than formula text.
        self.wb = openpyxl.load_workbook(str(self.path), data_only=True)
        self._site_domain = None  # Cached after first detection
# -------------------------------------------------------------------------
# Core metadata
# -------------------------------------------------------------------------
    def get_sheet_names(self) -> list[str]:
        """Return the names of all worksheets in the workbook."""
        return self.wb.sheetnames
def get_search_term(self) -> str:
"""Extract the target keyword from the report."""
for sheet_name in ["Basic Tunings", "Strategic Overview", "Structure"]:
if sheet_name not in self.wb.sheetnames:
continue
ws = self.wb[sheet_name]
for row in ws.iter_rows(min_row=1, max_row=10, values_only=True):
if row and row[0] == "Search Terms" and len(row) > 1 and row[1]:
return str(row[1])
return ""
def get_variations_list(self) -> list[str]:
"""Extract the keyword variations list from Strategic Overview B10.
These are pipe-delimited inside curly braces:
{cnc screw|cnc screw machining|cnc swiss|...}
"""
if "Strategic Overview" not in self.wb.sheetnames:
return []
ws = self.wb["Strategic Overview"]
rows = list(ws.iter_rows(min_row=1, max_row=12, values_only=True))
for row in rows:
if row and row[0] == "Keywords" and len(row) > 1 and row[1]:
raw = str(row[1]).strip()
# Remove curly braces and split on pipe
raw = raw.strip("{}")
return [v.strip() for v in raw.split("|") if v.strip()]
return []
def get_site_domain(self) -> str:
"""Detect the user's site domain from the report.
Looks for the domain in the Entities sheet header (column with a .com/.net etc.
that isn't a standard Cora column) or the site column in other sheets.
"""
if self._site_domain:
return self._site_domain
# Try Entities sheet first
if "Entities" in self.wb.sheetnames:
ws = self.wb["Entities"]
rows = list(ws.iter_rows(min_row=1, max_row=5, values_only=True))
for row in rows:
if row and row[0] == "Entity":
for h in row:
if h and isinstance(h, str):
h = h.strip()
if re.match(r'^[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$', h):
self._site_domain = h
return h
# Try LSI Keywords sheet — header like "#40.7 hoggeprecision.com"
if "LSI Keywords" in self.wb.sheetnames:
ws = self.wb["LSI Keywords"]
rows = list(ws.iter_rows(min_row=1, max_row=10, values_only=True))
for row in rows:
if row and row[0] == "LSI Keyword":
for h in row:
if h and isinstance(h, str):
match = re.search(r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})', h.strip())
if match:
self._site_domain = match.group(1)
return self._site_domain
return ""
# -------------------------------------------------------------------------
# Entities
# -------------------------------------------------------------------------
def get_entities(self) -> list[dict]:
"""Extract entities from the Entities sheet.
Returns list of dicts with: name, freebase_id, wikidata_id, wiki_link,
relevance, confidence, type, correlation, current_count, max_count, deficit
"""
if "Entities" not in self.wb.sheetnames:
return []
ws = self.wb["Entities"]
rows = list(ws.iter_rows(values_only=True))
# Find header row containing "Entity", "Freebase ID", etc.
header_idx = None
for i, row in enumerate(rows):
if row and row[0] == "Entity" and len(row) > 1 and row[1] == "Freebase ID":
header_idx = i
break
if header_idx is None:
return []
headers = rows[header_idx]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
# Find the site-specific column (domain name like "hoggeprecision.com")
site_col_idx = None
site_domain = self.get_site_domain()
if site_domain:
site_col_idx = col_map.get(site_domain)
entities = []
for row in rows[header_idx + 1:]:
if not row or not row[0]:
continue
name = str(row[0]).strip()
if not name:
continue
# Skip rows that look like metadata (e.g., "critical values: ...")
if name.startswith("critical") or name.startswith("http"):
continue
correlation = _safe_float(row, col_map.get("Best of Both"))
# Filter by Best of Both correlation threshold.
# Lower (more negative) = stronger ranking signal (correlates with
# position 1 vs 100). Only keep entities at or below the threshold.
threshold = OPTIMIZATION_RULES.get("entity_correlation_threshold")
if threshold is not None and (correlation is None or correlation > threshold):
continue
entity = {
"name": name,
"freebase_id": _safe_str(row, col_map.get("Freebase ID")),
"wikidata_id": _safe_str(row, col_map.get("Wikidata ID")),
"wiki_link": _safe_str(row, col_map.get("Wiki Link")),
"relevance": _safe_float(row, col_map.get("Relevance")),
"confidence": _safe_float(row, col_map.get("Confidence")),
"type": _safe_str(row, col_map.get("Type")),
"correlation": correlation,
"current_count": _safe_int(row, site_col_idx),
"max_count": _safe_int(row, col_map.get("Max")),
"deficit": _safe_int(row, col_map.get("Deficit")),
}
entities.append(entity)
return entities
# -------------------------------------------------------------------------
# LSI Keywords
# -------------------------------------------------------------------------
def get_lsi_keywords(self) -> list[dict]:
"""Extract LSI keywords from the LSI Keywords sheet.
Returns list of dicts with: keyword, spearmans, pearsons, best_of_both,
pages, max, avg, current_count, deficit
"""
if "LSI Keywords" not in self.wb.sheetnames:
return []
ws = self.wb["LSI Keywords"]
rows = list(ws.iter_rows(values_only=True))
# Find header row containing "LSI Keyword", "Spearmans", etc.
header_idx = None
for i, row in enumerate(rows):
if row and row[0] == "LSI Keyword":
header_idx = i
break
if header_idx is None:
return []
headers = rows[header_idx]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
# Find site column — pattern like "#40.7 hoggeprecision.com"
site_col_idx = None
site_domain = self.get_site_domain()
if site_domain:
for j, h in enumerate(headers):
if h and isinstance(h, str) and site_domain in h:
site_col_idx = j
break
if site_col_idx is None:
site_col_idx = _find_site_col_idx(headers)
lsi_keywords = []
for row in rows[header_idx + 1:]:
if not row or not row[0]:
continue
keyword = str(row[0]).strip()
if not keyword:
continue
lsi = {
"keyword": keyword,
"spearmans": _safe_float(row, col_map.get("Spearmans")),
"pearsons": _safe_float(row, col_map.get("Pearsons")),
"best_of_both": _safe_float(row, col_map.get("Best of Both")),
"pages": _safe_int(row, col_map.get("Pages")),
"max": _safe_int(row, col_map.get("Max")),
"avg": _safe_float(row, col_map.get("Avg")),
"current_count": _safe_int(row, site_col_idx),
"deficit": _safe_float(row, col_map.get("Deficit")),
}
lsi_keywords.append(lsi)
return lsi_keywords
# -------------------------------------------------------------------------
# Keyword Variations
# -------------------------------------------------------------------------
def get_keyword_variations(self) -> list[dict]:
"""Extract keyword variation counts from the Variations sheet.
Returns list of dicts with: variation, page1_max, page1_avg
"""
if "Variations" not in self.wb.sheetnames:
return []
ws = self.wb["Variations"]
rows = list(ws.iter_rows(values_only=True))
if not rows or len(rows) < 3:
return []
header_row = rows[0]
# Find where variation columns start (after "# used" column)
var_start = 3 # default
for j, h in enumerate(header_row):
if h and str(h).strip() == "# used":
var_start = j + 1
break
max_row = rows[1] if len(rows) > 1 else None
avg_row = rows[2] if len(rows) > 2 else None
variations = []
for j in range(var_start, len(header_row)):
name = header_row[j]
if not name:
continue
variation = {
"variation": str(name).strip(),
"page1_max": _safe_int(max_row, j) if max_row else 0,
"page1_avg": _safe_int(avg_row, j) if avg_row else 0,
}
variations.append(variation)
return variations
# -------------------------------------------------------------------------
# Structure Targets (per-element targets from Structure sheet)
# -------------------------------------------------------------------------
    def get_structure_targets(self) -> dict:
        """Extract per-element optimization targets from the Structure sheet.

        Returns a dict keyed by element type with sub-targets:
        {
            "title_tag": {"exact_match": 0.2, "variations": 1.3, "entities": 5.8, "lsi_words": 10.7},
            "meta_description": {...},
            "all_h_tags": {"count": 20.7, "exact_match": 0.4, "variations": 5.7, "entities": 45.8, "lsi_words": 77.4},
            "h1": {"count": 1.1, "exact_match": 0.1, "variations": 1, "entities": 3.8, "lsi_words": 7.3},
            "h2": {...},
            "h3": {...},
            "h4": {...},
        }
        Page 1 Average values are in column D (index 3).
        """
        if "Structure" not in self.wb.sheetnames:
            return {}
        ws = self.wb["Structure"]
        rows = list(ws.iter_rows(values_only=True))
        # Find the header row with "Factor Name", "Page 1 Avg" etc.
        header_idx = None
        for i, row in enumerate(rows):
            if row and len(row) > 3:
                if row[2] == "Factor Name" or (row[1] == "Factor ID" and row[2] == "Factor Name"):
                    header_idx = i
                    break
                # Also check for the combined "Best of Both Correlation" header
                if row[0] and "Best of Both" in str(row[0]):
                    header_idx = i
                    break
        if header_idx is None:
            return {}
        # Parse factor rows into sections.
        # Section headers: "TITLE TAG", "META DESCRIPTION", "TOTAL FOR ALL H TAGS",
        # "H1 Data", "H2 Data", "H3 Data", "H4 Data", "H5 Data", "H6 Data"
        section_map = {
            "TITLE TAG": "title_tag",
            "META DESCRIPTION": "meta_description",
            "TOTAL FOR ALL H TAGS": "all_h_tags",
            "H1 Data": "h1",
            "H2 Data": "h2",
            "H3 Data": "h3",
            "H4 Data": "h4",
        }
        # Factor-name substrings mapped to output field names (first match wins,
        # in this declaration order, case-insensitive).
        factor_patterns = {
            "Number of": "count",
            "Exact Match": "exact_match",
            "Variation": "variations",
            "Entities": "entities",
            "LSI": "lsi_words",
            "Search Term": "search_terms",
            "Keywords": "keywords",
        }
        targets = {}
        current_section = None
        for row in rows[header_idx + 1:]:
            if not row or len(row) < 4:
                continue
            factor_name = _safe_str(row, 2)
            # Check if this is a section header
            if factor_name in section_map:
                current_section = section_map[factor_name]
                targets[current_section] = {}
                continue
            # Skip sections we don't care about (H5, H6)
            if factor_name in ("H5 Data", "H6 Data"):
                current_section = None
                continue
            if current_section is None:
                continue
            # Get the Page 1 Average (column D, index 3)
            avg_val = _safe_float(row, 3)
            if avg_val is None:
                continue
            # Map factor name to field
            field_name = None
            for pattern, field in factor_patterns.items():
                if pattern.lower() in factor_name.lower():
                    field_name = field
                    break
            if field_name and current_section:
                # Also grab correlation from column A
                correlation = _safe_float(row, 0)
                # Outlier detection: check if one of the top 10 results
                # (columns E-N, indices 4-13) contributes >50% of the sum.
                # If so, exclude it and recompute the average — that outlier
                # is skewing the target.
                top10 = [_safe_float(row, j) or 0 for j in range(4, 14)]
                top10_sum = sum(top10)
                adjusted_avg = avg_val
                outlier_detected = False
                if top10_sum > 0:
                    max_val = max(top10)
                    if max_val > top10_sum * 0.5 and avg_val > 1:
                        # One result is >50% of the total — outlier.
                        # Skip adjustment when avg <= 1: a single "1" among
                        # zeros triggers the rule but the target is already
                        # small enough that adjustment would zero it out.
                        remaining = [v for v in top10 if v != max_val]
                        # If max_val appears multiple times, only remove one.
                        # NOTE(review): this guard is dead code — the filter
                        # above always removes at least one element, so
                        # len(remaining) can never equal len(top10). When the
                        # max value is duplicated, ALL copies are removed,
                        # which contradicts the stated intent — confirm.
                        if len(remaining) == len(top10):
                            remaining = top10[:]
                            remaining.remove(max_val)
                        if remaining:
                            adjusted_avg = sum(remaining) / len(remaining)
                            outlier_detected = True
                # Targets always round up to the next whole occurrence.
                target_val = math.ceil(adjusted_avg)
                entry = {
                    "avg": avg_val,
                    "target": target_val,
                    "correlation": correlation,
                }
                if outlier_detected:
                    entry["outlier_adjusted"] = True
                    entry["original_target"] = math.ceil(avg_val)
                targets[current_section][field_name] = entry
        return targets
# -------------------------------------------------------------------------
# Density Targets (from Strategic Overview rows 46-48)
# -------------------------------------------------------------------------
    def get_density_targets(self) -> dict:
        """Extract density targets from Strategic Overview rows 46-48.

        Row 46: Variation density
        Row 47: Entity density
        Row 48: LSI density
        Column D (index 3) = Page 1 Average.
        Returns per-result values so we can show distribution.
        """
        if "Strategic Overview" not in self.wb.sheetnames:
            return {}
        ws = self.wb["Strategic Overview"]
        rows = list(ws.iter_rows(values_only=True))
        # Find the density rows — they're the last 3 non-empty rows in the data section.
        # Look for them near the row 46-48 area, identified by having floats in col D
        # and being near the bottom of the data.
        # Approach: find the row with "Relevant Density" and the 3 rows after the gap.
        density_area_start = None
        for i, row in enumerate(rows):
            if row and len(row) > 2 and row[2] == "Relevant Density":
                # Density target rows are a few rows below this
                density_area_start = i
                break
        if density_area_start is None:
            return {}
        # The 3 density rows come after a gap. They have NO values in cols A, B, C —
        # only numeric values from col D onward. Row 44 (which has a correlation in
        # col A) is a count row, not a density row, so we skip it.
        density_rows = []
        for i in range(density_area_start + 1, min(density_area_start + 10, len(rows))):
            row = rows[i]
            if not row:
                continue
            col_a = row[0] if len(row) > 0 else None
            col_b = row[1] if len(row) > 1 else None
            col_c = row[2] if len(row) > 2 else None
            col_d = row[3] if len(row) > 3 else None
            # Density rows have None in A, B, C and a float in D
            if col_a is None and col_b is None and col_c is None and col_d is not None:
                try:
                    # float() is used purely as a validity probe here.
                    float(col_d)
                    density_rows.append(row)
                except (ValueError, TypeError):
                    pass
        # Get result domains from row 22 area for the site column
        result_start_col = 4  # Results start at col E (index 4)
        result = {}
        # Order matters: the sheet lists variation, entity, then LSI density.
        labels = ["variation_density", "entity_density", "lsi_density"]
        for idx, label in enumerate(labels):
            if idx >= len(density_rows):
                break
            row = density_rows[idx]
            avg = _safe_float(row, 3)
            # Collect per-competitor values (up to 10 page-1 results)
            competitor_vals = []
            for j in range(result_start_col, min(result_start_col + 10, len(row))):
                v = _safe_float(row, j)
                if v is not None:
                    competitor_vals.append(v)
            result[label] = {
                "avg": avg,
                # NOTE(review): an avg of exactly 0.0 also renders "N/A"
                # because 0 is falsy — confirm that is intended.
                "avg_pct": f"{avg * 100:.2f}%" if avg else "N/A",
                "competitor_values": competitor_vals,
            }
        return result
# -------------------------------------------------------------------------
# Content Targets (word count, distinct entities, etc.)
# -------------------------------------------------------------------------
def get_content_targets(self) -> dict:
"""Extract key content-level targets from Strategic Overview.
Includes: word count distribution, distinct entities target, variations in HTML, etc.
"""
if "Strategic Overview" not in self.wb.sheetnames:
return {}
ws = self.wb["Strategic Overview"]
rows = list(ws.iter_rows(values_only=True))
targets = {}
result_start_col = 4
for i, row in enumerate(rows):
if not row or len(row) < 4:
continue
factor_name = _safe_str(row, 2)
factor_id = _safe_str(row, 1)
correlation = _safe_float(row, 0)
avg = _safe_float(row, 3)
if not factor_name or avg is None:
continue
# Key factors we care about
if factor_name == "Number of Distinct Entities Used":
competitor_vals = []
for j in range(result_start_col, min(result_start_col + 10, len(row))):
v = _safe_float(row, j)
if v is not None:
competitor_vals.append(int(v))
targets["distinct_entities"] = {
"factor_id": factor_id,
"avg": avg,
"target": math.ceil(avg),
"correlation": correlation,
"competitor_values": competitor_vals,
}
elif factor_name == "Variations in HTML Tags":
targets["variations_in_html"] = {
"factor_id": factor_id,
"avg": avg,
"target": math.ceil(avg),
"correlation": correlation,
}
elif factor_name == "Entities in the HTML Tag":
targets["entities_in_html"] = {
"factor_id": factor_id,
"avg": avg,
"target": math.ceil(avg),
"correlation": correlation,
}
return targets
def get_word_count_distribution(self) -> dict:
"""Get word count data for competitive cluster analysis.
Returns the clean word count for each competitor from the Keywords sheet,
sorted ascending, plus the Page 1 Average and suggested cluster target.
"""
if "Keywords" not in self.wb.sheetnames:
return {}
ws = self.wb["Keywords"]
rows = list(ws.iter_rows(values_only=True))
if not rows:
return {}
headers = rows[0]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
host_idx = col_map.get("Host")
clean_wc_idx = col_map.get("Clean Word Count")
if host_idx is None or clean_wc_idx is None:
return {}
# Collect word counts for page 1 results (top 10)
competitors = []
for row in rows[1:11]:
if not row or not row[host_idx]:
continue
wc = _safe_int(row, clean_wc_idx)
if wc and wc > 0:
competitors.append({
"host": str(row[host_idx]),
"clean_word_count": wc,
})
if not competitors:
return {}
# Sort by word count
competitors.sort(key=lambda x: x["clean_word_count"])
counts = [c["clean_word_count"] for c in competitors]
# Calculate cluster target
avg = sum(counts) / len(counts)
median = counts[len(counts) // 2]
cluster_target = _find_cluster_target(counts)
return {
"competitors": competitors,
"counts_sorted": counts,
"average": round(avg),
"median": median,
"cluster_target": cluster_target,
"min": counts[0],
"max": counts[-1],
}
# -------------------------------------------------------------------------
# Basic Tunings
# -------------------------------------------------------------------------
def get_basic_tunings(self) -> list[dict]:
"""Extract on-page tuning factors from the Basic Tunings sheet."""
if "Basic Tunings" not in self.wb.sheetnames:
return []
ws = self.wb["Basic Tunings"]
rows = list(ws.iter_rows(values_only=True))
# Find sub-header row with "Factor ID", "Factor"
header_idx = None
for i, row in enumerate(rows):
if row and len(row) > 2 and row[1] == "Factor ID" and row[2] == "Factor":
header_idx = i
break
if header_idx is None:
return []
tunings = []
for row in rows[header_idx + 1:]:
if not row:
continue
factor_id = row[1] if len(row) > 1 else None
if not factor_id or not str(factor_id).strip():
continue
factor_id_str = str(factor_id).strip()
if not re.match(r'^[A-Z]{2,}\d+', factor_id_str):
continue
tuning = {
"factor_id": factor_id_str,
"factor": _safe_str(row, 2),
"current": _safe_str(row, 3),
"goal": _safe_str(row, 4),
"percent": _safe_float(row, 5),
"recommendation": _safe_str(row, 6),
}
tunings.append(tuning)
return tunings
# -------------------------------------------------------------------------
# Competitor URLs (Results sheet)
# -------------------------------------------------------------------------
def get_competitor_urls(self) -> list[dict]:
"""Extract competitor URLs from the Results sheet."""
if "Results" not in self.wb.sheetnames:
return []
ws = self.wb["Results"]
rows = list(ws.iter_rows(values_only=True))
if not rows:
return []
headers = rows[0]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
results = []
for row in rows[1:]:
if not row or not row[0]:
continue
result = {
"rank": _safe_int(row, col_map.get("Rank")),
"host": _safe_str(row, col_map.get("Host")),
"url": _safe_str(row, col_map.get("URL")),
"title": _safe_str(row, col_map.get("Link Text")),
"summary": _safe_str(row, col_map.get("Summary")),
}
results.append(result)
return results
# -------------------------------------------------------------------------
# Summary
# -------------------------------------------------------------------------
def get_summary(self) -> dict:
"""Get a high-level summary of the Cora report with all key targets."""
entities = self.get_entities()
lsi = self.get_lsi_keywords()
variations = self.get_variations_list()
tunings = self.get_basic_tunings()
results = self.get_competitor_urls()
density = self.get_density_targets()
content = self.get_content_targets()
wc_dist = self.get_word_count_distribution()
# Find word count goal from tunings
word_count_goal = None
for t in tunings:
if t["factor"] == "Word Count":
word_count_goal = t["goal"]
break
entities_with_deficit = [e for e in entities if e["deficit"] and e["deficit"] > 0]
lsi_with_deficit = [l for l in lsi if l["deficit"] and l["deficit"] > 0]
return {
"search_term": self.get_search_term(),
"site_domain": self.get_site_domain(),
"keyword_variations": variations,
"total_entities": len(entities),
"entities_with_deficit": len(entities_with_deficit),
"total_lsi_keywords": len(lsi),
"lsi_with_deficit": len(lsi_with_deficit),
"word_count_goal": word_count_goal,
"word_count_cluster_target": wc_dist.get("cluster_target"),
"word_count_distribution": wc_dist.get("counts_sorted", []),
"variation_density_avg": density.get("variation_density", {}).get("avg_pct"),
"entity_density_avg": density.get("entity_density", {}).get("avg_pct"),
"lsi_density_avg": density.get("lsi_density", {}).get("avg_pct"),
"distinct_entities_target": content.get("distinct_entities", {}).get("target"),
"competitors_analyzed": len(results),
"tuning_factors": len(tunings),
"optimization_rules": OPTIMIZATION_RULES,
}
# =============================================================================
# Helper functions
# =============================================================================
def _safe_str(row, idx) -> str:
if idx is None or idx >= len(row) or row[idx] is None:
return ""
return str(row[idx]).strip()
def _safe_float(row, idx) -> float | None:
if idx is None or idx >= len(row) or row[idx] is None:
return None
try:
return float(row[idx])
except (ValueError, TypeError):
return None
def _safe_int(row, idx) -> int | None:
if idx is None or idx >= len(row) or row[idx] is None:
return None
try:
return int(float(row[idx]))
except (ValueError, TypeError):
return None
def _find_site_col_idx(headers) -> int | None:
"""Find site column by looking for domain pattern in header values."""
for j, h in enumerate(headers):
if h and isinstance(h, str):
h_str = h.strip()
if re.search(r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,}', h_str):
# Skip known non-site headers
if h_str in ("Best of Both", "LSI Keyword"):
continue
return j
return None
def _find_cluster_target(counts: list[int]) -> int:
"""Find the nearest competitive cluster target for word count.
Strategy: Don't use the raw average (skewed by outliers).
Instead, find clusters of 3+ competitors within 30% of each other
and target slightly above the nearest cluster's center.
"""
if not counts:
return 0
if len(counts) <= 3:
return math.ceil(max(counts) * 1.05)
# Simple clustering: find the densest grouping
best_cluster = []
for i in range(len(counts)):
cluster = [counts[i]]
for j in range(i + 1, len(counts)):
# Within 40% range of the cluster start
if counts[j] <= counts[i] * 1.4:
cluster.append(counts[j])
else:
break
if len(cluster) >= len(best_cluster):
best_cluster = cluster
if best_cluster:
cluster_avg = sum(best_cluster) / len(best_cluster)
# Target slightly above the cluster average
return math.ceil(cluster_avg * 1.05)
# Fallback: median + 5%
median = counts[len(counts) // 2]
return math.ceil(median * 1.05)
# =============================================================================
# Output formatting
# =============================================================================
def format_text(data, label: str = "") -> str:
    """Format extracted data as human-readable text.

    Dicts render one "key: value" line each (long lists summarized by size,
    nested dicts indented); lists render as numbered items.
    """
    out = []
    if label:
        out.append(f"=== {label} ===")
        out.append("")
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list) and len(value) > 5:
                # Long lists are summarized rather than dumped in full.
                out.append(f" {key}: [{len(value)} items]")
            elif isinstance(value, dict):
                out.append(f" {key}:")
                out.extend(f" {k2}: {v2}" for k2, v2 in value.items())
            else:
                out.append(f" {key}: {value}")
    elif isinstance(data, list):
        for pos, item in enumerate(data, start=1):
            if isinstance(item, dict):
                out.append(f" [{pos}]")
                out.extend(f" {k2}: {v2}" for k2, v2 in item.items())
            else:
                out.append(f" [{pos}] {item}")
    out.append("")
    return "\n".join(out)
# =============================================================================
# CLI
# =============================================================================
def main():
    """CLI entry point: parse a Cora report and print the requested sections."""
    parser = argparse.ArgumentParser(description="Parse a Cora SEO XLSX report")
    parser.add_argument("xlsx_path", help="Path to the Cora XLSX file")
    parser.add_argument(
        "--sheet",
        choices=[
            "entities", "lsi", "variations", "results", "tunings",
            "structure", "densities", "targets", "wordcount", "summary", "all",
        ],
        default="summary",
        help="Which data to extract (default: summary)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=0,
        help="Limit output to top N results (0 = all)",
    )
    args = parser.parse_args()
    report = CoraReport(args.xlsx_path)
    # Map each --sheet choice to a (display label, extractor) pair.
    # All entries are bound methods; the "variations" entry previously wrapped
    # its method in a needless lambda, inconsistent with the other nine.
    extractors = {
        "entities": ("Entities", report.get_entities),
        "lsi": ("LSI Keywords", report.get_lsi_keywords),
        "variations": ("Keyword Variations", report.get_keyword_variations),
        "results": ("Competitor URLs", report.get_competitor_urls),
        "tunings": ("Basic Tunings", report.get_basic_tunings),
        "structure": ("Structure Targets", report.get_structure_targets),
        "densities": ("Density Targets", report.get_density_targets),
        "targets": ("Content Targets", report.get_content_targets),
        "wordcount": ("Word Count Distribution", report.get_word_count_distribution),
        "summary": ("Summary", report.get_summary),
    }
    # "all" expands to the high-level target sections, not every extractor.
    if args.sheet == "all":
        sheets_to_show = ["summary", "structure", "densities", "targets", "wordcount"]
    else:
        sheets_to_show = [args.sheet]
    for sheet_key in sheets_to_show:
        label, extractor = extractors[sheet_key]
        data = extractor()
        if args.top_n > 0 and isinstance(data, list):
            data = data[:args.top_n]
        if args.format == "json":
            # default=str guards against non-JSON-serializable cell values.
            print(json.dumps(data, indent=2, default=str))
        else:
            print(format_text(data, label))


if __name__ == "__main__":
    main()