# CheddahBot/.claude/skills/content-researcher/scripts/cora_parser.py

"""
Cora SEO Report Parser
Reads a Cora XLSX file and extracts structured data from relevant sheets.
Used as a foundation module by entity_optimizer, lsi_optimizer, and seo_optimizer.
Usage:
uv run --with openpyxl python cora_parser.py <xlsx_path> [--sheet SHEET] [--format FORMAT]
Options:
--sheet Which data to extract: entities, lsi, variations, results, tunings,
structure, densities, targets, summary, all (default: summary)
--format Output format: json, text (default: text)
"""
# Standard-library imports.
import argparse
import json
import math
import re
import sys
from pathlib import Path

# openpyxl is the only third-party dependency; fail fast with an actionable
# install hint instead of a bare ImportError traceback.
try:
    import openpyxl
except ImportError:
    print("Error: openpyxl is required. Install with: uv add openpyxl", file=sys.stderr)
    sys.exit(1)
# =============================================================================
# Optimization Rules
#
# Hard-wired overrides that apply regardless of what Cora data says.
# These encode expert SEO knowledge and practical constraints.
# =============================================================================
OPTIMIZATION_RULES = {
    # Heading rules
    "h1_max": 1,  # Never more than 1 H1
    "h1_min": 1,  # Always have exactly 1 H1
    "optimize_headings": ["h1", "h2", "h3"],  # Primary optimization targets
    "low_priority_headings": ["h4"],  # Only add if most competitors have them
    "ignore_headings": ["h5", "h6"],  # Skip entirely
    # Keyword density
    "exact_match_density_min": 0.02,  # 2% minimum for exact match keyword
    "no_keyword_stuffing_limit": True,  # Do NOT flag for keyword stuffing
    # Variations capture exact match, so hitting variation density covers it
    # Word count strategy
    "word_count_strategy": "cluster",  # "cluster" = nearest competitive cluster, not raw average
    "word_count_acceptable_max": 1500,  # Up to 1500 is always acceptable even if target is lower
    # Density awareness
    "density_interdependent": True,  # Adding content changes all density calculations
    # Entity / LSI filtering
    "exclude_competitor_entities": True,  # Never use competitor company names as entities or LSI
    "exclude_measurement_entities": True,  # Ignore measurements (dimensions, tolerances) as entities
    "allow_organization_entities": True,  # Organizations like ISO, ANSI, etc. are OK
    "never_mention_competitors": True,  # Never mention competitors by name in content
    # Entity correlation threshold
    # Best of Both = lower of Spearman's or Pearson's correlation.
    # Measures correlation to ranking position (1=top, 100=bottom), so negative = better ranking.
    # Only include entities with Best of Both <= this value.
    # Set to None to disable filtering.
    "entity_correlation_threshold": -0.19,
}
class CoraReport:
    """Parses a Cora SEO XLSX report and provides structured access to its data."""

    def __init__(self, xlsx_path: str):
        # Validate the path up front so callers get a clear error before
        # openpyxl is ever invoked.
        self.path = Path(xlsx_path)
        if not self.path.exists():
            raise FileNotFoundError(f"XLSX file not found: {xlsx_path}")
        # data_only=True reads cached formula results rather than formula text.
        self.wb = openpyxl.load_workbook(str(self.path), data_only=True)
        self._site_domain = None  # Cached after first detection
# -------------------------------------------------------------------------
# Core metadata
# -------------------------------------------------------------------------
    def get_sheet_names(self) -> list[str]:
        """Return the names of all worksheets in the workbook."""
        return self.wb.sheetnames
def get_search_term(self) -> str:
"""Extract the target keyword from the report."""
for sheet_name in ["Basic Tunings", "Strategic Overview", "Structure"]:
if sheet_name not in self.wb.sheetnames:
continue
ws = self.wb[sheet_name]
for row in ws.iter_rows(min_row=1, max_row=10, values_only=True):
if row and row[0] == "Search Terms" and len(row) > 1 and row[1]:
return str(row[1])
return ""
def get_variations_list(self) -> list[str]:
"""Extract the keyword variations list from Strategic Overview B10.
These are pipe-delimited inside curly braces:
{cnc screw|cnc screw machining|cnc swiss|...}
"""
if "Strategic Overview" not in self.wb.sheetnames:
return []
ws = self.wb["Strategic Overview"]
rows = list(ws.iter_rows(min_row=1, max_row=12, values_only=True))
for row in rows:
if row and row[0] == "Keywords" and len(row) > 1 and row[1]:
raw = str(row[1]).strip()
# Remove curly braces and split on pipe
raw = raw.strip("{}")
return [v.strip() for v in raw.split("|") if v.strip()]
return []
def get_site_domain(self) -> str:
"""Detect the user's site domain from the report.
Looks for the domain in the Entities sheet header (column with a .com/.net etc.
that isn't a standard Cora column) or the site column in other sheets.
"""
if self._site_domain:
return self._site_domain
# Try Entities sheet first
if "Entities" in self.wb.sheetnames:
ws = self.wb["Entities"]
rows = list(ws.iter_rows(min_row=1, max_row=5, values_only=True))
for row in rows:
if row and row[0] == "Entity":
for h in row:
if h and isinstance(h, str):
h = h.strip()
if re.match(r'^[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$', h):
self._site_domain = h
return h
# Try LSI Keywords sheet — header like "#40.7 hoggeprecision.com"
if "LSI Keywords" in self.wb.sheetnames:
ws = self.wb["LSI Keywords"]
rows = list(ws.iter_rows(min_row=1, max_row=10, values_only=True))
for row in rows:
if row and row[0] == "LSI Keyword":
for h in row:
if h and isinstance(h, str):
match = re.search(r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})', h.strip())
if match:
self._site_domain = match.group(1)
return self._site_domain
return ""
# -------------------------------------------------------------------------
# Entities
# -------------------------------------------------------------------------
def get_entities(self) -> list[dict]:
"""Extract entities from the Entities sheet.
Returns list of dicts with: name, freebase_id, wikidata_id, wiki_link,
relevance, confidence, type, correlation, current_count, max_count, deficit
"""
if "Entities" not in self.wb.sheetnames:
return []
ws = self.wb["Entities"]
rows = list(ws.iter_rows(values_only=True))
# Find header row containing "Entity", "Freebase ID", etc.
header_idx = None
for i, row in enumerate(rows):
if row and row[0] == "Entity" and len(row) > 1 and row[1] == "Freebase ID":
header_idx = i
break
if header_idx is None:
return []
headers = rows[header_idx]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
# Find the site-specific column (domain name like "hoggeprecision.com")
site_col_idx = None
site_domain = self.get_site_domain()
if site_domain:
site_col_idx = col_map.get(site_domain)
entities = []
for row in rows[header_idx + 1:]:
if not row or not row[0]:
continue
name = str(row[0]).strip()
if not name:
continue
# Skip rows that look like metadata (e.g., "critical values: ...")
if name.startswith("critical") or name.startswith("http"):
continue
correlation = _safe_float(row, col_map.get("Best of Both"))
# Filter by Best of Both correlation threshold.
# Lower (more negative) = stronger ranking signal (correlates with
# position 1 vs 100). Only keep entities at or below the threshold.
threshold = OPTIMIZATION_RULES.get("entity_correlation_threshold")
if threshold is not None and (correlation is None or correlation > threshold):
continue
entity = {
"name": name,
"freebase_id": _safe_str(row, col_map.get("Freebase ID")),
"wikidata_id": _safe_str(row, col_map.get("Wikidata ID")),
"wiki_link": _safe_str(row, col_map.get("Wiki Link")),
"relevance": _safe_float(row, col_map.get("Relevance")),
"confidence": _safe_float(row, col_map.get("Confidence")),
"type": _safe_str(row, col_map.get("Type")),
"correlation": correlation,
"current_count": _safe_int(row, site_col_idx),
"max_count": _safe_int(row, col_map.get("Max")),
"deficit": _safe_int(row, col_map.get("Deficit")),
}
entities.append(entity)
return entities
# -------------------------------------------------------------------------
# LSI Keywords
# -------------------------------------------------------------------------
def get_lsi_keywords(self) -> list[dict]:
"""Extract LSI keywords from the LSI Keywords sheet.
Returns list of dicts with: keyword, spearmans, pearsons, best_of_both,
pages, max, avg, current_count, deficit
"""
if "LSI Keywords" not in self.wb.sheetnames:
return []
ws = self.wb["LSI Keywords"]
rows = list(ws.iter_rows(values_only=True))
# Find header row containing "LSI Keyword", "Spearmans", etc.
header_idx = None
for i, row in enumerate(rows):
if row and row[0] == "LSI Keyword":
header_idx = i
break
if header_idx is None:
return []
headers = rows[header_idx]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
# Find site column — pattern like "#40.7 hoggeprecision.com"
site_col_idx = None
site_domain = self.get_site_domain()
if site_domain:
for j, h in enumerate(headers):
if h and isinstance(h, str) and site_domain in h:
site_col_idx = j
break
if site_col_idx is None:
site_col_idx = _find_site_col_idx(headers)
lsi_keywords = []
for row in rows[header_idx + 1:]:
if not row or not row[0]:
continue
keyword = str(row[0]).strip()
if not keyword:
continue
lsi = {
"keyword": keyword,
"spearmans": _safe_float(row, col_map.get("Spearmans")),
"pearsons": _safe_float(row, col_map.get("Pearsons")),
"best_of_both": _safe_float(row, col_map.get("Best of Both")),
"pages": _safe_int(row, col_map.get("Pages")),
"max": _safe_int(row, col_map.get("Max")),
"avg": _safe_float(row, col_map.get("Avg")),
"current_count": _safe_int(row, site_col_idx),
"deficit": _safe_float(row, col_map.get("Deficit")),
}
lsi_keywords.append(lsi)
return lsi_keywords
# -------------------------------------------------------------------------
# Keyword Variations
# -------------------------------------------------------------------------
def get_keyword_variations(self) -> list[dict]:
"""Extract keyword variation counts from the Variations sheet.
Returns list of dicts with: variation, page1_max, page1_avg
"""
if "Variations" not in self.wb.sheetnames:
return []
ws = self.wb["Variations"]
rows = list(ws.iter_rows(values_only=True))
if not rows or len(rows) < 3:
return []
header_row = rows[0]
# Find where variation columns start (after "# used" column)
var_start = 3 # default
for j, h in enumerate(header_row):
if h and str(h).strip() == "# used":
var_start = j + 1
break
max_row = rows[1] if len(rows) > 1 else None
avg_row = rows[2] if len(rows) > 2 else None
variations = []
for j in range(var_start, len(header_row)):
name = header_row[j]
if not name:
continue
variation = {
"variation": str(name).strip(),
"page1_max": _safe_int(max_row, j) if max_row else 0,
"page1_avg": _safe_int(avg_row, j) if avg_row else 0,
}
variations.append(variation)
return variations
# -------------------------------------------------------------------------
# Structure Targets (per-element targets from Structure sheet)
# -------------------------------------------------------------------------
    def get_structure_targets(self) -> dict:
        """Extract per-element optimization targets from the Structure sheet.

        Returns a dict keyed by element type with sub-targets:
        {
            "title_tag": {"exact_match": 0.2, "variations": 1.3, "entities": 5.8, "lsi_words": 10.7},
            "meta_description": {...},
            "all_h_tags": {"count": 20.7, "exact_match": 0.4, "variations": 5.7, "entities": 45.8, "lsi_words": 77.4},
            "h1": {"count": 1.1, "exact_match": 0.1, "variations": 1, "entities": 3.8, "lsi_words": 7.3},
            "h2": {...},
            "h3": {...},
            "h4": {...},
        }
        Page 1 Average values are in column D (index 3).
        """
        if "Structure" not in self.wb.sheetnames:
            return {}
        ws = self.wb["Structure"]
        rows = list(ws.iter_rows(values_only=True))
        # Find the header row with "Factor Name", "Page 1 Avg" etc.
        header_idx = None
        for i, row in enumerate(rows):
            if row and len(row) > 3:
                if row[2] == "Factor Name" or (row[1] == "Factor ID" and row[2] == "Factor Name"):
                    header_idx = i
                    break
                # Also check for the combined "Best of Both Correlation" header
                if row[0] and "Best of Both" in str(row[0]):
                    header_idx = i
                    break
        if header_idx is None:
            return {}
        # Parse factor rows into sections.
        # Section headers: "TITLE TAG", "META DESCRIPTION", "TOTAL FOR ALL H TAGS",
        # "H1 Data", "H2 Data", "H3 Data", "H4 Data", "H5 Data", "H6 Data"
        section_map = {
            "TITLE TAG": "title_tag",
            "META DESCRIPTION": "meta_description",
            "TOTAL FOR ALL H TAGS": "all_h_tags",
            "H1 Data": "h1",
            "H2 Data": "h2",
            "H3 Data": "h3",
            "H4 Data": "h4",
        }
        # Factor-name substrings mapped to output field names (first match wins,
        # in this declaration order, case-insensitive).
        factor_patterns = {
            "Number of": "count",
            "Exact Match": "exact_match",
            "Variation": "variations",
            "Entities": "entities",
            "LSI": "lsi_words",
            "Search Term": "search_terms",
            "Keywords": "keywords",
        }
        targets = {}
        current_section = None
        for row in rows[header_idx + 1:]:
            if not row or len(row) < 4:
                continue
            factor_name = _safe_str(row, 2)
            # Check if this is a section header
            if factor_name in section_map:
                current_section = section_map[factor_name]
                targets[current_section] = {}
                continue
            # Skip sections we don't care about (H5, H6)
            if factor_name in ("H5 Data", "H6 Data"):
                current_section = None
                continue
            if current_section is None:
                continue
            # Get the Page 1 Average (column D, index 3)
            avg_val = _safe_float(row, 3)
            if avg_val is None:
                continue
            # Map factor name to field
            field_name = None
            for pattern, field in factor_patterns.items():
                if pattern.lower() in factor_name.lower():
                    field_name = field
                    break
            if field_name and current_section:
                # Also grab correlation from column A
                correlation = _safe_float(row, 0)
                # Outlier detection: check if one of the top 10 results
                # (columns E-N, indices 4-13) contributes >50% of the sum.
                # If so, exclude it and recompute the average — that outlier
                # is skewing the target.
                top10 = [_safe_float(row, j) or 0 for j in range(4, 14)]
                top10_sum = sum(top10)
                adjusted_avg = avg_val
                outlier_detected = False
                if top10_sum > 0:
                    max_val = max(top10)
                    if max_val > top10_sum * 0.5 and avg_val > 1:
                        # One result is >50% of the total — outlier.
                        # Skip adjustment when avg <= 1: a single "1" among
                        # zeros triggers the rule but the target is already
                        # small enough that adjustment would zero it out.
                        remaining = [v for v in top10 if v != max_val]
                        # If max_val appears multiple times, only remove one.
                        # NOTE(review): this guard is dead code — the filter
                        # above always removes at least one element, so
                        # len(remaining) can never equal len(top10). When the
                        # max value is duplicated, ALL copies are removed,
                        # which contradicts the stated intent — confirm.
                        if len(remaining) == len(top10):
                            remaining = top10[:]
                            remaining.remove(max_val)
                        if remaining:
                            adjusted_avg = sum(remaining) / len(remaining)
                            outlier_detected = True
                # Targets always round up to the next whole occurrence.
                target_val = math.ceil(adjusted_avg)
                entry = {
                    "avg": avg_val,
                    "target": target_val,
                    "correlation": correlation,
                }
                if outlier_detected:
                    entry["outlier_adjusted"] = True
                    entry["original_target"] = math.ceil(avg_val)
                targets[current_section][field_name] = entry
        return targets
# -------------------------------------------------------------------------
# Density Targets (from Strategic Overview rows 46-48)
# -------------------------------------------------------------------------
    def get_density_targets(self) -> dict:
        """Extract density targets from Strategic Overview rows 46-48.

        Row 46: Variation density
        Row 47: Entity density
        Row 48: LSI density
        Column D (index 3) = Page 1 Average.
        Returns per-result values so we can show distribution.
        """
        if "Strategic Overview" not in self.wb.sheetnames:
            return {}
        ws = self.wb["Strategic Overview"]
        rows = list(ws.iter_rows(values_only=True))
        # Find the density rows — they're the last 3 non-empty rows in the data section.
        # Look for them near the row 46-48 area, identified by having floats in col D
        # and being near the bottom of the data.
        # Approach: find the row with "Relevant Density" and the 3 rows after the gap.
        density_area_start = None
        for i, row in enumerate(rows):
            if row and len(row) > 2 and row[2] == "Relevant Density":
                # Density target rows are a few rows below this
                density_area_start = i
                break
        if density_area_start is None:
            return {}
        # The 3 density rows come after a gap. They have NO values in cols A, B, C —
        # only numeric values from col D onward. Row 44 (which has a correlation in
        # col A) is a count row, not a density row, so we skip it.
        density_rows = []
        for i in range(density_area_start + 1, min(density_area_start + 10, len(rows))):
            row = rows[i]
            if not row:
                continue
            col_a = row[0] if len(row) > 0 else None
            col_b = row[1] if len(row) > 1 else None
            col_c = row[2] if len(row) > 2 else None
            col_d = row[3] if len(row) > 3 else None
            # Density rows have None in A, B, C and a float in D
            if col_a is None and col_b is None and col_c is None and col_d is not None:
                try:
                    # float() is used purely as a validity probe here.
                    float(col_d)
                    density_rows.append(row)
                except (ValueError, TypeError):
                    pass
        # Get result domains from row 22 area for the site column
        result_start_col = 4  # Results start at col E (index 4)
        result = {}
        # Order matters: the sheet lists variation, entity, then LSI density.
        labels = ["variation_density", "entity_density", "lsi_density"]
        for idx, label in enumerate(labels):
            if idx >= len(density_rows):
                break
            row = density_rows[idx]
            avg = _safe_float(row, 3)
            # Collect per-competitor values (up to 10 page-1 results)
            competitor_vals = []
            for j in range(result_start_col, min(result_start_col + 10, len(row))):
                v = _safe_float(row, j)
                if v is not None:
                    competitor_vals.append(v)
            result[label] = {
                "avg": avg,
                # NOTE(review): an avg of exactly 0.0 also renders "N/A"
                # because 0 is falsy — confirm that is intended.
                "avg_pct": f"{avg * 100:.2f}%" if avg else "N/A",
                "competitor_values": competitor_vals,
            }
        return result
# -------------------------------------------------------------------------
# Content Targets (word count, distinct entities, etc.)
# -------------------------------------------------------------------------
def get_content_targets(self) -> dict:
"""Extract key content-level targets from Strategic Overview.
Includes: word count distribution, distinct entities target, variations in HTML, etc.
"""
if "Strategic Overview" not in self.wb.sheetnames:
return {}
ws = self.wb["Strategic Overview"]
rows = list(ws.iter_rows(values_only=True))
targets = {}
result_start_col = 4
for i, row in enumerate(rows):
if not row or len(row) < 4:
continue
factor_name = _safe_str(row, 2)
factor_id = _safe_str(row, 1)
correlation = _safe_float(row, 0)
avg = _safe_float(row, 3)
if not factor_name or avg is None:
continue
# Key factors we care about
if factor_name == "Number of Distinct Entities Used":
competitor_vals = []
for j in range(result_start_col, min(result_start_col + 10, len(row))):
v = _safe_float(row, j)
if v is not None:
competitor_vals.append(int(v))
targets["distinct_entities"] = {
"factor_id": factor_id,
"avg": avg,
"target": math.ceil(avg),
"correlation": correlation,
"competitor_values": competitor_vals,
}
elif factor_name == "Variations in HTML Tags":
targets["variations_in_html"] = {
"factor_id": factor_id,
"avg": avg,
"target": math.ceil(avg),
"correlation": correlation,
}
elif factor_name == "Entities in the HTML Tag":
targets["entities_in_html"] = {
"factor_id": factor_id,
"avg": avg,
"target": math.ceil(avg),
"correlation": correlation,
}
return targets
def get_word_count_distribution(self) -> dict:
"""Get word count data for competitive cluster analysis.
Returns the clean word count for each competitor from the Keywords sheet,
sorted ascending, plus the Page 1 Average and suggested cluster target.
"""
if "Keywords" not in self.wb.sheetnames:
return {}
ws = self.wb["Keywords"]
rows = list(ws.iter_rows(values_only=True))
if not rows:
return {}
headers = rows[0]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
host_idx = col_map.get("Host")
clean_wc_idx = col_map.get("Clean Word Count")
if host_idx is None or clean_wc_idx is None:
return {}
# Collect word counts for page 1 results (top 10)
competitors = []
for row in rows[1:11]:
if not row or not row[host_idx]:
continue
wc = _safe_int(row, clean_wc_idx)
if wc and wc > 0:
competitors.append({
"host": str(row[host_idx]),
"clean_word_count": wc,
})
if not competitors:
return {}
# Sort by word count
competitors.sort(key=lambda x: x["clean_word_count"])
counts = [c["clean_word_count"] for c in competitors]
# Calculate cluster target
avg = sum(counts) / len(counts)
median = counts[len(counts) // 2]
cluster_target = _find_cluster_target(counts)
return {
"competitors": competitors,
"counts_sorted": counts,
"average": round(avg),
"median": median,
"cluster_target": cluster_target,
"min": counts[0],
"max": counts[-1],
}
# -------------------------------------------------------------------------
# Basic Tunings
# -------------------------------------------------------------------------
def get_basic_tunings(self) -> list[dict]:
"""Extract on-page tuning factors from the Basic Tunings sheet."""
if "Basic Tunings" not in self.wb.sheetnames:
return []
ws = self.wb["Basic Tunings"]
rows = list(ws.iter_rows(values_only=True))
# Find sub-header row with "Factor ID", "Factor"
header_idx = None
for i, row in enumerate(rows):
if row and len(row) > 2 and row[1] == "Factor ID" and row[2] == "Factor":
header_idx = i
break
if header_idx is None:
return []
tunings = []
for row in rows[header_idx + 1:]:
if not row:
continue
factor_id = row[1] if len(row) > 1 else None
if not factor_id or not str(factor_id).strip():
continue
factor_id_str = str(factor_id).strip()
if not re.match(r'^[A-Z]{2,}\d+', factor_id_str):
continue
tuning = {
"factor_id": factor_id_str,
"factor": _safe_str(row, 2),
"current": _safe_str(row, 3),
"goal": _safe_str(row, 4),
"percent": _safe_float(row, 5),
"recommendation": _safe_str(row, 6),
}
tunings.append(tuning)
return tunings
# -------------------------------------------------------------------------
# Competitor URLs (Results sheet)
# -------------------------------------------------------------------------
def get_competitor_urls(self) -> list[dict]:
"""Extract competitor URLs from the Results sheet."""
if "Results" not in self.wb.sheetnames:
return []
ws = self.wb["Results"]
rows = list(ws.iter_rows(values_only=True))
if not rows:
return []
headers = rows[0]
col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}
results = []
for row in rows[1:]:
if not row or not row[0]:
continue
result = {
"rank": _safe_int(row, col_map.get("Rank")),
"host": _safe_str(row, col_map.get("Host")),
"url": _safe_str(row, col_map.get("URL")),
"title": _safe_str(row, col_map.get("Link Text")),
"summary": _safe_str(row, col_map.get("Summary")),
}
results.append(result)
return results
# -------------------------------------------------------------------------
# Summary
# -------------------------------------------------------------------------
def get_summary(self) -> dict:
"""Get a high-level summary of the Cora report with all key targets."""
entities = self.get_entities()
lsi = self.get_lsi_keywords()
variations = self.get_variations_list()
tunings = self.get_basic_tunings()
results = self.get_competitor_urls()
density = self.get_density_targets()
content = self.get_content_targets()
wc_dist = self.get_word_count_distribution()
# Find word count goal from tunings
word_count_goal = None
for t in tunings:
if t["factor"] == "Word Count":
word_count_goal = t["goal"]
break
entities_with_deficit = [e for e in entities if e["deficit"] and e["deficit"] > 0]
lsi_with_deficit = [l for l in lsi if l["deficit"] and l["deficit"] > 0]
return {
"search_term": self.get_search_term(),
"site_domain": self.get_site_domain(),
"keyword_variations": variations,
"total_entities": len(entities),
"entities_with_deficit": len(entities_with_deficit),
"total_lsi_keywords": len(lsi),
"lsi_with_deficit": len(lsi_with_deficit),
"word_count_goal": word_count_goal,
"word_count_cluster_target": wc_dist.get("cluster_target"),
"word_count_distribution": wc_dist.get("counts_sorted", []),
"variation_density_avg": density.get("variation_density", {}).get("avg_pct"),
"entity_density_avg": density.get("entity_density", {}).get("avg_pct"),
"lsi_density_avg": density.get("lsi_density", {}).get("avg_pct"),
"distinct_entities_target": content.get("distinct_entities", {}).get("target"),
"competitors_analyzed": len(results),
"tuning_factors": len(tunings),
"optimization_rules": OPTIMIZATION_RULES,
}
# =============================================================================
# Helper functions
# =============================================================================
def _safe_str(row, idx) -> str:
if idx is None or idx >= len(row) or row[idx] is None:
return ""
return str(row[idx]).strip()
def _safe_float(row, idx) -> float | None:
if idx is None or idx >= len(row) or row[idx] is None:
return None
try:
return float(row[idx])
except (ValueError, TypeError):
return None
def _safe_int(row, idx) -> int | None:
if idx is None or idx >= len(row) or row[idx] is None:
return None
try:
return int(float(row[idx]))
except (ValueError, TypeError):
return None
def _find_site_col_idx(headers) -> int | None:
"""Find site column by looking for domain pattern in header values."""
for j, h in enumerate(headers):
if h and isinstance(h, str):
h_str = h.strip()
if re.search(r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,}', h_str):
# Skip known non-site headers
if h_str in ("Best of Both", "LSI Keyword"):
continue
return j
return None
def _find_cluster_target(counts: list[int]) -> int:
"""Find the nearest competitive cluster target for word count.
Strategy: Don't use the raw average (skewed by outliers).
Instead, find clusters of 3+ competitors within 30% of each other
and target slightly above the nearest cluster's center.
"""
if not counts:
return 0
if len(counts) <= 3:
return math.ceil(max(counts) * 1.05)
# Simple clustering: find the densest grouping
best_cluster = []
for i in range(len(counts)):
cluster = [counts[i]]
for j in range(i + 1, len(counts)):
# Within 40% range of the cluster start
if counts[j] <= counts[i] * 1.4:
cluster.append(counts[j])
else:
break
if len(cluster) >= len(best_cluster):
best_cluster = cluster
if best_cluster:
cluster_avg = sum(best_cluster) / len(best_cluster)
# Target slightly above the cluster average
return math.ceil(cluster_avg * 1.05)
# Fallback: median + 5%
median = counts[len(counts) // 2]
return math.ceil(median * 1.05)
# =============================================================================
# Output formatting
# =============================================================================
def format_text(data, label: str = "") -> str:
    """Format extracted data as human-readable text.

    Dicts render one "key: value" line each (long lists summarized by size,
    nested dicts indented); lists render as numbered items.
    """
    out = []
    if label:
        out.append(f"=== {label} ===")
        out.append("")
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list) and len(value) > 5:
                # Long lists are summarized rather than dumped in full.
                out.append(f" {key}: [{len(value)} items]")
            elif isinstance(value, dict):
                out.append(f" {key}:")
                out.extend(f" {k2}: {v2}" for k2, v2 in value.items())
            else:
                out.append(f" {key}: {value}")
    elif isinstance(data, list):
        for pos, item in enumerate(data, start=1):
            if isinstance(item, dict):
                out.append(f" [{pos}]")
                out.extend(f" {k2}: {v2}" for k2, v2 in item.items())
            else:
                out.append(f" [{pos}] {item}")
    out.append("")
    return "\n".join(out)
# =============================================================================
# CLI
# =============================================================================
def main():
    """CLI entry point: parse a Cora report and print the requested sections."""
    parser = argparse.ArgumentParser(description="Parse a Cora SEO XLSX report")
    parser.add_argument("xlsx_path", help="Path to the Cora XLSX file")
    parser.add_argument(
        "--sheet",
        choices=[
            "entities", "lsi", "variations", "results", "tunings",
            "structure", "densities", "targets", "wordcount", "summary", "all",
        ],
        default="summary",
        help="Which data to extract (default: summary)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=0,
        help="Limit output to top N results (0 = all)",
    )
    args = parser.parse_args()
    report = CoraReport(args.xlsx_path)
    # Map each --sheet choice to a (display label, extractor) pair.
    # All entries are bound methods; the "variations" entry previously wrapped
    # its method in a needless lambda, inconsistent with the other nine.
    extractors = {
        "entities": ("Entities", report.get_entities),
        "lsi": ("LSI Keywords", report.get_lsi_keywords),
        "variations": ("Keyword Variations", report.get_keyword_variations),
        "results": ("Competitor URLs", report.get_competitor_urls),
        "tunings": ("Basic Tunings", report.get_basic_tunings),
        "structure": ("Structure Targets", report.get_structure_targets),
        "densities": ("Density Targets", report.get_density_targets),
        "targets": ("Content Targets", report.get_content_targets),
        "wordcount": ("Word Count Distribution", report.get_word_count_distribution),
        "summary": ("Summary", report.get_summary),
    }
    # "all" expands to the high-level target sections, not every extractor.
    if args.sheet == "all":
        sheets_to_show = ["summary", "structure", "densities", "targets", "wordcount"]
    else:
        sheets_to_show = [args.sheet]
    for sheet_key in sheets_to_show:
        label, extractor = extractors[sheet_key]
        data = extractor()
        if args.top_n > 0 and isinstance(data, list):
            data = data[:args.top_n]
        if args.format == "json":
            # default=str guards against non-JSON-serializable cell values.
            print(json.dumps(data, indent=2, default=str))
        else:
            print(format_text(data, label))


if __name__ == "__main__":
    main()