"""
Cora SEO Report Parser

Reads a Cora XLSX file and extracts structured data from relevant sheets.
Used as a foundation module by entity_optimizer, lsi_optimizer, and seo_optimizer.

Usage:
    uv run --with openpyxl python cora_parser.py <xlsx_path> [--sheet SHEET] [--format FORMAT]

Options:
    --sheet   Which data to extract: entities, lsi, variations, results, tunings,
              structure, densities, targets, wordcount, summary, all (default: summary)
    --format  Output format: json, text (default: text)
"""
|
|
|
|
import argparse
import json
import math
import re
import sys
from pathlib import Path

# openpyxl is the only third-party dependency; fail fast with an
# actionable install hint rather than a bare ImportError traceback.
try:
    import openpyxl
except ImportError:
    print("Error: openpyxl is required. Install with: uv add openpyxl", file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
# =============================================================================
|
|
# Optimization Rules
|
|
#
|
|
# Hard-wired overrides that apply regardless of what Cora data says.
|
|
# These encode expert SEO knowledge and practical constraints.
|
|
# =============================================================================
|
|
|
|
# Hard-wired optimization overrides that apply regardless of what the Cora
# data says. These encode expert SEO knowledge and practical constraints;
# downstream optimizers read this dict via get_summary().
OPTIMIZATION_RULES = {
    # Heading rules
    "h1_max": 1,  # Never more than 1 H1
    "h1_min": 1,  # Always have exactly 1 H1
    "optimize_headings": ["h1", "h2", "h3"],  # Primary optimization targets
    "low_priority_headings": ["h4"],  # Only add if most competitors have them
    "ignore_headings": ["h5", "h6"],  # Skip entirely

    # Keyword density
    "exact_match_density_min": 0.02,  # 2% minimum for exact match keyword
    "no_keyword_stuffing_limit": True,  # Do NOT flag for keyword stuffing
    # Variations capture exact match, so hitting variation density covers it

    # Word count strategy
    "word_count_strategy": "cluster",  # "cluster" = nearest competitive cluster, not raw average
    "word_count_acceptable_max": 1500,  # Up to 1500 is always acceptable even if target is lower

    # Density awareness
    "density_interdependent": True,  # Adding content changes all density calculations

    # Entity / LSI filtering
    "exclude_competitor_entities": True,  # Never use competitor company names as entities or LSI
    "exclude_measurement_entities": True,  # Ignore measurements (dimensions, tolerances) as entities
    "allow_organization_entities": True,  # Organizations like ISO, ANSI, etc. are OK
    "never_mention_competitors": True,  # Never mention competitors by name in content
}
|
|
|
|
|
|
class CoraReport:
    """Parses a Cora SEO XLSX report and provides structured access to its data.

    The parser is layout-driven: each getter locates its data by searching for
    Cora's known header labels ("Entity", "LSI Keyword", "Factor Name", ...)
    rather than assuming fixed row positions, so minor row shifts between
    report versions are tolerated. All cell access goes through the module's
    _safe_* helpers so ragged rows never raise IndexError.
    """

    def __init__(self, xlsx_path: str):
        """Open the workbook at *xlsx_path*.

        Raises:
            FileNotFoundError: if the path does not exist.
        """
        self.path = Path(xlsx_path)
        if not self.path.exists():
            raise FileNotFoundError(f"XLSX file not found: {xlsx_path}")
        # data_only=True returns cached formula results instead of formulas.
        self.wb = openpyxl.load_workbook(str(self.path), data_only=True)
        self._site_domain = None  # Cached after first detection

    # -------------------------------------------------------------------------
    # Core metadata
    # -------------------------------------------------------------------------

    def get_sheet_names(self) -> list[str]:
        """Return the workbook's sheet names in order."""
        return self.wb.sheetnames

    def get_search_term(self) -> str:
        """Extract the target keyword from the report.

        Scans the first 10 rows of several candidate sheets for a
        "Search Terms" label in column A and returns the adjacent value.
        Returns "" if not found.
        """
        for sheet_name in ["Basic Tunings", "Strategic Overview", "Structure"]:
            if sheet_name not in self.wb.sheetnames:
                continue
            ws = self.wb[sheet_name]
            for row in ws.iter_rows(min_row=1, max_row=10, values_only=True):
                if row and row[0] == "Search Terms" and len(row) > 1 and row[1]:
                    return str(row[1])
        return ""

    def get_variations_list(self) -> list[str]:
        """Extract the keyword variations list from Strategic Overview B10.

        These are pipe-delimited inside curly braces:
            {cnc screw|cnc screw machining|cnc swiss|...}

        Returns [] if the sheet or the "Keywords" row is missing.
        """
        if "Strategic Overview" not in self.wb.sheetnames:
            return []

        ws = self.wb["Strategic Overview"]
        rows = list(ws.iter_rows(min_row=1, max_row=12, values_only=True))

        for row in rows:
            if row and row[0] == "Keywords" and len(row) > 1 and row[1]:
                raw = str(row[1]).strip()
                # Remove curly braces and split on pipe
                raw = raw.strip("{}")
                return [v.strip() for v in raw.split("|") if v.strip()]
        return []

    def get_site_domain(self) -> str:
        """Detect the user's site domain from the report.

        Looks for the domain in the Entities sheet header (column with a .com/.net etc.
        that isn't a standard Cora column) or the site column in other sheets.
        The result is cached on the instance; returns "" if no domain is found.
        """
        if self._site_domain:
            return self._site_domain

        # Try Entities sheet first — the header row starting with "Entity"
        # contains a bare domain-looking column for the user's site.
        if "Entities" in self.wb.sheetnames:
            ws = self.wb["Entities"]
            rows = list(ws.iter_rows(min_row=1, max_row=5, values_only=True))
            for row in rows:
                if row and row[0] == "Entity":
                    for h in row:
                        if h and isinstance(h, str):
                            h = h.strip()
                            # Whole-cell match: the cell must BE a domain,
                            # not merely contain one.
                            if re.match(r'^[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$', h):
                                self._site_domain = h
                                return h

        # Try LSI Keywords sheet — header like "#40.7 hoggeprecision.com",
        # so here we search for an embedded domain instead.
        if "LSI Keywords" in self.wb.sheetnames:
            ws = self.wb["LSI Keywords"]
            rows = list(ws.iter_rows(min_row=1, max_row=10, values_only=True))
            for row in rows:
                if row and row[0] == "LSI Keyword":
                    for h in row:
                        if h and isinstance(h, str):
                            match = re.search(r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})', h.strip())
                            if match:
                                self._site_domain = match.group(1)
                                return self._site_domain

        return ""

    # -------------------------------------------------------------------------
    # Entities
    # -------------------------------------------------------------------------

    def get_entities(self) -> list[dict]:
        """Extract entities from the Entities sheet.

        Returns list of dicts with: name, freebase_id, wikidata_id, wiki_link,
        relevance, confidence, type, correlation, current_count, max_count, deficit.
        Numeric fields may be None when the cell is empty or non-numeric.
        """
        if "Entities" not in self.wb.sheetnames:
            return []

        ws = self.wb["Entities"]
        rows = list(ws.iter_rows(values_only=True))

        # Find header row containing "Entity", "Freebase ID", etc.
        header_idx = None
        for i, row in enumerate(rows):
            if row and row[0] == "Entity" and len(row) > 1 and row[1] == "Freebase ID":
                header_idx = i
                break

        if header_idx is None:
            return []

        headers = rows[header_idx]
        # Header label -> column index for positional cell access below.
        col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}

        # Find the site-specific column (domain name like "hoggeprecision.com")
        site_col_idx = None
        site_domain = self.get_site_domain()
        if site_domain:
            site_col_idx = col_map.get(site_domain)

        entities = []
        for row in rows[header_idx + 1:]:
            if not row or not row[0]:
                continue

            name = str(row[0]).strip()
            if not name:
                continue

            # Skip rows that look like metadata (e.g., "critical values: ...")
            if name.startswith("critical") or name.startswith("http"):
                continue

            entity = {
                "name": name,
                "freebase_id": _safe_str(row, col_map.get("Freebase ID")),
                "wikidata_id": _safe_str(row, col_map.get("Wikidata ID")),
                "wiki_link": _safe_str(row, col_map.get("Wiki Link")),
                "relevance": _safe_float(row, col_map.get("Relevance")),
                "confidence": _safe_float(row, col_map.get("Confidence")),
                "type": _safe_str(row, col_map.get("Type")),
                "correlation": _safe_float(row, col_map.get("Best of Both")),
                "current_count": _safe_int(row, site_col_idx),
                "max_count": _safe_int(row, col_map.get("Max")),
                "deficit": _safe_int(row, col_map.get("Deficit")),
            }
            entities.append(entity)

        return entities

    # -------------------------------------------------------------------------
    # LSI Keywords
    # -------------------------------------------------------------------------

    def get_lsi_keywords(self) -> list[dict]:
        """Extract LSI keywords from the LSI Keywords sheet.

        Returns list of dicts with: keyword, spearmans, pearsons, best_of_both,
        pages, max, avg, current_count, deficit.
        """
        if "LSI Keywords" not in self.wb.sheetnames:
            return []

        ws = self.wb["LSI Keywords"]
        rows = list(ws.iter_rows(values_only=True))

        # Find header row containing "LSI Keyword", "Spearmans", etc.
        header_idx = None
        for i, row in enumerate(rows):
            if row and row[0] == "LSI Keyword":
                header_idx = i
                break

        if header_idx is None:
            return []

        headers = rows[header_idx]
        col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}

        # Find site column — pattern like "#40.7 hoggeprecision.com".
        # The domain is embedded in the header, so substring-match first,
        # then fall back to a generic domain-pattern scan.
        site_col_idx = None
        site_domain = self.get_site_domain()
        if site_domain:
            for j, h in enumerate(headers):
                if h and isinstance(h, str) and site_domain in h:
                    site_col_idx = j
                    break
        if site_col_idx is None:
            site_col_idx = _find_site_col_idx(headers)

        lsi_keywords = []
        for row in rows[header_idx + 1:]:
            if not row or not row[0]:
                continue

            keyword = str(row[0]).strip()
            if not keyword:
                continue

            lsi = {
                "keyword": keyword,
                "spearmans": _safe_float(row, col_map.get("Spearmans")),
                "pearsons": _safe_float(row, col_map.get("Pearsons")),
                "best_of_both": _safe_float(row, col_map.get("Best of Both")),
                "pages": _safe_int(row, col_map.get("Pages")),
                "max": _safe_int(row, col_map.get("Max")),
                "avg": _safe_float(row, col_map.get("Avg")),
                "current_count": _safe_int(row, site_col_idx),
                "deficit": _safe_float(row, col_map.get("Deficit")),
            }
            lsi_keywords.append(lsi)

        return lsi_keywords

    # -------------------------------------------------------------------------
    # Keyword Variations
    # -------------------------------------------------------------------------

    def get_keyword_variations(self) -> list[dict]:
        """Extract keyword variation counts from the Variations sheet.

        The sheet is transposed: variations are COLUMNS, and rows 2 and 3
        hold the page-1 max and average counts for each.

        Returns list of dicts with: variation, page1_max, page1_avg
        """
        if "Variations" not in self.wb.sheetnames:
            return []

        ws = self.wb["Variations"]
        rows = list(ws.iter_rows(values_only=True))

        # Need at least header + max + avg rows.
        if not rows or len(rows) < 3:
            return []

        header_row = rows[0]

        # Find where variation columns start (after "# used" column)
        var_start = 3  # default
        for j, h in enumerate(header_row):
            if h and str(h).strip() == "# used":
                var_start = j + 1
                break

        max_row = rows[1] if len(rows) > 1 else None
        avg_row = rows[2] if len(rows) > 2 else None

        variations = []
        for j in range(var_start, len(header_row)):
            name = header_row[j]
            if not name:
                continue

            variation = {
                "variation": str(name).strip(),
                "page1_max": _safe_int(max_row, j) if max_row else 0,
                "page1_avg": _safe_int(avg_row, j) if avg_row else 0,
            }
            variations.append(variation)

        return variations

    # -------------------------------------------------------------------------
    # Structure Targets (per-element targets from Structure sheet)
    # -------------------------------------------------------------------------

    def get_structure_targets(self) -> dict:
        """Extract per-element optimization targets from the Structure sheet.

        Returns a dict keyed by element type with sub-targets:
        {
            "title_tag": {"exact_match": 0.2, "variations": 1.3, "entities": 5.8, "lsi_words": 10.7},
            "meta_description": {...},
            "all_h_tags": {"count": 20.7, "exact_match": 0.4, "variations": 5.7, "entities": 45.8, "lsi_words": 77.4},
            "h1": {"count": 1.1, "exact_match": 0.1, "variations": 1, "entities": 3.8, "lsi_words": 7.3},
            "h2": {...},
            "h3": {...},
            "h4": {...},
        }
        Page 1 Average values are in column D (index 3). Each leaf is a dict
        with "avg", "target" (ceil of avg, possibly outlier-adjusted) and
        "correlation"; outlier-adjusted entries also carry
        "outlier_adjusted" and "original_target".
        """
        if "Structure" not in self.wb.sheetnames:
            return {}

        ws = self.wb["Structure"]
        rows = list(ws.iter_rows(values_only=True))

        # Find the header row with "Factor Name", "Page 1 Avg" etc.
        header_idx = None
        for i, row in enumerate(rows):
            if row and len(row) > 3:
                if row[2] == "Factor Name" or (row[1] == "Factor ID" and row[2] == "Factor Name"):
                    header_idx = i
                    break
                # Also check for the combined "Best of Both Correlation" header
                if row[0] and "Best of Both" in str(row[0]):
                    header_idx = i
                    break

        if header_idx is None:
            return {}

        # Parse factor rows into sections
        # Section headers: "TITLE TAG", "META DESCRIPTION", "TOTAL FOR ALL H TAGS",
        # "H1 Data", "H2 Data", "H3 Data", "H4 Data", "H5 Data", "H6 Data"
        section_map = {
            "TITLE TAG": "title_tag",
            "META DESCRIPTION": "meta_description",
            "TOTAL FOR ALL H TAGS": "all_h_tags",
            "H1 Data": "h1",
            "H2 Data": "h2",
            "H3 Data": "h3",
            "H4 Data": "h4",
        }

        # Factor name patterns to field names (substring match, first hit wins)
        factor_patterns = {
            "Number of": "count",
            "Exact Match": "exact_match",
            "Variation": "variations",
            "Entities": "entities",
            "LSI": "lsi_words",
            "Search Term": "search_terms",
            "Keywords": "keywords",
        }

        targets = {}
        current_section = None

        for row in rows[header_idx + 1:]:
            if not row or len(row) < 4:
                continue

            factor_name = _safe_str(row, 2)

            # Check if this is a section header
            if factor_name in section_map:
                current_section = section_map[factor_name]
                targets[current_section] = {}
                continue

            # Skip sections we don't care about (H5, H6) — per
            # OPTIMIZATION_RULES these headings are ignored entirely.
            if factor_name in ("H5 Data", "H6 Data"):
                current_section = None
                continue

            if current_section is None:
                continue

            # Get the Page 1 Average (column D, index 3)
            avg_val = _safe_float(row, 3)
            if avg_val is None:
                continue

            # Map factor name to field
            field_name = None
            for pattern, field in factor_patterns.items():
                if pattern.lower() in factor_name.lower():
                    field_name = field
                    break

            if field_name and current_section:
                # Also grab correlation from column A
                correlation = _safe_float(row, 0)

                # Outlier detection: check if one of the top 10 results
                # contributes >50% of the sum. If so, exclude it and
                # recompute the average — that outlier is skewing the target.
                top10 = [_safe_float(row, j) or 0 for j in range(4, 14)]
                top10_sum = sum(top10)
                adjusted_avg = avg_val
                outlier_detected = False

                if top10_sum > 0:
                    max_val = max(top10)
                    if max_val > top10_sum * 0.5 and avg_val > 1:
                        # One result is >50% of the total — outlier.
                        # Skip adjustment when avg <= 1: a single "1" among
                        # zeros triggers the rule but the target is already
                        # small enough that adjustment would zero it out.
                        remaining = [v for v in top10 if v != max_val]
                        # If max_val appears multiple times, only remove one
                        if len(remaining) == len(top10):
                            remaining = top10[:]
                            remaining.remove(max_val)
                        if remaining:
                            adjusted_avg = sum(remaining) / len(remaining)
                            outlier_detected = True

                target_val = math.ceil(adjusted_avg)

                entry = {
                    "avg": avg_val,
                    "target": target_val,
                    "correlation": correlation,
                }
                if outlier_detected:
                    entry["outlier_adjusted"] = True
                    entry["original_target"] = math.ceil(avg_val)

                targets[current_section][field_name] = entry

        return targets

    # -------------------------------------------------------------------------
    # Density Targets (from Strategic Overview rows 46-48)
    # -------------------------------------------------------------------------

    def get_density_targets(self) -> dict:
        """Extract density targets from Strategic Overview rows 46-48.

        Row 46: Variation density
        Row 47: Entity density
        Row 48: LSI density

        Column D (index 3) = Page 1 Average.
        Returns per-result values so we can show distribution.
        """
        if "Strategic Overview" not in self.wb.sheetnames:
            return {}

        ws = self.wb["Strategic Overview"]
        rows = list(ws.iter_rows(values_only=True))

        # Find the density rows — they're the last 3 non-empty rows in the data section
        # Look for them near row 46-48 area, identified by having floats in col D
        # and being near the bottom of the data
        # Approach: find the row with "Relevant Density" and the 3 rows after the gap
        density_area_start = None
        for i, row in enumerate(rows):
            if row and len(row) > 2 and row[2] == "Relevant Density":
                # Density target rows are a few rows below this
                density_area_start = i
                break

        if density_area_start is None:
            return {}

        # The 3 density rows come after a gap. They have NO values in cols A, B, C —
        # only numeric values from col D onward. Row 44 (which has a correlation in
        # col A) is a count row, not a density row, so we skip it.
        density_rows = []
        for i in range(density_area_start + 1, min(density_area_start + 10, len(rows))):
            row = rows[i]
            if not row:
                continue
            col_a = row[0] if len(row) > 0 else None
            col_b = row[1] if len(row) > 1 else None
            col_c = row[2] if len(row) > 2 else None
            col_d = row[3] if len(row) > 3 else None
            # Density rows have None in A, B, C and a float in D
            if col_a is None and col_b is None and col_c is None and col_d is not None:
                try:
                    float(col_d)
                    density_rows.append(row)
                except (ValueError, TypeError):
                    pass

        # Get result domains from row 22 area for the site column
        result_start_col = 4  # Results start at col E (index 4)

        result = {}
        # Order matters: the sheet lists variation, entity, LSI density in
        # that fixed order.
        labels = ["variation_density", "entity_density", "lsi_density"]

        for idx, label in enumerate(labels):
            if idx >= len(density_rows):
                break
            row = density_rows[idx]
            avg = _safe_float(row, 3)
            # Collect per-competitor values
            competitor_vals = []
            for j in range(result_start_col, min(result_start_col + 10, len(row))):
                v = _safe_float(row, j)
                if v is not None:
                    competitor_vals.append(v)

            result[label] = {
                "avg": avg,
                # Densities are stored as fractions; render as percentage.
                "avg_pct": f"{avg * 100:.2f}%" if avg else "N/A",
                "competitor_values": competitor_vals,
            }

        return result

    # -------------------------------------------------------------------------
    # Content Targets (word count, distinct entities, etc.)
    # -------------------------------------------------------------------------

    def get_content_targets(self) -> dict:
        """Extract key content-level targets from Strategic Overview.

        Includes: word count distribution, distinct entities target, variations in HTML, etc.
        Each target carries factor_id, avg, target (ceil of avg) and correlation.
        """
        if "Strategic Overview" not in self.wb.sheetnames:
            return {}

        ws = self.wb["Strategic Overview"]
        rows = list(ws.iter_rows(values_only=True))

        targets = {}
        result_start_col = 4

        for i, row in enumerate(rows):
            if not row or len(row) < 4:
                continue

            factor_name = _safe_str(row, 2)
            factor_id = _safe_str(row, 1)
            correlation = _safe_float(row, 0)
            avg = _safe_float(row, 3)

            if not factor_name or avg is None:
                continue

            # Key factors we care about
            if factor_name == "Number of Distinct Entities Used":
                competitor_vals = []
                for j in range(result_start_col, min(result_start_col + 10, len(row))):
                    v = _safe_float(row, j)
                    if v is not None:
                        competitor_vals.append(int(v))
                targets["distinct_entities"] = {
                    "factor_id": factor_id,
                    "avg": avg,
                    "target": math.ceil(avg),
                    "correlation": correlation,
                    "competitor_values": competitor_vals,
                }

            elif factor_name == "Variations in HTML Tags":
                targets["variations_in_html"] = {
                    "factor_id": factor_id,
                    "avg": avg,
                    "target": math.ceil(avg),
                    "correlation": correlation,
                }

            elif factor_name == "Entities in the HTML Tag":
                targets["entities_in_html"] = {
                    "factor_id": factor_id,
                    "avg": avg,
                    "target": math.ceil(avg),
                    "correlation": correlation,
                }

        return targets

    def get_word_count_distribution(self) -> dict:
        """Get word count data for competitive cluster analysis.

        Returns the clean word count for each competitor from the Keywords sheet,
        sorted ascending, plus the Page 1 Average and suggested cluster target.
        Returns {} when the sheet or required columns are missing.
        """
        if "Keywords" not in self.wb.sheetnames:
            return {}

        ws = self.wb["Keywords"]
        rows = list(ws.iter_rows(values_only=True))

        if not rows:
            return {}

        headers = rows[0]
        col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}

        host_idx = col_map.get("Host")
        clean_wc_idx = col_map.get("Clean Word Count")

        if host_idx is None or clean_wc_idx is None:
            return {}

        # Collect word counts for page 1 results (top 10)
        competitors = []
        for row in rows[1:11]:
            if not row or not row[host_idx]:
                continue
            wc = _safe_int(row, clean_wc_idx)
            if wc and wc > 0:
                competitors.append({
                    "host": str(row[host_idx]),
                    "clean_word_count": wc,
                })

        if not competitors:
            return {}

        # Sort by word count — _find_cluster_target expects ascending order.
        competitors.sort(key=lambda x: x["clean_word_count"])
        counts = [c["clean_word_count"] for c in competitors]

        # Calculate cluster target
        avg = sum(counts) / len(counts)
        median = counts[len(counts) // 2]
        cluster_target = _find_cluster_target(counts)

        return {
            "competitors": competitors,
            "counts_sorted": counts,
            "average": round(avg),
            "median": median,
            "cluster_target": cluster_target,
            "min": counts[0],
            "max": counts[-1],
        }

    # -------------------------------------------------------------------------
    # Basic Tunings
    # -------------------------------------------------------------------------

    def get_basic_tunings(self) -> list[dict]:
        """Extract on-page tuning factors from the Basic Tunings sheet.

        Only rows whose Factor ID matches the Cora pattern (2+ uppercase
        letters followed by digits, e.g. "WC12") are kept.
        """
        if "Basic Tunings" not in self.wb.sheetnames:
            return []

        ws = self.wb["Basic Tunings"]
        rows = list(ws.iter_rows(values_only=True))

        # Find sub-header row with "Factor ID", "Factor"
        header_idx = None
        for i, row in enumerate(rows):
            if row and len(row) > 2 and row[1] == "Factor ID" and row[2] == "Factor":
                header_idx = i
                break

        if header_idx is None:
            return []

        tunings = []
        for row in rows[header_idx + 1:]:
            if not row:
                continue

            factor_id = row[1] if len(row) > 1 else None
            if not factor_id or not str(factor_id).strip():
                continue

            factor_id_str = str(factor_id).strip()
            # Filter out notes/section rows that don't carry a real factor ID.
            if not re.match(r'^[A-Z]{2,}\d+', factor_id_str):
                continue

            tuning = {
                "factor_id": factor_id_str,
                "factor": _safe_str(row, 2),
                "current": _safe_str(row, 3),
                "goal": _safe_str(row, 4),
                "percent": _safe_float(row, 5),
                "recommendation": _safe_str(row, 6),
            }
            tunings.append(tuning)

        return tunings

    # -------------------------------------------------------------------------
    # Competitor URLs (Results sheet)
    # -------------------------------------------------------------------------

    def get_competitor_urls(self) -> list[dict]:
        """Extract competitor URLs from the Results sheet.

        Returns dicts with rank, host, url, title (Cora's "Link Text")
        and summary.
        """
        if "Results" not in self.wb.sheetnames:
            return []

        ws = self.wb["Results"]
        rows = list(ws.iter_rows(values_only=True))

        if not rows:
            return []

        headers = rows[0]
        col_map = {str(h).strip(): j for j, h in enumerate(headers) if h}

        results = []
        for row in rows[1:]:
            if not row or not row[0]:
                continue

            result = {
                "rank": _safe_int(row, col_map.get("Rank")),
                "host": _safe_str(row, col_map.get("Host")),
                "url": _safe_str(row, col_map.get("URL")),
                "title": _safe_str(row, col_map.get("Link Text")),
                "summary": _safe_str(row, col_map.get("Summary")),
            }
            results.append(result)

        return results

    # -------------------------------------------------------------------------
    # Summary
    # -------------------------------------------------------------------------

    def get_summary(self) -> dict:
        """Get a high-level summary of the Cora report with all key targets.

        Aggregates every other getter into one dict; also embeds the
        module-level OPTIMIZATION_RULES so downstream optimizers receive
        the hard-wired overrides alongside the report-derived targets.
        """
        entities = self.get_entities()
        lsi = self.get_lsi_keywords()
        variations = self.get_variations_list()
        tunings = self.get_basic_tunings()
        results = self.get_competitor_urls()
        density = self.get_density_targets()
        content = self.get_content_targets()
        wc_dist = self.get_word_count_distribution()

        # Find word count goal from tunings
        word_count_goal = None
        for t in tunings:
            if t["factor"] == "Word Count":
                word_count_goal = t["goal"]
                break

        # "deficit" may be None; the truthiness check guards the comparison.
        entities_with_deficit = [e for e in entities if e["deficit"] and e["deficit"] > 0]
        lsi_with_deficit = [l for l in lsi if l["deficit"] and l["deficit"] > 0]

        return {
            "search_term": self.get_search_term(),
            "site_domain": self.get_site_domain(),
            "keyword_variations": variations,
            "total_entities": len(entities),
            "entities_with_deficit": len(entities_with_deficit),
            "total_lsi_keywords": len(lsi),
            "lsi_with_deficit": len(lsi_with_deficit),
            "word_count_goal": word_count_goal,
            "word_count_cluster_target": wc_dist.get("cluster_target"),
            "word_count_distribution": wc_dist.get("counts_sorted", []),
            "variation_density_avg": density.get("variation_density", {}).get("avg_pct"),
            "entity_density_avg": density.get("entity_density", {}).get("avg_pct"),
            "lsi_density_avg": density.get("lsi_density", {}).get("avg_pct"),
            "distinct_entities_target": content.get("distinct_entities", {}).get("target"),
            "competitors_analyzed": len(results),
            "tuning_factors": len(tunings),
            "optimization_rules": OPTIMIZATION_RULES,
        }
|
|
|
|
|
|
# =============================================================================
|
|
# Helper functions
|
|
# =============================================================================
|
|
|
|
def _safe_str(row, idx) -> str:
|
|
if idx is None or idx >= len(row) or row[idx] is None:
|
|
return ""
|
|
return str(row[idx]).strip()
|
|
|
|
|
|
def _safe_float(row, idx) -> float | None:
|
|
if idx is None or idx >= len(row) or row[idx] is None:
|
|
return None
|
|
try:
|
|
return float(row[idx])
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
def _safe_int(row, idx) -> int | None:
|
|
if idx is None or idx >= len(row) or row[idx] is None:
|
|
return None
|
|
try:
|
|
return int(float(row[idx]))
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
def _find_site_col_idx(headers) -> int | None:
|
|
"""Find site column by looking for domain pattern in header values."""
|
|
for j, h in enumerate(headers):
|
|
if h and isinstance(h, str):
|
|
h_str = h.strip()
|
|
if re.search(r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,}', h_str):
|
|
# Skip known non-site headers
|
|
if h_str in ("Best of Both", "LSI Keyword"):
|
|
continue
|
|
return j
|
|
return None
|
|
|
|
|
|
def _find_cluster_target(counts: list[int]) -> int:
|
|
"""Find the nearest competitive cluster target for word count.
|
|
|
|
Strategy: Don't use the raw average (skewed by outliers).
|
|
Instead, find clusters of 3+ competitors within 30% of each other
|
|
and target slightly above the nearest cluster's center.
|
|
"""
|
|
if not counts:
|
|
return 0
|
|
|
|
if len(counts) <= 3:
|
|
return math.ceil(max(counts) * 1.05)
|
|
|
|
# Simple clustering: find the densest grouping
|
|
best_cluster = []
|
|
for i in range(len(counts)):
|
|
cluster = [counts[i]]
|
|
for j in range(i + 1, len(counts)):
|
|
# Within 40% range of the cluster start
|
|
if counts[j] <= counts[i] * 1.4:
|
|
cluster.append(counts[j])
|
|
else:
|
|
break
|
|
if len(cluster) >= len(best_cluster):
|
|
best_cluster = cluster
|
|
|
|
if best_cluster:
|
|
cluster_avg = sum(best_cluster) / len(best_cluster)
|
|
# Target slightly above the cluster average
|
|
return math.ceil(cluster_avg * 1.05)
|
|
|
|
# Fallback: median + 5%
|
|
median = counts[len(counts) // 2]
|
|
return math.ceil(median * 1.05)
|
|
|
|
|
|
# =============================================================================
|
|
# Output formatting
|
|
# =============================================================================
|
|
|
|
def format_text(data, label: str = "") -> str:
|
|
"""Format data as human-readable text."""
|
|
lines = []
|
|
if label:
|
|
lines.append(f"=== {label} ===")
|
|
lines.append("")
|
|
|
|
if isinstance(data, dict):
|
|
for key, value in data.items():
|
|
if isinstance(value, list) and len(value) > 5:
|
|
lines.append(f" {key}: [{len(value)} items]")
|
|
elif isinstance(value, dict):
|
|
lines.append(f" {key}:")
|
|
for k2, v2 in value.items():
|
|
lines.append(f" {k2}: {v2}")
|
|
else:
|
|
lines.append(f" {key}: {value}")
|
|
elif isinstance(data, list):
|
|
for i, item in enumerate(data):
|
|
if isinstance(item, dict):
|
|
lines.append(f" [{i + 1}]")
|
|
for key, value in item.items():
|
|
lines.append(f" {key}: {value}")
|
|
else:
|
|
lines.append(f" [{i + 1}] {item}")
|
|
lines.append("")
|
|
return "\n".join(lines)
|
|
|
|
|
|
# =============================================================================
|
|
# CLI
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: parse a Cora report and print the requested section(s)."""
    cli = argparse.ArgumentParser(description="Parse a Cora SEO XLSX report")
    cli.add_argument("xlsx_path", help="Path to the Cora XLSX file")
    cli.add_argument(
        "--sheet",
        choices=[
            "entities", "lsi", "variations", "results", "tunings",
            "structure", "densities", "targets", "wordcount", "summary", "all",
        ],
        default="summary",
        help="Which data to extract (default: summary)",
    )
    cli.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format (default: text)",
    )
    cli.add_argument(
        "--top-n",
        type=int,
        default=0,
        help="Limit output to top N results (0 = all)",
    )
    args = cli.parse_args()

    report = CoraReport(args.xlsx_path)

    # Map each CLI choice to a display label and its extractor method.
    extractors = {
        "entities": ("Entities", report.get_entities),
        "lsi": ("LSI Keywords", report.get_lsi_keywords),
        "variations": ("Keyword Variations", report.get_keyword_variations),
        "results": ("Competitor URLs", report.get_competitor_urls),
        "tunings": ("Basic Tunings", report.get_basic_tunings),
        "structure": ("Structure Targets", report.get_structure_targets),
        "densities": ("Density Targets", report.get_density_targets),
        "targets": ("Content Targets", report.get_content_targets),
        "wordcount": ("Word Count Distribution", report.get_word_count_distribution),
        "summary": ("Summary", report.get_summary),
    }

    # "all" expands to the target-oriented sections, not every extractor.
    selected = (
        ["summary", "structure", "densities", "targets", "wordcount"]
        if args.sheet == "all"
        else [args.sheet]
    )

    for key in selected:
        label, extract = extractors[key]
        data = extract()

        if args.top_n > 0 and isinstance(data, list):
            data = data[:args.top_n]

        if args.format == "json":
            # default=str so non-JSON-native cell values still serialize.
            print(json.dumps(data, indent=2, default=str))
        else:
            print(format_text(data, label))


if __name__ == "__main__":
    main()
|