""" Cora SEO Report Parser Reads a Cora XLSX file and extracts structured data from relevant sheets. Used as a foundation module by entity_optimizer, lsi_optimizer, and seo_optimizer. Usage: uv run --with openpyxl python cora_parser.py [--sheet SHEET] [--format FORMAT] Options: --sheet Which data to extract: entities, lsi, variations, results, tunings, structure, densities, targets, summary, all (default: summary) --format Output format: json, text (default: text) """ import argparse import json import math import re import sys from pathlib import Path try: import openpyxl except ImportError: print("Error: openpyxl is required. Install with: uv add openpyxl", file=sys.stderr) sys.exit(1) # ============================================================================= # Optimization Rules # # Hard-wired overrides that apply regardless of what Cora data says. # These encode expert SEO knowledge and practical constraints. # ============================================================================= OPTIMIZATION_RULES = { # Heading rules "h1_max": 1, # Never more than 1 H1 "h1_min": 1, # Always have exactly 1 H1 "optimize_headings": ["h1", "h2", "h3"], # Primary optimization targets "low_priority_headings": ["h4"], # Only add if most competitors have them "ignore_headings": ["h5", "h6"], # Skip entirely # Keyword density "exact_match_density_min": 0.02, # 2% minimum for exact match keyword "no_keyword_stuffing_limit": True, # Do NOT flag for keyword stuffing # Variations capture exact match, so hitting variation density covers it # Word count strategy "word_count_strategy": "cluster", # "cluster" = nearest competitive cluster, not raw average "word_count_acceptable_max": 1500, # Up to 1500 is always acceptable even if target is lower # Density awareness "density_interdependent": True, # Adding content changes all density calculations # Entity / LSI filtering "exclude_competitor_entities": True, # Never use competitor company names as entities or LSI "exclude_measurement_entities": True, # Ignore measurements (dimensions, tolerances) as entities "allow_organization_entities": True, # Organizations like ISO, ANSI, etc. are OK "never_mention_competitors": True, # Never mention competitors by name in content # Entity correlation threshold # Best of Both = lower of Spearman's or Pearson's correlation. # Measures correlation to ranking position (1=top, 100=bottom), so negative = better ranking. # Only include entities with Best of Both <= this value. # Set to None to disable filtering. "entity_correlation_threshold": -0.19, } class CoraReport: """Parses a Cora SEO XLSX report and provides structured access to its data.""" def __init__(self, xlsx_path: str): self.path = Path(xlsx_path) if not self.path.exists(): raise FileNotFoundError(f"XLSX file not found: {xlsx_path}") self.wb = openpyxl.load_workbook(str(self.path), data_only=True) self._site_domain = None # Cached after first detection # ------------------------------------------------------------------------- # Core metadata # ------------------------------------------------------------------------- def get_sheet_names(self) -> list[str]: return self.wb.sheetnames def get_search_term(self) -> str: """Extract the target keyword from the report.""" for sheet_name in ["Basic Tunings", "Strategic Overview", "Structure"]: if sheet_name not in self.wb.sheetnames: continue ws = self.wb[sheet_name] for row in ws.iter_rows(min_row=1, max_row=10, values_only=True): if row and row[0] == "Search Terms" and len(row) > 1 and row[1]: return str(row[1]) return "" def get_variations_list(self) -> list[str]: """Extract the keyword variations list from Strategic Overview B10. These are pipe-delimited inside curly braces: {cnc screw|cnc screw machining|cnc swiss|...} """ if "Strategic Overview" not in self.wb.sheetnames: return [] ws = self.wb["Strategic Overview"] rows = list(ws.iter_rows(min_row=1, max_row=12, values_only=True)) for row in rows: if row and row[0] == "Keywords" and len(row) > 1 and row[1]: raw = str(row[1]).strip() # Remove curly braces and split on pipe raw = raw.strip("{}") return [v.strip() for v in raw.split("|") if v.strip()] return [] def get_site_domain(self) -> str: """Detect the user's site domain from the report. Looks for the domain in the Entities sheet header (column with a .com/.net etc. that isn't a standard Cora column) or the site column in other sheets. """ if self._site_domain: return self._site_domain # Try Entities sheet first if "Entities" in self.wb.sheetnames: ws = self.wb["Entities"] rows = list(ws.iter_rows(min_row=1, max_row=5, values_only=True)) for row in rows: if row and row[0] == "Entity": for h in row: if h and isinstance(h, str): h = h.strip() if re.match(r'^[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$', h): self._site_domain = h return h # Try LSI Keywords sheet — header like "#40.7 hoggeprecision.com" if "LSI Keywords" in self.wb.sheetnames: ws = self.wb["LSI Keywords"] rows = list(ws.iter_rows(min_row=1, max_row=10, values_only=True)) for row in rows: if row and row[0] == "LSI Keyword": for h in row: if h and isinstance(h, str): match = re.search(r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})', h.strip()) if match: self._site_domain = match.group(1) return self._site_domain return "" # ------------------------------------------------------------------------- # Entities # ------------------------------------------------------------------------- def get_entities(self) -> list[dict]: """Extract entities from the Entities sheet. Returns list of dicts with: name, freebase_id, wikidata_id, wiki_link, relevance, confidence, type, correlation, current_count, max_count, deficit """ if "Entities" not in self.wb.sheetnames: return [] ws = self.wb["Entities"] rows = list(ws.iter_rows(values_only=True)) # Find header row containing "Entity", "Freebase ID", etc. header_idx = None for i, row in enumerate(rows): if row and row[0] == "Entity" and len(row) > 1 and row[1] == "Freebase ID": header_idx = i break if header_idx is None: return [] headers = rows[header_idx] col_map = {str(h).strip(): j for j, h in enumerate(headers) if h} # Find the site-specific column (domain name like "hoggeprecision.com") site_col_idx = None site_domain = self.get_site_domain() if site_domain: site_col_idx = col_map.get(site_domain) entities = [] for row in rows[header_idx + 1:]: if not row or not row[0]: continue name = str(row[0]).strip() if not name: continue # Skip rows that look like metadata (e.g., "critical values: ...") if name.startswith("critical") or name.startswith("http"): continue correlation = _safe_float(row, col_map.get("Best of Both")) # Filter by Best of Both correlation threshold. # Lower (more negative) = stronger ranking signal (correlates with # position 1 vs 100). Only keep entities at or below the threshold. threshold = OPTIMIZATION_RULES.get("entity_correlation_threshold") if threshold is not None and (correlation is None or correlation > threshold): continue entity = { "name": name, "freebase_id": _safe_str(row, col_map.get("Freebase ID")), "wikidata_id": _safe_str(row, col_map.get("Wikidata ID")), "wiki_link": _safe_str(row, col_map.get("Wiki Link")), "relevance": _safe_float(row, col_map.get("Relevance")), "confidence": _safe_float(row, col_map.get("Confidence")), "type": _safe_str(row, col_map.get("Type")), "correlation": correlation, "current_count": _safe_int(row, site_col_idx), "max_count": _safe_int(row, col_map.get("Max")), "deficit": _safe_int(row, col_map.get("Deficit")), } entities.append(entity) return entities # ------------------------------------------------------------------------- # LSI Keywords # ------------------------------------------------------------------------- def get_lsi_keywords(self) -> list[dict]: """Extract LSI keywords from the LSI Keywords sheet. Returns list of dicts with: keyword, spearmans, pearsons, best_of_both, pages, max, avg, current_count, deficit """ if "LSI Keywords" not in self.wb.sheetnames: return [] ws = self.wb["LSI Keywords"] rows = list(ws.iter_rows(values_only=True)) # Find header row containing "LSI Keyword", "Spearmans", etc. header_idx = None for i, row in enumerate(rows): if row and row[0] == "LSI Keyword": header_idx = i break if header_idx is None: return [] headers = rows[header_idx] col_map = {str(h).strip(): j for j, h in enumerate(headers) if h} # Find site column — pattern like "#40.7 hoggeprecision.com" site_col_idx = None site_domain = self.get_site_domain() if site_domain: for j, h in enumerate(headers): if h and isinstance(h, str) and site_domain in h: site_col_idx = j break if site_col_idx is None: site_col_idx = _find_site_col_idx(headers) lsi_keywords = [] for row in rows[header_idx + 1:]: if not row or not row[0]: continue keyword = str(row[0]).strip() if not keyword: continue lsi = { "keyword": keyword, "spearmans": _safe_float(row, col_map.get("Spearmans")), "pearsons": _safe_float(row, col_map.get("Pearsons")), "best_of_both": _safe_float(row, col_map.get("Best of Both")), "pages": _safe_int(row, col_map.get("Pages")), "max": _safe_int(row, col_map.get("Max")), "avg": _safe_float(row, col_map.get("Avg")), "current_count": _safe_int(row, site_col_idx), "deficit": _safe_float(row, col_map.get("Deficit")), } lsi_keywords.append(lsi) return lsi_keywords # ------------------------------------------------------------------------- # Keyword Variations # ------------------------------------------------------------------------- def get_keyword_variations(self) -> list[dict]: """Extract keyword variation counts from the Variations sheet. Returns list of dicts with: variation, page1_max, page1_avg """ if "Variations" not in self.wb.sheetnames: return [] ws = self.wb["Variations"] rows = list(ws.iter_rows(values_only=True)) if not rows or len(rows) < 3: return [] header_row = rows[0] # Find where variation columns start (after "# used" column) var_start = 3 # default for j, h in enumerate(header_row): if h and str(h).strip() == "# used": var_start = j + 1 break max_row = rows[1] if len(rows) > 1 else None avg_row = rows[2] if len(rows) > 2 else None variations = [] for j in range(var_start, len(header_row)): name = header_row[j] if not name: continue variation = { "variation": str(name).strip(), "page1_max": _safe_int(max_row, j) if max_row else 0, "page1_avg": _safe_int(avg_row, j) if avg_row else 0, } variations.append(variation) return variations # ------------------------------------------------------------------------- # Structure Targets (per-element targets from Structure sheet) # ------------------------------------------------------------------------- def get_structure_targets(self) -> dict: """Extract per-element optimization targets from the Structure sheet. Returns a dict keyed by element type with sub-targets: { "title_tag": {"exact_match": 0.2, "variations": 1.3, "entities": 5.8, "lsi_words": 10.7}, "meta_description": {...}, "all_h_tags": {"count": 20.7, "exact_match": 0.4, "variations": 5.7, "entities": 45.8, "lsi_words": 77.4}, "h1": {"count": 1.1, "exact_match": 0.1, "variations": 1, "entities": 3.8, "lsi_words": 7.3}, "h2": {...}, "h3": {...}, "h4": {...}, } Page 1 Average values are in column D (index 3). """ if "Structure" not in self.wb.sheetnames: return {} ws = self.wb["Structure"] rows = list(ws.iter_rows(values_only=True)) # Find the header row with "Factor Name", "Page 1 Avg" etc. header_idx = None for i, row in enumerate(rows): if row and len(row) > 3: if row[2] == "Factor Name" or (row[1] == "Factor ID" and row[2] == "Factor Name"): header_idx = i break # Also check for the combined "Best of Both Correlation" header if row[0] and "Best of Both" in str(row[0]): header_idx = i break if header_idx is None: return {} # Parse factor rows into sections # Section headers: "TITLE TAG", "META DESCRIPTION", "TOTAL FOR ALL H TAGS", # "H1 Data", "H2 Data", "H3 Data", "H4 Data", "H5 Data", "H6 Data" section_map = { "TITLE TAG": "title_tag", "META DESCRIPTION": "meta_description", "TOTAL FOR ALL H TAGS": "all_h_tags", "H1 Data": "h1", "H2 Data": "h2", "H3 Data": "h3", "H4 Data": "h4", } # Factor name patterns to field names factor_patterns = { "Number of": "count", "Exact Match": "exact_match", "Variation": "variations", "Entities": "entities", "LSI": "lsi_words", "Search Term": "search_terms", "Keywords": "keywords", } targets = {} current_section = None for row in rows[header_idx + 1:]: if not row or len(row) < 4: continue factor_name = _safe_str(row, 2) # Check if this is a section header if factor_name in section_map: current_section = section_map[factor_name] targets[current_section] = {} continue # Skip sections we don't care about (H5, H6) if factor_name in ("H5 Data", "H6 Data"): current_section = None continue if current_section is None: continue # Get the Page 1 Average (column D, index 3) avg_val = _safe_float(row, 3) if avg_val is None: continue # Map factor name to field field_name = None for pattern, field in factor_patterns.items(): if pattern.lower() in factor_name.lower(): field_name = field break if field_name and current_section: # Also grab correlation from column A correlation = _safe_float(row, 0) # Outlier detection: check if one of the top 10 results # contributes >50% of the sum. If so, exclude it and # recompute the average — that outlier is skewing the target. top10 = [_safe_float(row, j) or 0 for j in range(4, 14)] top10_sum = sum(top10) adjusted_avg = avg_val outlier_detected = False if top10_sum > 0: max_val = max(top10) if max_val > top10_sum * 0.5 and avg_val > 1: # One result is >50% of the total — outlier. # Skip adjustment when avg <= 1: a single "1" among # zeros triggers the rule but the target is already # small enough that adjustment would zero it out. remaining = [v for v in top10 if v != max_val] # If max_val appears multiple times, only remove one if len(remaining) == len(top10): remaining = top10[:] remaining.remove(max_val) if remaining: adjusted_avg = sum(remaining) / len(remaining) outlier_detected = True target_val = math.ceil(adjusted_avg) entry = { "avg": avg_val, "target": target_val, "correlation": correlation, } if outlier_detected: entry["outlier_adjusted"] = True entry["original_target"] = math.ceil(avg_val) targets[current_section][field_name] = entry return targets # ------------------------------------------------------------------------- # Density Targets (from Strategic Overview rows 46-48) # ------------------------------------------------------------------------- def get_density_targets(self) -> dict: """Extract density targets from Strategic Overview rows 46-48. Row 46: Variation density Row 47: Entity density Row 48: LSI density Column D (index 3) = Page 1 Average. Returns per-result values so we can show distribution. """ if "Strategic Overview" not in self.wb.sheetnames: return {} ws = self.wb["Strategic Overview"] rows = list(ws.iter_rows(values_only=True)) # Find the density rows — they're the last 3 non-empty rows in the data section # Look for them near row 46-48 area, identified by having floats in col D # and being near the bottom of the data # Approach: find the row with "Relevant Density" and the 3 rows after the gap density_area_start = None for i, row in enumerate(rows): if row and len(row) > 2 and row[2] == "Relevant Density": # Density target rows are a few rows below this density_area_start = i break if density_area_start is None: return {} # The 3 density rows come after a gap. They have NO values in cols A, B, C — # only numeric values from col D onward. Row 44 (which has a correlation in # col A) is a count row, not a density row, so we skip it. density_rows = [] for i in range(density_area_start + 1, min(density_area_start + 10, len(rows))): row = rows[i] if not row: continue col_a = row[0] if len(row) > 0 else None col_b = row[1] if len(row) > 1 else None col_c = row[2] if len(row) > 2 else None col_d = row[3] if len(row) > 3 else None # Density rows have None in A, B, C and a float in D if col_a is None and col_b is None and col_c is None and col_d is not None: try: float(col_d) density_rows.append(row) except (ValueError, TypeError): pass # Get result domains from row 22 area for the site column result_start_col = 4 # Results start at col E (index 4) result = {} labels = ["variation_density", "entity_density", "lsi_density"] for idx, label in enumerate(labels): if idx >= len(density_rows): break row = density_rows[idx] avg = _safe_float(row, 3) # Collect per-competitor values competitor_vals = [] for j in range(result_start_col, min(result_start_col + 10, len(row))): v = _safe_float(row, j) if v is not None: competitor_vals.append(v) result[label] = { "avg": avg, "avg_pct": f"{avg * 100:.2f}%" if avg else "N/A", "competitor_values": competitor_vals, } return result # ------------------------------------------------------------------------- # Content Targets (word count, distinct entities, etc.) # ------------------------------------------------------------------------- def get_content_targets(self) -> dict: """Extract key content-level targets from Strategic Overview. Includes: word count distribution, distinct entities target, variations in HTML, etc. """ if "Strategic Overview" not in self.wb.sheetnames: return {} ws = self.wb["Strategic Overview"] rows = list(ws.iter_rows(values_only=True)) targets = {} result_start_col = 4 for i, row in enumerate(rows): if not row or len(row) < 4: continue factor_name = _safe_str(row, 2) factor_id = _safe_str(row, 1) correlation = _safe_float(row, 0) avg = _safe_float(row, 3) if not factor_name or avg is None: continue # Key factors we care about if factor_name == "Number of Distinct Entities Used": competitor_vals = [] for j in range(result_start_col, min(result_start_col + 10, len(row))): v = _safe_float(row, j) if v is not None: competitor_vals.append(int(v)) targets["distinct_entities"] = { "factor_id": factor_id, "avg": avg, "target": math.ceil(avg), "correlation": correlation, "competitor_values": competitor_vals, } elif factor_name == "Variations in HTML Tags": targets["variations_in_html"] = { "factor_id": factor_id, "avg": avg, "target": math.ceil(avg), "correlation": correlation, } elif factor_name == "Entities in the HTML Tag": targets["entities_in_html"] = { "factor_id": factor_id, "avg": avg, "target": math.ceil(avg), "correlation": correlation, } return targets def get_word_count_distribution(self) -> dict: """Get word count data for competitive cluster analysis. Returns the clean word count for each competitor from the Keywords sheet, sorted ascending, plus the Page 1 Average and suggested cluster target. """ if "Keywords" not in self.wb.sheetnames: return {} ws = self.wb["Keywords"] rows = list(ws.iter_rows(values_only=True)) if not rows: return {} headers = rows[0] col_map = {str(h).strip(): j for j, h in enumerate(headers) if h} host_idx = col_map.get("Host") clean_wc_idx = col_map.get("Clean Word Count") if host_idx is None or clean_wc_idx is None: return {} # Collect word counts for page 1 results (top 10) competitors = [] for row in rows[1:11]: if not row or not row[host_idx]: continue wc = _safe_int(row, clean_wc_idx) if wc and wc > 0: competitors.append({ "host": str(row[host_idx]), "clean_word_count": wc, }) if not competitors: return {} # Sort by word count competitors.sort(key=lambda x: x["clean_word_count"]) counts = [c["clean_word_count"] for c in competitors] # Calculate cluster target avg = sum(counts) / len(counts) median = counts[len(counts) // 2] cluster_target = _find_cluster_target(counts) return { "competitors": competitors, "counts_sorted": counts, "average": round(avg), "median": median, "cluster_target": cluster_target, "min": counts[0], "max": counts[-1], } # ------------------------------------------------------------------------- # Basic Tunings # ------------------------------------------------------------------------- def get_basic_tunings(self) -> list[dict]: """Extract on-page tuning factors from the Basic Tunings sheet.""" if "Basic Tunings" not in self.wb.sheetnames: return [] ws = self.wb["Basic Tunings"] rows = list(ws.iter_rows(values_only=True)) # Find sub-header row with "Factor ID", "Factor" header_idx = None for i, row in enumerate(rows): if row and len(row) > 2 and row[1] == "Factor ID" and row[2] == "Factor": header_idx = i break if header_idx is None: return [] tunings = [] for row in rows[header_idx + 1:]: if not row: continue factor_id = row[1] if len(row) > 1 else None if not factor_id or not str(factor_id).strip(): continue factor_id_str = str(factor_id).strip() if not re.match(r'^[A-Z]{2,}\d+', factor_id_str): continue tuning = { "factor_id": factor_id_str, "factor": _safe_str(row, 2), "current": _safe_str(row, 3), "goal": _safe_str(row, 4), "percent": _safe_float(row, 5), "recommendation": _safe_str(row, 6), } tunings.append(tuning) return tunings # ------------------------------------------------------------------------- # Competitor URLs (Results sheet) # ------------------------------------------------------------------------- def get_competitor_urls(self) -> list[dict]: """Extract competitor URLs from the Results sheet.""" if "Results" not in self.wb.sheetnames: return [] ws = self.wb["Results"] rows = list(ws.iter_rows(values_only=True)) if not rows: return [] headers = rows[0] col_map = {str(h).strip(): j for j, h in enumerate(headers) if h} results = [] for row in rows[1:]: if not row or not row[0]: continue result = { "rank": _safe_int(row, col_map.get("Rank")), "host": _safe_str(row, col_map.get("Host")), "url": _safe_str(row, col_map.get("URL")), "title": _safe_str(row, col_map.get("Link Text")), "summary": _safe_str(row, col_map.get("Summary")), } results.append(result) return results # ------------------------------------------------------------------------- # Summary # ------------------------------------------------------------------------- def get_summary(self) -> dict: """Get a high-level summary of the Cora report with all key targets.""" entities = self.get_entities() lsi = self.get_lsi_keywords() variations = self.get_variations_list() tunings = self.get_basic_tunings() results = self.get_competitor_urls() density = self.get_density_targets() content = self.get_content_targets() wc_dist = self.get_word_count_distribution() # Find word count goal from tunings word_count_goal = None for t in tunings: if t["factor"] == "Word Count": word_count_goal = t["goal"] break entities_with_deficit = [e for e in entities if e["deficit"] and e["deficit"] > 0] lsi_with_deficit = [l for l in lsi if l["deficit"] and l["deficit"] > 0] return { "search_term": self.get_search_term(), "site_domain": self.get_site_domain(), "keyword_variations": variations, "total_entities": len(entities), "entities_with_deficit": len(entities_with_deficit), "total_lsi_keywords": len(lsi), "lsi_with_deficit": len(lsi_with_deficit), "word_count_goal": word_count_goal, "word_count_cluster_target": wc_dist.get("cluster_target"), "word_count_distribution": wc_dist.get("counts_sorted", []), "variation_density_avg": density.get("variation_density", {}).get("avg_pct"), "entity_density_avg": density.get("entity_density", {}).get("avg_pct"), "lsi_density_avg": density.get("lsi_density", {}).get("avg_pct"), "distinct_entities_target": content.get("distinct_entities", {}).get("target"), "competitors_analyzed": len(results), "tuning_factors": len(tunings), "optimization_rules": OPTIMIZATION_RULES, } # ============================================================================= # Helper functions # ============================================================================= def _safe_str(row, idx) -> str: if idx is None or idx >= len(row) or row[idx] is None: return "" return str(row[idx]).strip() def _safe_float(row, idx) -> float | None: if idx is None or idx >= len(row) or row[idx] is None: return None try: return float(row[idx]) except (ValueError, TypeError): return None def _safe_int(row, idx) -> int | None: if idx is None or idx >= len(row) or row[idx] is None: return None try: return int(float(row[idx])) except (ValueError, TypeError): return None def _find_site_col_idx(headers) -> int | None: """Find site column by looking for domain pattern in header values.""" for j, h in enumerate(headers): if h and isinstance(h, str): h_str = h.strip() if re.search(r'[a-zA-Z0-9-]+\.[a-zA-Z]{2,}', h_str): # Skip known non-site headers if h_str in ("Best of Both", "LSI Keyword"): continue return j return None def _find_cluster_target(counts: list[int]) -> int: """Find the nearest competitive cluster target for word count. Strategy: Don't use the raw average (skewed by outliers). Instead, find clusters of 3+ competitors within 30% of each other and target slightly above the nearest cluster's center. """ if not counts: return 0 if len(counts) <= 3: return math.ceil(max(counts) * 1.05) # Simple clustering: find the densest grouping best_cluster = [] for i in range(len(counts)): cluster = [counts[i]] for j in range(i + 1, len(counts)): # Within 40% range of the cluster start if counts[j] <= counts[i] * 1.4: cluster.append(counts[j]) else: break if len(cluster) >= len(best_cluster): best_cluster = cluster if best_cluster: cluster_avg = sum(best_cluster) / len(best_cluster) # Target slightly above the cluster average return math.ceil(cluster_avg * 1.05) # Fallback: median + 5% median = counts[len(counts) // 2] return math.ceil(median * 1.05) # ============================================================================= # Output formatting # ============================================================================= def format_text(data, label: str = "") -> str: """Format data as human-readable text.""" lines = [] if label: lines.append(f"=== {label} ===") lines.append("") if isinstance(data, dict): for key, value in data.items(): if isinstance(value, list) and len(value) > 5: lines.append(f" {key}: [{len(value)} items]") elif isinstance(value, dict): lines.append(f" {key}:") for k2, v2 in value.items(): lines.append(f" {k2}: {v2}") else: lines.append(f" {key}: {value}") elif isinstance(data, list): for i, item in enumerate(data): if isinstance(item, dict): lines.append(f" [{i + 1}]") for key, value in item.items(): lines.append(f" {key}: {value}") else: lines.append(f" [{i + 1}] {item}") lines.append("") return "\n".join(lines) # ============================================================================= # CLI # ============================================================================= def main(): parser = argparse.ArgumentParser(description="Parse a Cora SEO XLSX report") parser.add_argument("xlsx_path", help="Path to the Cora XLSX file") parser.add_argument( "--sheet", choices=[ "entities", "lsi", "variations", "results", "tunings", "structure", "densities", "targets", "wordcount", "summary", "all", ], default="summary", help="Which data to extract (default: summary)", ) parser.add_argument( "--format", choices=["json", "text"], default="text", help="Output format (default: text)", ) parser.add_argument( "--top-n", type=int, default=0, help="Limit output to top N results (0 = all)", ) args = parser.parse_args() report = CoraReport(args.xlsx_path) extractors = { "entities": ("Entities", report.get_entities), "lsi": ("LSI Keywords", report.get_lsi_keywords), "variations": ("Keyword Variations", lambda: report.get_keyword_variations()), "results": ("Competitor URLs", report.get_competitor_urls), "tunings": ("Basic Tunings", report.get_basic_tunings), "structure": ("Structure Targets", report.get_structure_targets), "densities": ("Density Targets", report.get_density_targets), "targets": ("Content Targets", report.get_content_targets), "wordcount": ("Word Count Distribution", report.get_word_count_distribution), "summary": ("Summary", report.get_summary), } if args.sheet == "all": sheets_to_show = ["summary", "structure", "densities", "targets", "wordcount"] else: sheets_to_show = [args.sheet] for sheet_key in sheets_to_show: label, extractor = extractors[sheet_key] data = extractor() if args.top_n > 0 and isinstance(data, list): data = data[:args.top_n] if args.format == "json": print(json.dumps(data, indent=2, default=str)) else: print(format_text(data, label)) if __name__ == "__main__": main()