""" SEO Content Optimizer Checks keyword density and content structure of a draft against Cora targets. Usage: uv run --with openpyxl python seo_optimizer.py [--keyword ] [--cora-xlsx ] [--format json|text] Works standalone for basic checks, or with a Cora XLSX report for keyword-specific targets via cora_parser.CoraReport. """ import argparse import json import re import sys from pathlib import Path # Optional Cora integration — script works without it try: from cora_parser import CoraReport except ImportError: CoraReport = None # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _split_words(text: str) -> list[str]: """Extract words from text (alphabetic sequences).""" return re.findall(r"[a-zA-Z']+", text) def _strip_markdown_headings(text: str) -> str: """Remove markdown heading markers from text for word counting.""" return re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) def _extract_headings(text: str) -> list[dict]: """Extract markdown-style headings with their levels.""" headings = [] for match in re.finditer(r"^(#{1,6})\s+(.+)$", text, re.MULTILINE): level = len(match.group(1)) headings.append({"level": level, "text": match.group(2).strip()}) return headings # --------------------------------------------------------------------------- # SEOOptimizer # --------------------------------------------------------------------------- class SEOOptimizer: """Analyze a content draft for keyword density and structure.""" def __init__(self): self._results = {} # -- public entry point ------------------------------------------------- def analyze( self, draft_path: str, primary_keyword: str | None = None, cora_xlsx_path: str | None = None, ) -> dict: """Run checks on *draft_path* and return an analysis dict.""" path = Path(draft_path) if not path.exists(): raise FileNotFoundError(f"Draft not found: {draft_path}") text = path.read_text(encoding="utf-8") # Optionally load Cora data cora = None if cora_xlsx_path: if CoraReport is None: print( "Warning: cora_parser not available. " "Install openpyxl and ensure cora_parser.py is importable.", file=sys.stderr, ) else: cora = CoraReport(cora_xlsx_path) # Determine keyword list keywords = [] if primary_keyword: keywords.append(primary_keyword) if cora: search_term = cora.get_search_term() if search_term and search_term.lower() not in [k.lower() for k in keywords]: keywords.insert(0, search_term) for var in cora.get_keyword_variations(): v = var["variation"] if v.lower() not in [k.lower() for k in keywords]: keywords.append(v) # If still no keywords but Cora gave a search term, use it if not keywords and cora: st = cora.get_search_term() if st: keywords.append(st) # Word-count target from Cora word_count_target = None if cora: for t in cora.get_basic_tunings(): if t["factor"] == "Word Count": try: word_count_target = int(float(t["goal"])) except (ValueError, TypeError): pass break # Build Cora keyword targets (page1_avg) for comparison cora_keyword_targets = {} if cora: for var in cora.get_keyword_variations(): cora_keyword_targets[var["variation"].lower()] = { "page1_avg": var.get("page1_avg", 0), "page1_max": var.get("page1_max", 0), } # Run checks self._results["content_length"] = self.check_content_length(text, target=word_count_target) self._results["structure"] = self.check_structure(text) self._results["keyword_density"] = self.check_keyword_density( text, keywords=keywords or None, cora_targets=cora_keyword_targets, ) return self._results # -- individual checks -------------------------------------------------- def check_keyword_density( self, text: str, keywords: list[str] | None = None, cora_targets: dict | None = None, ) -> dict: """Return per-keyword density information. Only reports variations that have page1_avg > 0 (competitors actually use them) when Cora targets are available. """ clean_text = _strip_markdown_headings(text).lower() words = _split_words(clean_text) total_words = len(words) if total_words == 0: return {"total_words": 0, "keywords": []} results: list[dict] = [] if keywords: for kw in keywords: kw_lower = kw.lower() # Skip zero-avg variations — competitors don't use them if cora_targets and kw_lower in cora_targets: if cora_targets[kw_lower].get("page1_avg", 0) == 0: continue kw_words = kw_lower.split() if len(kw_words) > 1: pattern = re.compile(r"\b" + re.escape(kw_lower) + r"\b") count = len(pattern.findall(clean_text)) else: count = sum(1 for w in words if w == kw_lower) density = (count / total_words) * 100 if total_words else 0 entry = { "keyword": kw, "count": count, "density_pct": round(density, 2), } # Add Cora target if available if cora_targets and kw_lower in cora_targets: entry["target_avg"] = cora_targets[kw_lower]["page1_avg"] entry["target_max"] = cora_targets[kw_lower]["page1_max"] results.append(entry) else: # Fallback: top frequent words (>= 4 chars) freq: dict[str, int] = {} for w in words: if len(w) >= 4: freq[w] = freq.get(w, 0) + 1 top = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:10] for w, count in top: density = (count / total_words) * 100 results.append({ "keyword": w, "count": count, "density_pct": round(density, 2), }) return {"total_words": total_words, "keywords": results} def check_structure(self, text: str) -> dict: """Analyze heading hierarchy, paragraph count, and list usage.""" headings = _extract_headings(text) # Count headings per level heading_counts = {f"h{i}": 0 for i in range(1, 7)} for h in headings: heading_counts[f"h{h['level']}"] += 1 # Check nesting issues nesting_issues: list[str] = [] if heading_counts["h1"] > 1: nesting_issues.append(f"Multiple H1 tags found ({heading_counts['h1']}); use exactly one.") prev_level = 0 for h in headings: if prev_level > 0 and h["level"] > prev_level + 1: nesting_issues.append( f"Heading skip: H{prev_level} -> H{h['level']} " f"(at \"{h['text'][:40]}...\")" if len(h["text"]) > 40 else f"Heading skip: H{prev_level} -> H{h['level']} " f"(at \"{h['text']}\")" ) prev_level = h["level"] # Paragraphs paragraphs = [] for block in re.split(r"\n\s*\n", text): block = block.strip() if not block: continue if re.match(r"^#{1,6}\s+", block) and "\n" not in block: continue if all(re.match(r"^\s*[-*+]\s|^\s*\d+\.\s", line) for line in block.splitlines() if line.strip()): continue paragraphs.append(block) paragraph_count = len(paragraphs) # List usage unordered_items = len(re.findall(r"^\s*[-*+]\s", text, re.MULTILINE)) ordered_items = len(re.findall(r"^\s*\d+\.\s", text, re.MULTILINE)) return { "heading_counts": heading_counts, "headings": [{"level": h["level"], "text": h["text"]} for h in headings], "nesting_issues": nesting_issues, "paragraph_count": paragraph_count, "unordered_list_items": unordered_items, "ordered_list_items": ordered_items, } def check_content_length(self, text: str, target: int | None = None) -> dict: """Compare word count against an optional target.""" clean = _strip_markdown_headings(text) words = _split_words(clean) word_count = len(words) result: dict = {"word_count": word_count} if target is not None: result["target"] = target result["difference"] = word_count - target if word_count >= target: result["status"] = "meets_target" elif word_count >= target * 0.8: result["status"] = "close" else: result["status"] = "below_target" return result # --------------------------------------------------------------------------- # Text-mode formatting # --------------------------------------------------------------------------- def _format_text_report(results: dict) -> str: """Format analysis results as a human-readable text report.""" lines: list[str] = [] sep = "-" * 60 # 1. Content Stats cl = results.get("content_length", {}) lines.append(sep) lines.append(" CONTENT STATS") lines.append(sep) lines.append(f" Word count: {cl.get('word_count', 0)}") if cl.get("target"): lines.append(f" Target: {cl['target']} ({cl.get('status', '')})") diff = cl.get("difference", 0) sign = "+" if diff >= 0 else "" lines.append(f" Difference: {sign}{diff}") lines.append("") # 2. Structure st = results.get("structure", {}) lines.append(sep) lines.append(" STRUCTURE") lines.append(sep) hc = st.get("heading_counts", {}) for lvl in range(1, 7): count = hc.get(f"h{lvl}", 0) if count > 0: lines.append(f" H{lvl}: {count}") issues = st.get("nesting_issues", []) if issues: lines.append(" Nesting issues:") for issue in issues: lines.append(f" - {issue}") else: lines.append(" Nesting: OK") lines.append("") # 3. Keyword Density (only variations with targets) kd = results.get("keyword_density", {}) kw_list = kd.get("keywords", []) lines.append(sep) lines.append(" KEYWORD DENSITY") lines.append(sep) if kw_list: lines.append(f" {'Variation':<30s} {'Count':>5s} {'Density':>7s} {'Avg':>5s} {'Max':>5s}") lines.append(f" {'-'*30} {'-'*5} {'-'*7} {'-'*5} {'-'*5}") for kw in kw_list: avg_str = str(kw.get("target_avg", "")) if "target_avg" in kw else "" max_str = str(kw.get("target_max", "")) if "target_max" in kw else "" lines.append( f" {kw['keyword']:<30s} " f"{kw['count']:>5d} " f"{kw['density_pct']:>6.2f}% " f"{avg_str:>5s} " f"{max_str:>5s}" ) else: lines.append(" No keywords specified.") lines.append("") lines.append(sep) return "\n".join(lines) # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser( description="Check keyword density and structure of a content draft.", epilog="Example: uv run --with openpyxl python seo_optimizer.py draft.md --cora-xlsx report.xlsx", ) parser.add_argument( "draft_path", help="Path to the content draft (plain text or markdown)", ) parser.add_argument( "--keyword", dest="keyword", default=None, help="Primary keyword to evaluate", ) parser.add_argument( "--cora-xlsx", dest="cora_xlsx", default=None, help="Path to a Cora XLSX report for keyword-specific targets", ) parser.add_argument( "--format", choices=["json", "text"], default="text", help="Output format (default: text)", ) args = parser.parse_args() optimizer = SEOOptimizer() try: results = optimizer.analyze( draft_path=args.draft_path, primary_keyword=args.keyword, cora_xlsx_path=args.cora_xlsx, ) except FileNotFoundError as e: print(f"Error: {e}", file=sys.stderr) sys.exit(1) except Exception as e: print(f"Error during analysis: {e}", file=sys.stderr) sys.exit(1) if args.format == "json": print(json.dumps(results, indent=2, default=str)) else: print(_format_text_report(results)) if __name__ == "__main__": main()