# CheddahBot/.claude/skills/content-researcher/scripts/seo_optimizer.py
"""
SEO Content Optimizer
Checks keyword density and content structure of a draft against Cora targets.
Usage:
uv run --with openpyxl python seo_optimizer.py <draft_path>
[--keyword <kw>] [--cora-xlsx <path>] [--format json|text]
Works standalone for basic checks, or with a Cora XLSX report for
keyword-specific targets via cora_parser.CoraReport.
"""
import argparse
import json
import re
import sys
from pathlib import Path
# Optional Cora integration — script works without it
try:
from cora_parser import CoraReport
except ImportError:
CoraReport = None
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _split_words(text: str) -> list[str]:
"""Extract words from text (alphabetic sequences)."""
return re.findall(r"[a-zA-Z']+", text)
def _strip_markdown_headings(text: str) -> str:
"""Remove markdown heading markers from text for word counting."""
return re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
def _extract_headings(text: str) -> list[dict]:
"""Extract markdown-style headings with their levels."""
headings = []
for match in re.finditer(r"^(#{1,6})\s+(.+)$", text, re.MULTILINE):
level = len(match.group(1))
headings.append({"level": level, "text": match.group(2).strip()})
return headings
# ---------------------------------------------------------------------------
# SEOOptimizer
# ---------------------------------------------------------------------------
class SEOOptimizer:
    """Analyze a content draft for keyword density and structure.

    Works standalone for basic checks; when a Cora XLSX report is supplied
    (and ``cora_parser`` is importable), keyword variations and the
    word-count goal from the report are folded into the analysis.
    """

    def __init__(self):
        # Results of the most recent analyze() call (kept for inspection).
        self._results: dict = {}

    # -- public entry point -------------------------------------------------
    def analyze(
        self,
        draft_path: str,
        primary_keyword: str | None = None,
        cora_xlsx_path: str | None = None,
    ) -> dict:
        """Run all checks on *draft_path* and return an analysis dict.

        Args:
            draft_path: Path to the draft (plain text or markdown).
            primary_keyword: Optional keyword to evaluate.
            cora_xlsx_path: Optional Cora XLSX report path for targets.

        Returns:
            Dict with keys ``content_length``, ``structure`` and
            ``keyword_density``.

        Raises:
            FileNotFoundError: If *draft_path* does not exist.
        """
        path = Path(draft_path)
        if not path.exists():
            raise FileNotFoundError(f"Draft not found: {draft_path}")
        text = path.read_text(encoding="utf-8")

        # Optionally load Cora data; degrade gracefully when the optional
        # cora_parser dependency is missing.
        cora = None
        if cora_xlsx_path:
            if CoraReport is None:
                print(
                    "Warning: cora_parser not available. "
                    "Install openpyxl and ensure cora_parser.py is importable.",
                    file=sys.stderr,
                )
            else:
                cora = CoraReport(cora_xlsx_path)

        # Build the keyword list: explicit primary keyword, then the Cora
        # search term promoted to the front, then deduplicated variations.
        # (Case-insensitive dedup throughout.)
        keywords: list[str] = []
        if primary_keyword:
            keywords.append(primary_keyword)
        if cora:
            search_term = cora.get_search_term()
            if search_term and search_term.lower() not in [k.lower() for k in keywords]:
                keywords.insert(0, search_term)
            for var in cora.get_keyword_variations():
                v = var["variation"]
                if v.lower() not in [k.lower() for k in keywords]:
                    keywords.append(v)
        # NOTE: the previous "fallback to Cora search term when keywords is
        # empty" branch was dead code — the insert above already covers it.

        # Word-count target from the Cora basic-tunings "Word Count" factor.
        word_count_target = None
        if cora:
            for tuning in cora.get_basic_tunings():
                if tuning["factor"] == "Word Count":
                    try:
                        word_count_target = int(float(tuning["goal"]))
                    except (ValueError, TypeError):
                        pass  # non-numeric goal: leave the target unset
                    break

        # Per-variation page-one stats, keyed by lowercased variation,
        # used as density targets in check_keyword_density().
        cora_keyword_targets: dict = {}
        if cora:
            for var in cora.get_keyword_variations():
                cora_keyword_targets[var["variation"].lower()] = {
                    "page1_avg": var.get("page1_avg", 0),
                    "page1_max": var.get("page1_max", 0),
                }

        # Build a fresh dict each call so repeated analyze() invocations
        # never leak state from a previous draft.
        results = {
            "content_length": self.check_content_length(text, target=word_count_target),
            "structure": self.check_structure(text),
            "keyword_density": self.check_keyword_density(
                text, keywords=keywords or None, cora_targets=cora_keyword_targets,
            ),
        }
        self._results = results
        return results

    # -- individual checks --------------------------------------------------
    def check_keyword_density(
        self,
        text: str,
        keywords: list[str] | None = None,
        cora_targets: dict | None = None,
    ) -> dict:
        """Return per-keyword density information.

        Only reports variations with ``page1_avg > 0`` (competitors actually
        use them) when Cora targets are available. Without *keywords*, falls
        back to the 10 most frequent words of >= 4 characters.
        """
        clean_text = _strip_markdown_headings(text).lower()
        words = _split_words(clean_text)
        total_words = len(words)
        if total_words == 0:
            return {"total_words": 0, "keywords": []}

        results: list[dict] = []
        if keywords:
            for kw in keywords:
                kw_lower = kw.lower()
                target = (cora_targets or {}).get(kw_lower)
                # Skip zero-avg variations — competitors don't use them.
                if target is not None and target.get("page1_avg", 0) == 0:
                    continue
                kw_words = kw_lower.split()
                if len(kw_words) > 1:
                    # Allow any whitespace (including newlines) between the
                    # phrase's words: a single-literal-space pattern missed
                    # occurrences wrapped across line breaks.
                    pattern = re.compile(
                        r"\b" + r"\s+".join(re.escape(w) for w in kw_words) + r"\b"
                    )
                    count = len(pattern.findall(clean_text))
                else:
                    count = sum(1 for w in words if w == kw_lower)
                entry = {
                    "keyword": kw,
                    "count": count,
                    "density_pct": round(count / total_words * 100, 2),
                }
                # Attach Cora page-one targets when available.
                if target is not None:
                    entry["target_avg"] = target["page1_avg"]
                    entry["target_max"] = target["page1_max"]
                results.append(entry)
        else:
            # Fallback: report the most frequent words (>= 4 chars).
            freq: dict[str, int] = {}
            for w in words:
                if len(w) >= 4:
                    freq[w] = freq.get(w, 0) + 1
            top = sorted(freq.items(), key=lambda item: item[1], reverse=True)[:10]
            for w, count in top:
                results.append({
                    "keyword": w,
                    "count": count,
                    "density_pct": round(count / total_words * 100, 2),
                })
        return {"total_words": total_words, "keywords": results}

    def check_structure(self, text: str) -> dict:
        """Analyze heading hierarchy, paragraph count, and list usage."""
        headings = _extract_headings(text)

        # Count headings per level (h1..h6).
        heading_counts = {f"h{i}": 0 for i in range(1, 7)}
        for h in headings:
            heading_counts[f"h{h['level']}"] += 1

        # Detect hierarchy problems: multiple H1s, skipped levels.
        nesting_issues: list[str] = []
        if heading_counts["h1"] > 1:
            nesting_issues.append(
                f"Multiple H1 tags found ({heading_counts['h1']}); use exactly one."
            )
        prev_level = 0
        for h in headings:
            if prev_level > 0 and h["level"] > prev_level + 1:
                # Truncate long heading text in the diagnostic message.
                shown = h["text"][:40] + "..." if len(h["text"]) > 40 else h["text"]
                nesting_issues.append(
                    f"Heading skip: H{prev_level} -> H{h['level']} (at \"{shown}\")"
                )
            prev_level = h["level"]

        # Paragraphs: blank-line-separated blocks that are neither a lone
        # heading line nor made up entirely of list items.
        paragraphs = []
        for block in re.split(r"\n\s*\n", text):
            block = block.strip()
            if not block:
                continue
            if re.match(r"^#{1,6}\s+", block) and "\n" not in block:
                continue
            if all(
                re.match(r"^\s*[-*+]\s|^\s*\d+\.\s", line)
                for line in block.splitlines()
                if line.strip()
            ):
                continue
            paragraphs.append(block)

        # List usage across the whole text.
        unordered_items = len(re.findall(r"^\s*[-*+]\s", text, re.MULTILINE))
        ordered_items = len(re.findall(r"^\s*\d+\.\s", text, re.MULTILINE))

        return {
            "heading_counts": heading_counts,
            "headings": [{"level": h["level"], "text": h["text"]} for h in headings],
            "nesting_issues": nesting_issues,
            "paragraph_count": len(paragraphs),
            "unordered_list_items": unordered_items,
            "ordered_list_items": ordered_items,
        }

    def check_content_length(self, text: str, target: int | None = None) -> dict:
        """Compare word count against an optional target.

        Status is ``meets_target`` at/above target, ``close`` at >= 80% of
        target, otherwise ``below_target``.
        """
        clean = _strip_markdown_headings(text)
        word_count = len(_split_words(clean))
        result: dict = {"word_count": word_count}
        if target is not None:
            result["target"] = target
            result["difference"] = word_count - target
            if word_count >= target:
                result["status"] = "meets_target"
            elif word_count >= target * 0.8:
                result["status"] = "close"
            else:
                result["status"] = "below_target"
        return result
# ---------------------------------------------------------------------------
# Text-mode formatting
# ---------------------------------------------------------------------------
def _format_text_report(results: dict) -> str:
"""Format analysis results as a human-readable text report."""
lines: list[str] = []
sep = "-" * 60
# 1. Content Stats
cl = results.get("content_length", {})
lines.append(sep)
lines.append(" CONTENT STATS")
lines.append(sep)
lines.append(f" Word count: {cl.get('word_count', 0)}")
if cl.get("target"):
lines.append(f" Target: {cl['target']} ({cl.get('status', '')})")
diff = cl.get("difference", 0)
sign = "+" if diff >= 0 else ""
lines.append(f" Difference: {sign}{diff}")
lines.append("")
# 2. Structure
st = results.get("structure", {})
lines.append(sep)
lines.append(" STRUCTURE")
lines.append(sep)
hc = st.get("heading_counts", {})
for lvl in range(1, 7):
count = hc.get(f"h{lvl}", 0)
if count > 0:
lines.append(f" H{lvl}: {count}")
issues = st.get("nesting_issues", [])
if issues:
lines.append(" Nesting issues:")
for issue in issues:
lines.append(f" - {issue}")
else:
lines.append(" Nesting: OK")
lines.append("")
# 3. Keyword Density (only variations with targets)
kd = results.get("keyword_density", {})
kw_list = kd.get("keywords", [])
lines.append(sep)
lines.append(" KEYWORD DENSITY")
lines.append(sep)
if kw_list:
lines.append(f" {'Variation':<30s} {'Count':>5s} {'Density':>7s} {'Avg':>5s} {'Max':>5s}")
lines.append(f" {'-'*30} {'-'*5} {'-'*7} {'-'*5} {'-'*5}")
for kw in kw_list:
avg_str = str(kw.get("target_avg", "")) if "target_avg" in kw else ""
max_str = str(kw.get("target_max", "")) if "target_max" in kw else ""
lines.append(
f" {kw['keyword']:<30s} "
f"{kw['count']:>5d} "
f"{kw['density_pct']:>6.2f}% "
f"{avg_str:>5s} "
f"{max_str:>5s}"
)
else:
lines.append(" No keywords specified.")
lines.append("")
lines.append(sep)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: parse arguments, run the analysis, print the report."""
    cli = argparse.ArgumentParser(
        description="Check keyword density and structure of a content draft.",
        epilog="Example: uv run --with openpyxl python seo_optimizer.py draft.md --cora-xlsx report.xlsx",
    )
    cli.add_argument(
        "draft_path",
        help="Path to the content draft (plain text or markdown)",
    )
    cli.add_argument(
        "--keyword",
        dest="keyword",
        default=None,
        help="Primary keyword to evaluate",
    )
    cli.add_argument(
        "--cora-xlsx",
        dest="cora_xlsx",
        default=None,
        help="Path to a Cora XLSX report for keyword-specific targets",
    )
    cli.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format (default: text)",
    )
    opts = cli.parse_args()

    try:
        analysis = SEOOptimizer().analyze(
            draft_path=opts.draft_path,
            primary_keyword=opts.keyword,
            cora_xlsx_path=opts.cora_xlsx,
        )
    except FileNotFoundError as exc:
        # Missing draft is a user error; exit with a short message.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Anything else (bad XLSX, parse errors) is reported the same way.
        print(f"Error during analysis: {exc}", file=sys.stderr)
        sys.exit(1)

    if opts.format == "json":
        print(json.dumps(analysis, indent=2, default=str))
    else:
        print(_format_text_report(analysis))


if __name__ == "__main__":
    main()