"""
SEO Content Optimizer

Checks keyword density and content structure of a draft against Cora targets.

Usage:
    uv run --with openpyxl python seo_optimizer.py <draft_path>
        [--keyword <kw>] [--cora-xlsx <path>] [--format json|text]

Works standalone for basic checks, or with a Cora XLSX report for
keyword-specific targets via cora_parser.CoraReport.
"""

import argparse
import json
import re
import sys
from pathlib import Path

# Optional Cora integration — script works without it
try:
    from cora_parser import CoraReport
except ImportError:
    CoraReport = None


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _split_words(text: str) -> list[str]:
|
|
"""Extract words from text (alphabetic sequences)."""
|
|
return re.findall(r"[a-zA-Z']+", text)
def _strip_markdown_headings(text: str) -> str:
|
|
"""Remove markdown heading markers from text for word counting."""
|
|
return re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
def _extract_headings(text: str) -> list[dict]:
|
|
"""Extract markdown-style headings with their levels."""
|
|
headings = []
|
|
for match in re.finditer(r"^(#{1,6})\s+(.+)$", text, re.MULTILINE):
|
|
level = len(match.group(1))
|
|
headings.append({"level": level, "text": match.group(2).strip()})
|
|
return headings

# ---------------------------------------------------------------------------
# SEOOptimizer
# ---------------------------------------------------------------------------

class SEOOptimizer:
|
|
"""Analyze a content draft for keyword density and structure."""
|
|
|
|
def __init__(self):
|
|
self._results = {}
|
|
|
|
# -- public entry point -------------------------------------------------
|
|
|
|
def analyze(
|
|
self,
|
|
draft_path: str,
|
|
primary_keyword: str | None = None,
|
|
cora_xlsx_path: str | None = None,
|
|
) -> dict:
|
|
"""Run checks on *draft_path* and return an analysis dict."""
|
|
path = Path(draft_path)
|
|
if not path.exists():
|
|
raise FileNotFoundError(f"Draft not found: {draft_path}")
|
|
|
|
text = path.read_text(encoding="utf-8")
|
|
|
|
# Optionally load Cora data
|
|
cora = None
|
|
if cora_xlsx_path:
|
|
if CoraReport is None:
|
|
print(
|
|
"Warning: cora_parser not available. "
|
|
"Install openpyxl and ensure cora_parser.py is importable.",
|
|
file=sys.stderr,
|
|
)
|
|
else:
|
|
cora = CoraReport(cora_xlsx_path)
|
|
|
|
# Determine keyword list
|
|
keywords = []
|
|
if primary_keyword:
|
|
keywords.append(primary_keyword)
|
|
if cora:
|
|
search_term = cora.get_search_term()
|
|
if search_term and search_term.lower() not in [k.lower() for k in keywords]:
|
|
keywords.insert(0, search_term)
|
|
for var in cora.get_keyword_variations():
|
|
v = var["variation"]
|
|
if v.lower() not in [k.lower() for k in keywords]:
|
|
keywords.append(v)
|
|
|
|
# If still no keywords but Cora gave a search term, use it
|
|
if not keywords and cora:
|
|
st = cora.get_search_term()
|
|
if st:
|
|
keywords.append(st)
|
|
|
|
# Word-count target from Cora
|
|
word_count_target = None
|
|
if cora:
|
|
for t in cora.get_basic_tunings():
|
|
if t["factor"] == "Word Count":
|
|
try:
|
|
word_count_target = int(float(t["goal"]))
|
|
except (ValueError, TypeError):
|
|
pass
|
|
break
|
|
|
|
# Build Cora keyword targets (page1_avg) for comparison
|
|
cora_keyword_targets = {}
|
|
if cora:
|
|
for var in cora.get_keyword_variations():
|
|
cora_keyword_targets[var["variation"].lower()] = {
|
|
"page1_avg": var.get("page1_avg", 0),
|
|
"page1_max": var.get("page1_max", 0),
|
|
}
|
|
|
|
# Run checks
|
|
self._results["content_length"] = self.check_content_length(text, target=word_count_target)
|
|
self._results["structure"] = self.check_structure(text)
|
|
self._results["keyword_density"] = self.check_keyword_density(
|
|
text, keywords=keywords or None, cora_targets=cora_keyword_targets,
|
|
)
|
|
|
|
return self._results
|
|
|
|
# -- individual checks --------------------------------------------------
|
|
|
|
def check_keyword_density(
|
|
self,
|
|
text: str,
|
|
keywords: list[str] | None = None,
|
|
cora_targets: dict | None = None,
|
|
) -> dict:
|
|
"""Return per-keyword density information.
|
|
|
|
Only reports variations that have page1_avg > 0 (competitors actually
|
|
use them) when Cora targets are available.
|
|
"""
|
|
clean_text = _strip_markdown_headings(text).lower()
|
|
words = _split_words(clean_text)
|
|
total_words = len(words)
|
|
|
|
if total_words == 0:
|
|
return {"total_words": 0, "keywords": []}
|
|
|
|
results: list[dict] = []
|
|
|
|
if keywords:
|
|
for kw in keywords:
|
|
kw_lower = kw.lower()
|
|
|
|
# Skip zero-avg variations — competitors don't use them
|
|
if cora_targets and kw_lower in cora_targets:
|
|
if cora_targets[kw_lower].get("page1_avg", 0) == 0:
|
|
continue
|
|
|
|
kw_words = kw_lower.split()
|
|
if len(kw_words) > 1:
|
|
pattern = re.compile(r"\b" + re.escape(kw_lower) + r"\b")
|
|
count = len(pattern.findall(clean_text))
|
|
else:
|
|
count = sum(1 for w in words if w == kw_lower)
|
|
|
|
density = (count / total_words) * 100 if total_words else 0
|
|
|
|
entry = {
|
|
"keyword": kw,
|
|
"count": count,
|
|
"density_pct": round(density, 2),
|
|
}
|
|
|
|
# Add Cora target if available
|
|
if cora_targets and kw_lower in cora_targets:
|
|
entry["target_avg"] = cora_targets[kw_lower]["page1_avg"]
|
|
entry["target_max"] = cora_targets[kw_lower]["page1_max"]
|
|
|
|
results.append(entry)
|
|
else:
|
|
# Fallback: top frequent words (>= 4 chars)
|
|
freq: dict[str, int] = {}
|
|
for w in words:
|
|
if len(w) >= 4:
|
|
freq[w] = freq.get(w, 0) + 1
|
|
top = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:10]
|
|
for w, count in top:
|
|
density = (count / total_words) * 100
|
|
results.append({
|
|
"keyword": w,
|
|
"count": count,
|
|
"density_pct": round(density, 2),
|
|
})
|
|
|
|
return {"total_words": total_words, "keywords": results}
|
|
|
|
def check_structure(self, text: str) -> dict:
|
|
"""Analyze heading hierarchy, paragraph count, and list usage."""
|
|
headings = _extract_headings(text)
|
|
|
|
# Count headings per level
|
|
heading_counts = {f"h{i}": 0 for i in range(1, 7)}
|
|
for h in headings:
|
|
heading_counts[f"h{h['level']}"] += 1
|
|
|
|
# Check nesting issues
|
|
nesting_issues: list[str] = []
|
|
if heading_counts["h1"] > 1:
|
|
nesting_issues.append(f"Multiple H1 tags found ({heading_counts['h1']}); use exactly one.")
|
|
|
|
prev_level = 0
|
|
for h in headings:
|
|
if prev_level > 0 and h["level"] > prev_level + 1:
|
|
nesting_issues.append(
|
|
f"Heading skip: H{prev_level} -> H{h['level']} "
|
|
f"(at \"{h['text'][:40]}...\")"
|
|
if len(h["text"]) > 40 else
|
|
f"Heading skip: H{prev_level} -> H{h['level']} "
|
|
f"(at \"{h['text']}\")"
|
|
)
|
|
prev_level = h["level"]
|
|
|
|
# Paragraphs
|
|
paragraphs = []
|
|
for block in re.split(r"\n\s*\n", text):
|
|
block = block.strip()
|
|
if not block:
|
|
continue
|
|
if re.match(r"^#{1,6}\s+", block) and "\n" not in block:
|
|
continue
|
|
if all(re.match(r"^\s*[-*+]\s|^\s*\d+\.\s", line) for line in block.splitlines() if line.strip()):
|
|
continue
|
|
paragraphs.append(block)
|
|
|
|
paragraph_count = len(paragraphs)
|
|
|
|
# List usage
|
|
unordered_items = len(re.findall(r"^\s*[-*+]\s", text, re.MULTILINE))
|
|
ordered_items = len(re.findall(r"^\s*\d+\.\s", text, re.MULTILINE))
|
|
|
|
return {
|
|
"heading_counts": heading_counts,
|
|
"headings": [{"level": h["level"], "text": h["text"]} for h in headings],
|
|
"nesting_issues": nesting_issues,
|
|
"paragraph_count": paragraph_count,
|
|
"unordered_list_items": unordered_items,
|
|
"ordered_list_items": ordered_items,
|
|
}
|
|
|
|
def check_content_length(self, text: str, target: int | None = None) -> dict:
|
|
"""Compare word count against an optional target."""
|
|
clean = _strip_markdown_headings(text)
|
|
words = _split_words(clean)
|
|
word_count = len(words)
|
|
|
|
result: dict = {"word_count": word_count}
|
|
|
|
if target is not None:
|
|
result["target"] = target
|
|
result["difference"] = word_count - target
|
|
if word_count >= target:
|
|
result["status"] = "meets_target"
|
|
elif word_count >= target * 0.8:
|
|
result["status"] = "close"
|
|
else:
|
|
result["status"] = "below_target"
|
|
|
|
return result

# ---------------------------------------------------------------------------
# Text-mode formatting
# ---------------------------------------------------------------------------

def _format_text_report(results: dict) -> str:
|
|
"""Format analysis results as a human-readable text report."""
|
|
lines: list[str] = []
|
|
sep = "-" * 60
|
|
|
|
# 1. Content Stats
|
|
cl = results.get("content_length", {})
|
|
|
|
lines.append(sep)
|
|
lines.append(" CONTENT STATS")
|
|
lines.append(sep)
|
|
lines.append(f" Word count: {cl.get('word_count', 0)}")
|
|
if cl.get("target"):
|
|
lines.append(f" Target: {cl['target']} ({cl.get('status', '')})")
|
|
diff = cl.get("difference", 0)
|
|
sign = "+" if diff >= 0 else ""
|
|
lines.append(f" Difference: {sign}{diff}")
|
|
lines.append("")
|
|
|
|
# 2. Structure
|
|
st = results.get("structure", {})
|
|
lines.append(sep)
|
|
lines.append(" STRUCTURE")
|
|
lines.append(sep)
|
|
hc = st.get("heading_counts", {})
|
|
for lvl in range(1, 7):
|
|
count = hc.get(f"h{lvl}", 0)
|
|
if count > 0:
|
|
lines.append(f" H{lvl}: {count}")
|
|
issues = st.get("nesting_issues", [])
|
|
if issues:
|
|
lines.append(" Nesting issues:")
|
|
for issue in issues:
|
|
lines.append(f" - {issue}")
|
|
else:
|
|
lines.append(" Nesting: OK")
|
|
lines.append("")
|
|
|
|
# 3. Keyword Density (only variations with targets)
|
|
kd = results.get("keyword_density", {})
|
|
kw_list = kd.get("keywords", [])
|
|
lines.append(sep)
|
|
lines.append(" KEYWORD DENSITY")
|
|
lines.append(sep)
|
|
if kw_list:
|
|
lines.append(f" {'Variation':<30s} {'Count':>5s} {'Density':>7s} {'Avg':>5s} {'Max':>5s}")
|
|
lines.append(f" {'-'*30} {'-'*5} {'-'*7} {'-'*5} {'-'*5}")
|
|
for kw in kw_list:
|
|
avg_str = str(kw.get("target_avg", "")) if "target_avg" in kw else ""
|
|
max_str = str(kw.get("target_max", "")) if "target_max" in kw else ""
|
|
lines.append(
|
|
f" {kw['keyword']:<30s} "
|
|
f"{kw['count']:>5d} "
|
|
f"{kw['density_pct']:>6.2f}% "
|
|
f"{avg_str:>5s} "
|
|
f"{max_str:>5s}"
|
|
)
|
|
else:
|
|
lines.append(" No keywords specified.")
|
|
lines.append("")
|
|
lines.append(sep)
|
|
|
|
return "\n".join(lines)

# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Check keyword density and structure of a content draft.",
        epilog="Example: uv run --with openpyxl python seo_optimizer.py draft.md --cora-xlsx report.xlsx",
    )
    parser.add_argument(
        "draft_path",
        help="Path to the content draft (plain text or markdown)",
    )
    parser.add_argument(
        "--keyword",
        dest="keyword",
        default=None,
        help="Primary keyword to evaluate",
    )
    parser.add_argument(
        "--cora-xlsx",
        dest="cora_xlsx",
        default=None,
        help="Path to a Cora XLSX report for keyword-specific targets",
    )
    parser.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format (default: text)",
    )
    return parser


def main():
    """CLI entry point: parse arguments, run the analysis, print the report."""
    args = _build_arg_parser().parse_args()

    try:
        results = SEOOptimizer().analyze(
            draft_path=args.draft_path,
            primary_keyword=args.keyword,
            cora_xlsx_path=args.cora_xlsx,
        )
    except FileNotFoundError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Top-level CLI boundary: report any analysis failure and exit non-zero.
        print(f"Error during analysis: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.format == "json":
        print(json.dumps(results, indent=2, default=str))
    else:
        print(_format_text_report(results))


if __name__ == "__main__":
    main()