# CheddahBot/.claude/skills/content-researcher/scripts/seo_optimizer.py
"""
SEO Content Optimizer
Checks keyword density and content structure of a draft against Cora targets.
Usage:
uv run --with openpyxl python seo_optimizer.py <draft_path>
[--keyword <kw>] [--cora-xlsx <path>] [--format json|text]
Works standalone for basic checks, or with a Cora XLSX report for
keyword-specific targets via cora_parser.CoraReport.
"""
import argparse
import json
import re
import sys
from pathlib import Path
# Optional Cora integration — script works without it
try:
from cora_parser import CoraReport
except ImportError:
CoraReport = None
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _split_words(text: str) -> list[str]:
"""Extract words from text (alphabetic sequences)."""
return re.findall(r"[a-zA-Z']+", text)
def _strip_markdown_headings(text: str) -> str:
"""Remove markdown heading markers from text for word counting."""
return re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
def _extract_headings(text: str) -> list[dict]:
"""Extract markdown-style headings with their levels."""
headings = []
for match in re.finditer(r"^(#{1,6})\s+(.+)$", text, re.MULTILINE):
level = len(match.group(1))
headings.append({"level": level, "text": match.group(2).strip()})
return headings
# ---------------------------------------------------------------------------
# SEOOptimizer
# ---------------------------------------------------------------------------
class SEOOptimizer:
    """Analyze a content draft for keyword density and structure.

    Works standalone for basic checks; when a Cora XLSX report is supplied
    (and ``cora_parser`` is importable), keyword variations and the
    word-count goal from the report are folded into the analysis.
    """

    def __init__(self):
        # Results of the most recent analyze() call (kept for inspection).
        self._results: dict = {}

    # -- public entry point -------------------------------------------------
    def analyze(
        self,
        draft_path: str,
        primary_keyword: str | None = None,
        cora_xlsx_path: str | None = None,
    ) -> dict:
        """Run all checks on *draft_path* and return an analysis dict.

        Args:
            draft_path: Path to the draft (plain text or markdown).
            primary_keyword: Optional keyword to evaluate.
            cora_xlsx_path: Optional Cora XLSX report path for targets.

        Returns:
            Dict with keys ``content_length``, ``structure`` and
            ``keyword_density``.

        Raises:
            FileNotFoundError: If *draft_path* does not exist.
        """
        path = Path(draft_path)
        if not path.exists():
            raise FileNotFoundError(f"Draft not found: {draft_path}")
        text = path.read_text(encoding="utf-8")

        # Optionally load Cora data; degrade gracefully when the optional
        # cora_parser dependency is missing.
        cora = None
        if cora_xlsx_path:
            if CoraReport is None:
                print(
                    "Warning: cora_parser not available. "
                    "Install openpyxl and ensure cora_parser.py is importable.",
                    file=sys.stderr,
                )
            else:
                cora = CoraReport(cora_xlsx_path)

        # Build the keyword list: explicit primary keyword, then the Cora
        # search term promoted to the front, then deduplicated variations.
        # (Case-insensitive dedup throughout.)
        keywords: list[str] = []
        if primary_keyword:
            keywords.append(primary_keyword)
        if cora:
            search_term = cora.get_search_term()
            if search_term and search_term.lower() not in [k.lower() for k in keywords]:
                keywords.insert(0, search_term)
            for var in cora.get_keyword_variations():
                v = var["variation"]
                if v.lower() not in [k.lower() for k in keywords]:
                    keywords.append(v)
        # NOTE: the previous "fallback to Cora search term when keywords is
        # empty" branch was dead code — the insert above already covers it.

        # Word-count target from the Cora basic-tunings "Word Count" factor.
        word_count_target = None
        if cora:
            for tuning in cora.get_basic_tunings():
                if tuning["factor"] == "Word Count":
                    try:
                        word_count_target = int(float(tuning["goal"]))
                    except (ValueError, TypeError):
                        pass  # non-numeric goal: leave the target unset
                    break

        # Per-variation page-one stats, keyed by lowercased variation,
        # used as density targets in check_keyword_density().
        cora_keyword_targets: dict = {}
        if cora:
            for var in cora.get_keyword_variations():
                cora_keyword_targets[var["variation"].lower()] = {
                    "page1_avg": var.get("page1_avg", 0),
                    "page1_max": var.get("page1_max", 0),
                }

        # Build a fresh dict each call so repeated analyze() invocations
        # never leak state from a previous draft.
        results = {
            "content_length": self.check_content_length(text, target=word_count_target),
            "structure": self.check_structure(text),
            "keyword_density": self.check_keyword_density(
                text, keywords=keywords or None, cora_targets=cora_keyword_targets,
            ),
        }
        self._results = results
        return results

    # -- individual checks --------------------------------------------------
    def check_keyword_density(
        self,
        text: str,
        keywords: list[str] | None = None,
        cora_targets: dict | None = None,
    ) -> dict:
        """Return per-keyword density information.

        Only reports variations with ``page1_avg > 0`` (competitors actually
        use them) when Cora targets are available. Without *keywords*, falls
        back to the 10 most frequent words of >= 4 characters.
        """
        clean_text = _strip_markdown_headings(text).lower()
        words = _split_words(clean_text)
        total_words = len(words)
        if total_words == 0:
            return {"total_words": 0, "keywords": []}

        results: list[dict] = []
        if keywords:
            for kw in keywords:
                kw_lower = kw.lower()
                target = (cora_targets or {}).get(kw_lower)
                # Skip zero-avg variations — competitors don't use them.
                if target is not None and target.get("page1_avg", 0) == 0:
                    continue
                kw_words = kw_lower.split()
                if len(kw_words) > 1:
                    # Allow any whitespace (including newlines) between the
                    # phrase's words: a single-literal-space pattern missed
                    # occurrences wrapped across line breaks.
                    pattern = re.compile(
                        r"\b" + r"\s+".join(re.escape(w) for w in kw_words) + r"\b"
                    )
                    count = len(pattern.findall(clean_text))
                else:
                    count = sum(1 for w in words if w == kw_lower)
                entry = {
                    "keyword": kw,
                    "count": count,
                    "density_pct": round(count / total_words * 100, 2),
                }
                # Attach Cora page-one targets when available.
                if target is not None:
                    entry["target_avg"] = target["page1_avg"]
                    entry["target_max"] = target["page1_max"]
                results.append(entry)
        else:
            # Fallback: report the most frequent words (>= 4 chars).
            freq: dict[str, int] = {}
            for w in words:
                if len(w) >= 4:
                    freq[w] = freq.get(w, 0) + 1
            top = sorted(freq.items(), key=lambda item: item[1], reverse=True)[:10]
            for w, count in top:
                results.append({
                    "keyword": w,
                    "count": count,
                    "density_pct": round(count / total_words * 100, 2),
                })
        return {"total_words": total_words, "keywords": results}

    def check_structure(self, text: str) -> dict:
        """Analyze heading hierarchy, paragraph count, and list usage."""
        headings = _extract_headings(text)

        # Count headings per level (h1..h6).
        heading_counts = {f"h{i}": 0 for i in range(1, 7)}
        for h in headings:
            heading_counts[f"h{h['level']}"] += 1

        # Detect hierarchy problems: multiple H1s, skipped levels.
        nesting_issues: list[str] = []
        if heading_counts["h1"] > 1:
            nesting_issues.append(
                f"Multiple H1 tags found ({heading_counts['h1']}); use exactly one."
            )
        prev_level = 0
        for h in headings:
            if prev_level > 0 and h["level"] > prev_level + 1:
                # Truncate long heading text in the diagnostic message.
                shown = h["text"][:40] + "..." if len(h["text"]) > 40 else h["text"]
                nesting_issues.append(
                    f"Heading skip: H{prev_level} -> H{h['level']} (at \"{shown}\")"
                )
            prev_level = h["level"]

        # Paragraphs: blank-line-separated blocks that are neither a lone
        # heading line nor made up entirely of list items.
        paragraphs = []
        for block in re.split(r"\n\s*\n", text):
            block = block.strip()
            if not block:
                continue
            if re.match(r"^#{1,6}\s+", block) and "\n" not in block:
                continue
            if all(
                re.match(r"^\s*[-*+]\s|^\s*\d+\.\s", line)
                for line in block.splitlines()
                if line.strip()
            ):
                continue
            paragraphs.append(block)

        # List usage across the whole text.
        unordered_items = len(re.findall(r"^\s*[-*+]\s", text, re.MULTILINE))
        ordered_items = len(re.findall(r"^\s*\d+\.\s", text, re.MULTILINE))

        return {
            "heading_counts": heading_counts,
            "headings": [{"level": h["level"], "text": h["text"]} for h in headings],
            "nesting_issues": nesting_issues,
            "paragraph_count": len(paragraphs),
            "unordered_list_items": unordered_items,
            "ordered_list_items": ordered_items,
        }

    def check_content_length(self, text: str, target: int | None = None) -> dict:
        """Compare word count against an optional target.

        Status is ``meets_target`` at/above target, ``close`` at >= 80% of
        target, otherwise ``below_target``.
        """
        clean = _strip_markdown_headings(text)
        word_count = len(_split_words(clean))
        result: dict = {"word_count": word_count}
        if target is not None:
            result["target"] = target
            result["difference"] = word_count - target
            if word_count >= target:
                result["status"] = "meets_target"
            elif word_count >= target * 0.8:
                result["status"] = "close"
            else:
                result["status"] = "below_target"
        return result
# ---------------------------------------------------------------------------
# Text-mode formatting
# ---------------------------------------------------------------------------
def _format_text_report(results: dict) -> str:
"""Format analysis results as a human-readable text report."""
lines: list[str] = []
sep = "-" * 60
# 1. Content Stats
cl = results.get("content_length", {})
lines.append(sep)
lines.append(" CONTENT STATS")
lines.append(sep)
lines.append(f" Word count: {cl.get('word_count', 0)}")
if cl.get("target"):
lines.append(f" Target: {cl['target']} ({cl.get('status', '')})")
diff = cl.get("difference", 0)
sign = "+" if diff >= 0 else ""
lines.append(f" Difference: {sign}{diff}")
lines.append("")
# 2. Structure
st = results.get("structure", {})
lines.append(sep)
lines.append(" STRUCTURE")
lines.append(sep)
hc = st.get("heading_counts", {})
for lvl in range(1, 7):
count = hc.get(f"h{lvl}", 0)
if count > 0:
lines.append(f" H{lvl}: {count}")
issues = st.get("nesting_issues", [])
if issues:
lines.append(" Nesting issues:")
for issue in issues:
lines.append(f" - {issue}")
else:
lines.append(" Nesting: OK")
lines.append("")
# 3. Keyword Density (only variations with targets)
kd = results.get("keyword_density", {})
kw_list = kd.get("keywords", [])
lines.append(sep)
lines.append(" KEYWORD DENSITY")
lines.append(sep)
if kw_list:
lines.append(f" {'Variation':<30s} {'Count':>5s} {'Density':>7s} {'Avg':>5s} {'Max':>5s}")
lines.append(f" {'-'*30} {'-'*5} {'-'*7} {'-'*5} {'-'*5}")
for kw in kw_list:
avg_str = str(kw.get("target_avg", "")) if "target_avg" in kw else ""
max_str = str(kw.get("target_max", "")) if "target_max" in kw else ""
lines.append(
f" {kw['keyword']:<30s} "
f"{kw['count']:>5d} "
f"{kw['density_pct']:>6.2f}% "
f"{avg_str:>5s} "
f"{max_str:>5s}"
)
else:
lines.append(" No keywords specified.")
lines.append("")
lines.append(sep)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: parse arguments, run the analysis, print the report."""
    cli = argparse.ArgumentParser(
        description="Check keyword density and structure of a content draft.",
        epilog="Example: uv run --with openpyxl python seo_optimizer.py draft.md --cora-xlsx report.xlsx",
    )
    cli.add_argument(
        "draft_path",
        help="Path to the content draft (plain text or markdown)",
    )
    cli.add_argument(
        "--keyword",
        dest="keyword",
        default=None,
        help="Primary keyword to evaluate",
    )
    cli.add_argument(
        "--cora-xlsx",
        dest="cora_xlsx",
        default=None,
        help="Path to a Cora XLSX report for keyword-specific targets",
    )
    cli.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format (default: text)",
    )
    opts = cli.parse_args()

    try:
        analysis = SEOOptimizer().analyze(
            draft_path=opts.draft_path,
            primary_keyword=opts.keyword,
            cora_xlsx_path=opts.cora_xlsx,
        )
    except FileNotFoundError as exc:
        # Missing draft is a user error; exit with a short message.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Anything else (bad XLSX, parse errors) is reported the same way.
        print(f"Error during analysis: {exc}", file=sys.stderr)
        sys.exit(1)

    if opts.format == "json":
        print(json.dumps(analysis, indent=2, default=str))
    else:
        print(_format_text_report(analysis))


if __name__ == "__main__":
    main()