"""
Competitor Content Scraper

Fetches web pages and extracts clean text content for analysis.
Used as a utility when the user provides a list of URLs to examine.

Usage:
    uv run --with requests,beautifulsoup4 python competitor_scraper.py URL1 URL2 ...
        [--output-dir ./working/competitor_content/]
        [--format json|text]
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
# Third-party dependencies are imported under a guard so a missing install
# produces an actionable hint on stderr instead of a bare traceback.
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print(
        "Error: requests and beautifulsoup4 are required.\n"
        "Install with: uv add requests beautifulsoup4",
        file=sys.stderr,
    )
    sys.exit(1)
|
|
|
|
|
|
# Tags that never carry main article content; removed wholesale before
# text extraction.
UNWANTED_TAGS = [
    "nav", "footer", "header", "aside", "script", "style", "noscript",
    "iframe", "form", "button", "svg", "img", "video", "audio",
]

# Substrings matched (lowercased) against element class names and ids;
# a hit causes the whole element to be removed as boilerplate.
UNWANTED_CLASSES = [
    "nav", "navbar", "navigation", "menu", "sidebar", "footer", "header",
    "breadcrumb", "cookie", "popup", "modal", "advertisement", "ad-",
    "social", "share", "comment", "related-posts",
]

# Browser-like request headers; some sites block or degrade responses for
# the default python-requests User-Agent.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
|
|
|
|
|
|
class CompetitorScraper:
    """Fetches and cleans web page content for competitor analysis.

    Wraps a persistent ``requests.Session`` carrying browser-like headers,
    and provides helpers to scrape one or many URLs, strip boilerplate
    markup, and persist the cleaned text to disk.
    """

    def __init__(self, timeout: int = 15, delay: float = 1.0):
        """
        Args:
            timeout: Request timeout in seconds.
            delay: Delay between requests in seconds (rate limiting).
        """
        self.timeout = timeout
        self.delay = delay
        # A session reuses connections and applies the headers to every request.
        self.session = requests.Session()
        self.session.headers.update(DEFAULT_HEADERS)

    def scrape_url(self, url: str) -> dict:
        """Scrape a single URL and extract clean content.

        Returns:
            Dict with: url, host, title, meta_description, headings, text,
            word_count, error. On a failed request, ``error`` holds the
            exception text and the content fields stay empty.
        """
        result = {
            "url": url,
            "host": urlparse(url).netloc,
            "title": "",
            "meta_description": "",
            "headings": [],
            "text": "",
            "word_count": 0,
            "error": None,
        }

        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            # Prefer the detected encoding over the (often absent) header.
            response.encoding = response.apparent_encoding or "utf-8"
            html = response.text
        except requests.RequestException as e:
            result["error"] = str(e)
            return result

        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title_tag = soup.find("title")
        if title_tag:
            result["title"] = title_tag.get_text(strip=True)

        # Extract meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            result["meta_description"] = meta_desc["content"].strip()

        # Extract headings before cleaning, which may remove <header> regions.
        result["headings"] = self._extract_headings(soup)

        # Clean the HTML and extract main text
        result["text"] = self._extract_text(soup)
        result["word_count"] = len(result["text"].split())

        return result

    def scrape_urls(self, urls: list[str]) -> list[dict]:
        """Scrape multiple URLs with rate limiting.

        Args:
            urls: List of URLs to scrape.

        Returns:
            List of result dicts from scrape_url, in input order.
        """
        results = []
        for i, url in enumerate(urls):
            # Sleep between requests (not before the first) for politeness.
            if i > 0:
                time.sleep(self.delay)

            print(f"  Scraping [{i + 1}/{len(urls)}]: {url}", file=sys.stderr)
            result = self.scrape_url(url)

            if result["error"]:
                print(f"    Error: {result['error']}", file=sys.stderr)
            else:
                print(f"    OK: {result['word_count']} words", file=sys.stderr)

            results.append(result)

        return results

    def save_results(self, results: list[dict], output_dir: str) -> list[str]:
        """Save scraped results as individual text files.

        Filenames are derived from each result's host; when several results
        share a host, a numeric suffix is appended so earlier files are not
        silently overwritten.

        Args:
            results: List of result dicts from scrape_urls.
            output_dir: Directory to write files to.

        Returns:
            List of file paths written.
        """
        out_path = Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        saved = []
        used_names: set[str] = set()
        for result in results:
            if result["error"] or not result["text"]:
                continue

            # Create filename from host. removeprefix strips only a leading
            # "www.", unlike str.replace which would also mangle hosts that
            # contain "www." elsewhere. Fall back to "page" when the host is
            # empty (e.g. a malformed URL).
            host = result["host"].removeprefix("www.")
            safe_name = re.sub(r'[^\w\-.]', '_', host) or "page"

            # Deduplicate: multiple pages from the same host must not clobber
            # each other on disk.
            candidate = safe_name
            counter = 2
            while candidate in used_names:
                candidate = f"{safe_name}_{counter}"
                counter += 1
            used_names.add(candidate)

            filepath = out_path / f"{candidate}.txt"
            content = self._format_output(result)
            filepath.write_text(content, encoding="utf-8")
            saved.append(str(filepath))

        return saved

    def _extract_headings(self, soup: BeautifulSoup) -> list[dict]:
        """Extract all headings (h1-h6) with their level and text."""
        headings = []
        for tag in soup.find_all(re.compile(r'^h[1-6]$')):
            level = int(tag.name[1])
            text = tag.get_text(strip=True)
            if text:
                headings.append({"level": level, "text": text})
        return headings

    def _extract_text(self, soup: BeautifulSoup) -> str:
        """Extract clean body text from HTML, stripping navigation and boilerplate."""
        # Remove unwanted tags
        for tag_name in UNWANTED_TAGS:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove elements with unwanted class names. Iterate a snapshot since
        # decompose() mutates the tree we are walking.
        for element in list(soup.find_all(True)):
            # Already-decomposed descendants have no attrs; skip them.
            if element.attrs is None:
                continue
            classes = element.get("class", [])
            if isinstance(classes, list):
                class_str = " ".join(classes).lower()
            else:
                class_str = str(classes).lower()

            el_id = str(element.get("id", "")).lower()

            for pattern in UNWANTED_CLASSES:
                if pattern in class_str or pattern in el_id:
                    element.decompose()
                    break

        # Try to find main content area, falling back to the whole document.
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", {"role": "main"})
            or soup.find("div", class_=re.compile(r'content|article|post|entry', re.I))
            or soup.body
            or soup
        )

        # Extract text with some structure preserved
        text = main_content.get_text(separator="\n", strip=True)

        # Clean up excessive whitespace: drop blank lines, strip each line.
        lines = []
        for line in text.splitlines():
            line = line.strip()
            if line:
                lines.append(line)

        return "\n".join(lines)

    def _format_output(self, result: dict) -> str:
        """Format a single result as a readable text file."""
        lines = [
            f"URL: {result['url']}",
            f"Title: {result['title']}",
            f"Meta Description: {result['meta_description']}",
            f"Word Count: {result['word_count']}",
            "",
            "--- HEADINGS ---",
        ]

        # Indent each heading proportionally to its level for a quick outline.
        for h in result["headings"]:
            indent = "  " * (h["level"] - 1)
            lines.append(f"{indent}H{h['level']}: {h['text']}")

        lines.extend(["", "--- CONTENT ---", "", result["text"]])

        return "\n".join(lines)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser for the scraper CLI."""
    parser = argparse.ArgumentParser(description="Scrape competitor web pages for content analysis")
    parser.add_argument("urls", nargs="+", help="URLs to scrape")
    parser.add_argument(
        "--output-dir",
        default="./working/competitor_content",
        help="Directory to save scraped content (default: ./working/competitor_content/)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format for stdout (default: text)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=15,
        help="Request timeout in seconds (default: 15)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (default: 1.0)",
    )
    return parser


def main():
    """CLI entry point: scrape the given URLs, save files, and print results."""
    args = _build_parser().parse_args()

    scraper = CompetitorScraper(timeout=args.timeout, delay=args.delay)
    results = scraper.scrape_urls(args.urls)

    # Persist each successful result to its own file.
    written = scraper.save_results(results, args.output_dir)
    print(f"\nSaved {len(written)} files to {args.output_dir}", file=sys.stderr)

    # Emit successful results on stdout in the requested format.
    ok = [r for r in results if not r["error"]]
    if args.format == "json":
        print(json.dumps(ok, indent=2))
    else:
        for r in ok:
            print(scraper._format_output(r))
            print("\n" + "=" * 80 + "\n")


if __name__ == "__main__":
    main()
|