# competitor_scraper.py — CheddahBot/.claude/skills/content-researcher/scripts/

"""
Competitor Content Scraper
Fetches web pages and extracts clean text content for analysis.
Used as a utility when the user provides a list of URLs to examine.
Usage:
uv run --with requests,beautifulsoup4 python competitor_scraper.py URL1 URL2 ...
[--output-dir ./working/competitor_content/]
[--format json|text]
"""
import argparse
import json
import re
import sys
import time
from pathlib import Path
from urllib.parse import urlparse

# Third-party dependencies are checked up front so a missing install fails
# with an actionable hint instead of a bare traceback.
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print(
        "Error: requests and beautifulsoup4 are required.\n"
        "Install with: uv add requests beautifulsoup4",
        file=sys.stderr,
    )
    sys.exit(1)
# Tags that never carry article prose; removed wholesale before text extraction.
UNWANTED_TAGS = [
    "nav", "footer", "header", "aside", "script", "style", "noscript",
    "iframe", "form", "button", "svg", "img", "video", "audio",
]
# Substrings matched (lowercased) against element class names and ids to drop
# navigation / boilerplate containers. NOTE(review): substring matching can
# have false positives (e.g. "ad-" matches inside "thread-list") — accepted
# trade-off for simple boilerplate stripping.
UNWANTED_CLASSES = [
    "nav", "navbar", "navigation", "menu", "sidebar", "footer", "header",
    "breadcrumb", "cookie", "popup", "modal", "advertisement", "ad-",
    "social", "share", "comment", "related-posts",
]
# Browser-like request headers so basic bot filtering does not block fetches.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
class CompetitorScraper:
"""Fetches and cleans web page content for competitor analysis."""
def __init__(self, timeout: int = 15, delay: float = 1.0):
"""
Args:
timeout: Request timeout in seconds.
delay: Delay between requests in seconds (rate limiting).
"""
self.timeout = timeout
self.delay = delay
self.session = requests.Session()
self.session.headers.update(DEFAULT_HEADERS)
def scrape_url(self, url: str) -> dict:
"""Scrape a single URL and extract clean content.
Returns:
Dict with: url, host, title, meta_description, headings, text, word_count, error
"""
result = {
"url": url,
"host": urlparse(url).netloc,
"title": "",
"meta_description": "",
"headings": [],
"text": "",
"word_count": 0,
"error": None,
}
try:
response = self.session.get(url, timeout=self.timeout)
response.raise_for_status()
response.encoding = response.apparent_encoding or "utf-8"
html = response.text
except requests.RequestException as e:
result["error"] = str(e)
return result
soup = BeautifulSoup(html, "html.parser")
# Extract title
title_tag = soup.find("title")
if title_tag:
result["title"] = title_tag.get_text(strip=True)
# Extract meta description
meta_desc = soup.find("meta", attrs={"name": "description"})
if meta_desc and meta_desc.get("content"):
result["meta_description"] = meta_desc["content"].strip()
# Extract headings before cleaning
result["headings"] = self._extract_headings(soup)
# Clean the HTML and extract main text
result["text"] = self._extract_text(soup)
result["word_count"] = len(result["text"].split())
return result
def scrape_urls(self, urls: list[str]) -> list[dict]:
"""Scrape multiple URLs with rate limiting.
Args:
urls: List of URLs to scrape.
Returns:
List of result dicts from scrape_url.
"""
results = []
for i, url in enumerate(urls):
if i > 0:
time.sleep(self.delay)
print(f" Scraping [{i + 1}/{len(urls)}]: {url}", file=sys.stderr)
result = self.scrape_url(url)
if result["error"]:
print(f" Error: {result['error']}", file=sys.stderr)
else:
print(f" OK: {result['word_count']} words", file=sys.stderr)
results.append(result)
return results
def save_results(self, results: list[dict], output_dir: str) -> list[str]:
"""Save scraped results as individual text files.
Args:
results: List of result dicts from scrape_urls.
output_dir: Directory to write files to.
Returns:
List of file paths written.
"""
out_path = Path(output_dir)
out_path.mkdir(parents=True, exist_ok=True)
saved = []
for result in results:
if result["error"] or not result["text"]:
continue
# Create filename from host
host = result["host"].replace("www.", "")
safe_name = re.sub(r'[^\w\-.]', '_', host)
filepath = out_path / f"{safe_name}.txt"
content = self._format_output(result)
filepath.write_text(content, encoding="utf-8")
saved.append(str(filepath))
return saved
def _extract_headings(self, soup: BeautifulSoup) -> list[dict]:
"""Extract all headings (h1-h6) with their level and text."""
headings = []
for tag in soup.find_all(re.compile(r'^h[1-6]$')):
level = int(tag.name[1])
text = tag.get_text(strip=True)
if text:
headings.append({"level": level, "text": text})
return headings
def _extract_text(self, soup: BeautifulSoup) -> str:
"""Extract clean body text from HTML, stripping navigation and boilerplate."""
# Remove unwanted tags
for tag_name in UNWANTED_TAGS:
for tag in soup.find_all(tag_name):
tag.decompose()
# Remove elements with unwanted class names
for element in list(soup.find_all(True)):
if element.attrs is None:
continue
classes = element.get("class", [])
if isinstance(classes, list):
class_str = " ".join(classes).lower()
else:
class_str = str(classes).lower()
el_id = str(element.get("id", "")).lower()
for pattern in UNWANTED_CLASSES:
if pattern in class_str or pattern in el_id:
element.decompose()
break
# Try to find main content area
main_content = (
soup.find("main")
or soup.find("article")
or soup.find("div", {"role": "main"})
or soup.find("div", class_=re.compile(r'content|article|post|entry', re.I))
or soup.body
or soup
)
# Extract text with some structure preserved
text = main_content.get_text(separator="\n", strip=True)
# Clean up excessive whitespace
lines = []
for line in text.splitlines():
line = line.strip()
if line:
lines.append(line)
return "\n".join(lines)
def _format_output(self, result: dict) -> str:
"""Format a single result as a readable text file."""
lines = [
f"URL: {result['url']}",
f"Title: {result['title']}",
f"Meta Description: {result['meta_description']}",
f"Word Count: {result['word_count']}",
"",
"--- HEADINGS ---",
]
for h in result["headings"]:
indent = " " * (h["level"] - 1)
lines.append(f"{indent}H{h['level']}: {h['text']}")
lines.extend(["", "--- CONTENT ---", "", result["text"]])
return "\n".join(lines)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line interface for the scraper script."""
    parser = argparse.ArgumentParser(description="Scrape competitor web pages for content analysis")
    parser.add_argument("urls", nargs="+", help="URLs to scrape")
    parser.add_argument(
        "--output-dir",
        default="./working/competitor_content",
        help="Directory to save scraped content (default: ./working/competitor_content/)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format for stdout (default: text)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=15,
        help="Request timeout in seconds (default: 15)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (default: 1.0)",
    )
    return parser


def main():
    """Entry point: scrape each URL, save per-site files, print results to stdout."""
    opts = _build_arg_parser().parse_args()

    scraper = CompetitorScraper(timeout=opts.timeout, delay=opts.delay)
    results = scraper.scrape_urls(opts.urls)

    # Persist one text file per successfully scraped page.
    saved = scraper.save_results(results, opts.output_dir)
    print(f"\nSaved {len(saved)} files to {opts.output_dir}", file=sys.stderr)

    # Emit successful results on stdout in the requested format.
    successful = [r for r in results if not r["error"]]
    if opts.format == "json":
        print(json.dumps(successful, indent=2))
        return
    for r in successful:
        print(scraper._format_output(r))
        print("\n" + "=" * 80 + "\n")


if __name__ == "__main__":
    main()