"""
Competitor Content Scraper

Fetches web pages and extracts clean text content for analysis.
Used as a utility when the user provides a list of URLs to examine.

Usage:
    uv run --with requests,beautifulsoup4 python competitor_scraper.py URL1 URL2 ...
        [--output-dir ./working/competitor_content/]
        [--format json|text]
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
# Third-party dependencies are imported under a guard so a missing install
# produces an actionable hint on stderr instead of a bare traceback.
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print(
        "Error: requests and beautifulsoup4 are required.\n"
        "Install with: uv add requests beautifulsoup4",
        file=sys.stderr,
    )
    sys.exit(1)
|
|
|
|
|
|
# Tags that never carry main article content; removed wholesale before
# text extraction.
UNWANTED_TAGS = [
    "nav", "footer", "header", "aside", "script", "style", "noscript",
    "iframe", "form", "button", "svg", "img", "video", "audio",
]

# Substrings matched (lowercased) against element class names and ids;
# a hit causes the whole element to be removed as boilerplate.
UNWANTED_CLASSES = [
    "nav", "navbar", "navigation", "menu", "sidebar", "footer", "header",
    "breadcrumb", "cookie", "popup", "modal", "advertisement", "ad-",
    "social", "share", "comment", "related-posts",
]

# Browser-like request headers; some sites block or degrade responses for
# the default python-requests User-Agent.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
|
|
|
|
|
|
class CompetitorScraper:
    """Fetches and cleans web page content for competitor analysis.

    Wraps a persistent ``requests.Session`` carrying browser-like headers,
    and provides helpers to scrape one or many URLs, strip boilerplate
    markup, and persist the cleaned text to disk.
    """

    def __init__(self, timeout: int = 15, delay: float = 1.0):
        """
        Args:
            timeout: Request timeout in seconds.
            delay: Delay between requests in seconds (rate limiting).
        """
        self.timeout = timeout
        self.delay = delay
        # A session reuses connections and applies the headers to every request.
        self.session = requests.Session()
        self.session.headers.update(DEFAULT_HEADERS)

    def scrape_url(self, url: str) -> dict:
        """Scrape a single URL and extract clean content.

        Returns:
            Dict with: url, host, title, meta_description, headings, text,
            word_count, error. On a failed request, ``error`` holds the
            exception text and the content fields stay empty.
        """
        result = {
            "url": url,
            "host": urlparse(url).netloc,
            "title": "",
            "meta_description": "",
            "headings": [],
            "text": "",
            "word_count": 0,
            "error": None,
        }

        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            # Prefer the detected encoding over the (often absent) header.
            response.encoding = response.apparent_encoding or "utf-8"
            html = response.text
        except requests.RequestException as e:
            result["error"] = str(e)
            return result

        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title_tag = soup.find("title")
        if title_tag:
            result["title"] = title_tag.get_text(strip=True)

        # Extract meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            result["meta_description"] = meta_desc["content"].strip()

        # Extract headings before cleaning, which may remove <header> regions.
        result["headings"] = self._extract_headings(soup)

        # Clean the HTML and extract main text
        result["text"] = self._extract_text(soup)
        result["word_count"] = len(result["text"].split())

        return result

    def scrape_urls(self, urls: list[str]) -> list[dict]:
        """Scrape multiple URLs with rate limiting.

        Args:
            urls: List of URLs to scrape.

        Returns:
            List of result dicts from scrape_url, in input order.
        """
        results = []
        for i, url in enumerate(urls):
            # Sleep between requests (not before the first) for politeness.
            if i > 0:
                time.sleep(self.delay)

            print(f"  Scraping [{i + 1}/{len(urls)}]: {url}", file=sys.stderr)
            result = self.scrape_url(url)

            if result["error"]:
                print(f"    Error: {result['error']}", file=sys.stderr)
            else:
                print(f"    OK: {result['word_count']} words", file=sys.stderr)

            results.append(result)

        return results

    def save_results(self, results: list[dict], output_dir: str) -> list[str]:
        """Save scraped results as individual text files.

        Filenames are derived from each result's host; when several results
        share a host, a numeric suffix is appended so earlier files are not
        silently overwritten.

        Args:
            results: List of result dicts from scrape_urls.
            output_dir: Directory to write files to.

        Returns:
            List of file paths written.
        """
        out_path = Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        saved = []
        used_names: set[str] = set()
        for result in results:
            if result["error"] or not result["text"]:
                continue

            # Create filename from host. removeprefix strips only a leading
            # "www.", unlike str.replace which would also mangle hosts that
            # contain "www." elsewhere. Fall back to "page" when the host is
            # empty (e.g. a malformed URL).
            host = result["host"].removeprefix("www.")
            safe_name = re.sub(r'[^\w\-.]', '_', host) or "page"

            # Deduplicate: multiple pages from the same host must not clobber
            # each other on disk.
            candidate = safe_name
            counter = 2
            while candidate in used_names:
                candidate = f"{safe_name}_{counter}"
                counter += 1
            used_names.add(candidate)

            filepath = out_path / f"{candidate}.txt"
            content = self._format_output(result)
            filepath.write_text(content, encoding="utf-8")
            saved.append(str(filepath))

        return saved

    def _extract_headings(self, soup: BeautifulSoup) -> list[dict]:
        """Extract all headings (h1-h6) with their level and text."""
        headings = []
        for tag in soup.find_all(re.compile(r'^h[1-6]$')):
            level = int(tag.name[1])
            text = tag.get_text(strip=True)
            if text:
                headings.append({"level": level, "text": text})
        return headings

    def _extract_text(self, soup: BeautifulSoup) -> str:
        """Extract clean body text from HTML, stripping navigation and boilerplate."""
        # Remove unwanted tags
        for tag_name in UNWANTED_TAGS:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove elements with unwanted class names. Iterate a snapshot since
        # decompose() mutates the tree we are walking.
        for element in list(soup.find_all(True)):
            # Already-decomposed descendants have no attrs; skip them.
            if element.attrs is None:
                continue
            classes = element.get("class", [])
            if isinstance(classes, list):
                class_str = " ".join(classes).lower()
            else:
                class_str = str(classes).lower()

            el_id = str(element.get("id", "")).lower()

            for pattern in UNWANTED_CLASSES:
                if pattern in class_str or pattern in el_id:
                    element.decompose()
                    break

        # Try to find main content area, falling back to the whole document.
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", {"role": "main"})
            or soup.find("div", class_=re.compile(r'content|article|post|entry', re.I))
            or soup.body
            or soup
        )

        # Extract text with some structure preserved
        text = main_content.get_text(separator="\n", strip=True)

        # Clean up excessive whitespace: drop blank lines, strip each line.
        lines = []
        for line in text.splitlines():
            line = line.strip()
            if line:
                lines.append(line)

        return "\n".join(lines)

    def _format_output(self, result: dict) -> str:
        """Format a single result as a readable text file."""
        lines = [
            f"URL: {result['url']}",
            f"Title: {result['title']}",
            f"Meta Description: {result['meta_description']}",
            f"Word Count: {result['word_count']}",
            "",
            "--- HEADINGS ---",
        ]

        # Indent each heading proportionally to its level for a quick outline.
        for h in result["headings"]:
            indent = "  " * (h["level"] - 1)
            lines.append(f"{indent}H{h['level']}: {h['text']}")

        lines.extend(["", "--- CONTENT ---", "", result["text"]])

        return "\n".join(lines)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser for the scraper CLI."""
    parser = argparse.ArgumentParser(description="Scrape competitor web pages for content analysis")
    parser.add_argument("urls", nargs="+", help="URLs to scrape")
    parser.add_argument(
        "--output-dir",
        default="./working/competitor_content",
        help="Directory to save scraped content (default: ./working/competitor_content/)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "text"],
        default="text",
        help="Output format for stdout (default: text)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=15,
        help="Request timeout in seconds (default: 15)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (default: 1.0)",
    )
    return parser


def main():
    """CLI entry point: scrape the given URLs, save files, and print results."""
    args = _build_parser().parse_args()

    scraper = CompetitorScraper(timeout=args.timeout, delay=args.delay)
    results = scraper.scrape_urls(args.urls)

    # Persist each successful result to its own file.
    written = scraper.save_results(results, args.output_dir)
    print(f"\nSaved {len(written)} files to {args.output_dir}", file=sys.stderr)

    # Emit successful results on stdout in the requested format.
    ok = [r for r in results if not r["error"]]
    if args.format == "json":
        print(json.dumps(ok, indent=2))
    else:
        for r in ok:
            print(scraper._format_output(r))
            print("\n" + "=" * 80 + "\n")


if __name__ == "__main__":
    main()
|