""" Competitor Content Scraper Fetches web pages and extracts clean text content for analysis. Used as a utility when the user provides a list of URLs to examine. Usage: uv run --with requests,beautifulsoup4 python competitor_scraper.py URL1 URL2 ... [--output-dir ./working/competitor_content/] [--format json|text] """ import argparse import json import re import sys import time from pathlib import Path from urllib.parse import urlparse try: import requests from bs4 import BeautifulSoup except ImportError: print( "Error: requests and beautifulsoup4 are required.\n" "Install with: uv add requests beautifulsoup4", file=sys.stderr, ) sys.exit(1) UNWANTED_TAGS = [ "nav", "footer", "header", "aside", "script", "style", "noscript", "iframe", "form", "button", "svg", "img", "video", "audio", ] UNWANTED_CLASSES = [ "nav", "navbar", "navigation", "menu", "sidebar", "footer", "header", "breadcrumb", "cookie", "popup", "modal", "advertisement", "ad-", "social", "share", "comment", "related-posts", ] DEFAULT_HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", } class CompetitorScraper: """Fetches and cleans web page content for competitor analysis.""" def __init__(self, timeout: int = 15, delay: float = 1.0): """ Args: timeout: Request timeout in seconds. delay: Delay between requests in seconds (rate limiting). """ self.timeout = timeout self.delay = delay self.session = requests.Session() self.session.headers.update(DEFAULT_HEADERS) def scrape_url(self, url: str) -> dict: """Scrape a single URL and extract clean content. Returns: Dict with: url, host, title, meta_description, headings, text, word_count, error """ result = { "url": url, "host": urlparse(url).netloc, "title": "", "meta_description": "", "headings": [], "text": "", "word_count": 0, "error": None, } try: response = self.session.get(url, timeout=self.timeout) response.raise_for_status() response.encoding = response.apparent_encoding or "utf-8" html = response.text except requests.RequestException as e: result["error"] = str(e) return result soup = BeautifulSoup(html, "html.parser") # Extract title title_tag = soup.find("title") if title_tag: result["title"] = title_tag.get_text(strip=True) # Extract meta description meta_desc = soup.find("meta", attrs={"name": "description"}) if meta_desc and meta_desc.get("content"): result["meta_description"] = meta_desc["content"].strip() # Extract headings before cleaning result["headings"] = self._extract_headings(soup) # Clean the HTML and extract main text result["text"] = self._extract_text(soup) result["word_count"] = len(result["text"].split()) return result def scrape_urls(self, urls: list[str]) -> list[dict]: """Scrape multiple URLs with rate limiting. Args: urls: List of URLs to scrape. Returns: List of result dicts from scrape_url. """ results = [] for i, url in enumerate(urls): if i > 0: time.sleep(self.delay) print(f" Scraping [{i + 1}/{len(urls)}]: {url}", file=sys.stderr) result = self.scrape_url(url) if result["error"]: print(f" Error: {result['error']}", file=sys.stderr) else: print(f" OK: {result['word_count']} words", file=sys.stderr) results.append(result) return results def save_results(self, results: list[dict], output_dir: str) -> list[str]: """Save scraped results as individual text files. Args: results: List of result dicts from scrape_urls. output_dir: Directory to write files to. Returns: List of file paths written. """ out_path = Path(output_dir) out_path.mkdir(parents=True, exist_ok=True) saved = [] for result in results: if result["error"] or not result["text"]: continue # Create filename from host host = result["host"].replace("www.", "") safe_name = re.sub(r'[^\w\-.]', '_', host) filepath = out_path / f"{safe_name}.txt" content = self._format_output(result) filepath.write_text(content, encoding="utf-8") saved.append(str(filepath)) return saved def _extract_headings(self, soup: BeautifulSoup) -> list[dict]: """Extract all headings (h1-h6) with their level and text.""" headings = [] for tag in soup.find_all(re.compile(r'^h[1-6]$')): level = int(tag.name[1]) text = tag.get_text(strip=True) if text: headings.append({"level": level, "text": text}) return headings def _extract_text(self, soup: BeautifulSoup) -> str: """Extract clean body text from HTML, stripping navigation and boilerplate.""" # Remove unwanted tags for tag_name in UNWANTED_TAGS: for tag in soup.find_all(tag_name): tag.decompose() # Remove elements with unwanted class names for element in list(soup.find_all(True)): if element.attrs is None: continue classes = element.get("class", []) if isinstance(classes, list): class_str = " ".join(classes).lower() else: class_str = str(classes).lower() el_id = str(element.get("id", "")).lower() for pattern in UNWANTED_CLASSES: if pattern in class_str or pattern in el_id: element.decompose() break # Try to find main content area main_content = ( soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"}) or soup.find("div", class_=re.compile(r'content|article|post|entry', re.I)) or soup.body or soup ) # Extract text with some structure preserved text = main_content.get_text(separator="\n", strip=True) # Clean up excessive whitespace lines = [] for line in text.splitlines(): line = line.strip() if line: lines.append(line) return "\n".join(lines) def _format_output(self, result: dict) -> str: """Format a single result as a readable text file.""" lines = [ f"URL: {result['url']}", f"Title: {result['title']}", f"Meta Description: {result['meta_description']}", f"Word Count: {result['word_count']}", "", "--- HEADINGS ---", ] for h in result["headings"]: indent = " " * (h["level"] - 1) lines.append(f"{indent}H{h['level']}: {h['text']}") lines.extend(["", "--- CONTENT ---", "", result["text"]]) return "\n".join(lines) def main(): parser = argparse.ArgumentParser(description="Scrape competitor web pages for content analysis") parser.add_argument("urls", nargs="+", help="URLs to scrape") parser.add_argument( "--output-dir", default="./working/competitor_content", help="Directory to save scraped content (default: ./working/competitor_content/)", ) parser.add_argument( "--format", choices=["json", "text"], default="text", help="Output format for stdout (default: text)", ) parser.add_argument( "--timeout", type=int, default=15, help="Request timeout in seconds (default: 15)", ) parser.add_argument( "--delay", type=float, default=1.0, help="Delay between requests in seconds (default: 1.0)", ) args = parser.parse_args() scraper = CompetitorScraper(timeout=args.timeout, delay=args.delay) results = scraper.scrape_urls(args.urls) # Save files saved = scraper.save_results(results, args.output_dir) print(f"\nSaved {len(saved)} files to {args.output_dir}", file=sys.stderr) # Output to stdout successful = [r for r in results if not r["error"]] if args.format == "json": print(json.dumps(successful, indent=2)) else: for r in successful: print(scraper._format_output(r)) print("\n" + "=" * 80 + "\n") if __name__ == "__main__": main()