CheddahBot/cheddahbot/tools/web.py

"""Web tools: search, fetch URL, scrape."""

from __future__ import annotations

from urllib.parse import parse_qs, urlparse

import httpx
from bs4 import BeautifulSoup

from . import tool


def _resolve_result_url(href: str) -> str:
    """Return the destination URL for a search-result link.

    Redirect links of the form ``//duckduckgo.com/l/?uddg=<encoded-url>`` are
    unwrapped to the decoded ``uddg`` target, scheme-relative links get an
    ``https:`` prefix, and anything else is returned unchanged.
    """
    if not href:
        return ""
    if href.startswith("//"):
        href = "https:" + href
    parsed = urlparse(href)
    host = parsed.hostname or ""
    # An empty host means a page-relative link, i.e. still on DuckDuckGo.
    on_ddg = not host or host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if on_ddg and parsed.path.rstrip("/") == "/l":
        # parse_qs percent-decodes the value for us.
        targets = parse_qs(parsed.query).get("uddg")
        if targets:
            return targets[0]
    return href


@tool("web_search", "Search the web using DuckDuckGo", category="web")
def web_search(query: str, max_results: int = 5) -> str:
    """Search DuckDuckGo and return the top results as Markdown-ish text.

    Args:
        query: The search terms.
        max_results: Maximum number of results to return. Values of zero or
            less yield no results.

    Returns:
        Result entries separated by blank lines, each formatted as
        ``**title**``, URL and snippet on consecutive lines;
        ``"No results found."`` when nothing matched; or a string starting
        with ``"Search error:"`` if the request or parsing failed. This
        function never raises.
    """
    # A negative limit would otherwise slice from the end of the result list.
    if max_results <= 0:
        return "No results found."
    # Broad catch is deliberate: this is a tool boundary that reports
    # failures to the caller as text instead of raising.
    try:
        # Use DuckDuckGo HTML search (no API key needed)
        r = httpx.get(
            "https://html.duckduckgo.com/html/",
            params={"q": query},
            headers={"User-Agent": "Mozilla/5.0 (compatible; CheddahBot/1.0)"},
            timeout=15,
            follow_redirects=True,
        )
        # Surface HTTP failures (rate limiting, server errors) as an error
        # instead of parsing the error page and reporting "No results found."
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        results: list[str] = []
        for item in soup.select(".result"):
            # Apply the limit to usable entries, so entries without a title
            # link do not eat into the requested count.
            if len(results) >= max_results:
                break
            title_el = item.select_one(".result__title a")
            if title_el is None:
                continue
            snippet_el = item.select_one(".result__snippet")
            title = title_el.get_text(strip=True)
            url = _resolve_result_url(title_el.get("href", ""))
            snippet = snippet_el.get_text(strip=True) if snippet_el else ""
            results.append(f"**{title}**\n{url}\n{snippet}")
        return "\n\n".join(results) if results else "No results found."
    except Exception as e:
        return f"Search error: {e}"


def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, with a marker if it was cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "\n... (truncated)"


@tool("fetch_url", "Fetch and extract text content from a URL", category="web")
def fetch_url(url: str) -> str:
    """Fetch *url* and return its content as plain text.

    HTML responses are reduced to their visible text (script, style, nav,
    footer and header elements are dropped) and capped at 15000 characters.
    JSON responses are returned verbatim, capped at 15000 characters. Any
    other content type is returned verbatim, capped at 5000 characters.
    Truncated output ends with a ``... (truncated)`` marker.

    Args:
        url: The address to fetch; redirects are followed.

    Returns:
        The extracted text, or a string starting with ``"Fetch error:"`` if
        the request failed or the server answered with an error status. This
        function never raises.
    """
    # Broad catch is deliberate: this is a tool boundary that reports
    # failures to the caller as text instead of raising.
    try:
        r = httpx.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; CheddahBot/1.0)"},
            timeout=20,
            follow_redirects=True,
        )
        # Without this, the body of a 4xx/5xx error page would be returned
        # as if it were the requested content.
        r.raise_for_status()
        # Media types are case-insensitive, so normalise before matching.
        content_type = r.headers.get("content-type", "").lower()
        if "html" in content_type:
            soup = BeautifulSoup(r.text, "html.parser")
            # Remove non-content elements before extracting text
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text = soup.get_text(separator="\n", strip=True)
            # Collapse whitespace: strip each line and drop empty ones
            lines = [line.strip() for line in text.split("\n") if line.strip()]
            return _truncate("\n".join(lines), 15000)
        if "json" in content_type:
            return _truncate(r.text, 15000)
        return _truncate(r.text, 5000)
    except Exception as e:
        return f"Fetch error: {e}"