"""Two-brain LLM adapter.

Chat Brain:
  - OpenRouter / Ollama / LM Studio (OpenAI-compatible APIs)
  - Full control over system prompt — Cheddah personality works here
  - Claude models available via OpenRouter mapping

Execution Brain:
  - Claude Code CLI (subprocess)
  - Used for heartbeat, scheduled tasks, delegated system-level work
  - Claude's built-in tools (Bash, Read, Edit, etc.) are a feature here
"""

from __future__ import annotations

import json
import logging
import os
import shutil
import subprocess
import sys
import time
from collections.abc import Generator
from dataclasses import dataclass

import httpx

log = logging.getLogger(__name__)

# Seconds to wait for a killed CLI process to exit and release its pipes.
_KILL_GRACE_SECONDS = 5

# Pricing per million tokens: (input_cost, output_cost) in USD
MODEL_PRICING: dict[str, tuple[float, float]] = {
    "anthropic/claude-sonnet": (3.00, 15.00),
    "anthropic/claude-opus": (5.00, 25.00),
    "anthropic/claude-haiku": (0.80, 4.00),
    "x-ai/grok-4.1-fast": (0.20, 0.50),
    "google/gemini-3-flash": (0.50, 3.00),
    "google/gemini-2.5-flash": (0.15, 0.60),
    "openai/gpt-4o-mini": (0.15, 0.60),
    "openai/gpt-5-nano": (0.10, 0.40),
    "deepseek/deepseek-v3": (0.24, 0.38),
    "minimax/minimax-m2.5": (0.30, 1.20),
    "moonshotai/kimi-k2.5": (0.45, 2.20),
}


def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Estimate cost in USD using prefix matching against MODEL_PRICING.

    The first MODEL_PRICING key that ``model`` starts with wins. Models with
    no matching prefix (e.g. local models) cost 0.0.
    """
    for prefix, (input_rate, output_rate) in MODEL_PRICING.items():
        if model.startswith(prefix):
            return (prompt_tokens * input_rate + completion_tokens * output_rate) / 1_000_000
    return 0.0


@dataclass
class ModelInfo:
    """A selectable chat model as returned by the model-listing methods."""

    id: str
    name: str
    provider: str  # "openrouter" | "ollama" | "lmstudio"
    context_length: int | None = None


# Claude model IDs → OpenRouter equivalents (for chat dropdown)
CLAUDE_OPENROUTER_MAP = {
    "claude-sonnet-4.5": "anthropic/claude-sonnet-4.5",
    "claude-opus-4.6": "anthropic/claude-opus-4.6",
    "claude-haiku-4.5": "anthropic/claude-haiku-4.5",
}


def _provider_for(model_id: str, openrouter_key: str) -> str:
    """Determine which OpenAI-compatible provider to route a chat model to.

    ``openrouter_key`` is accepted for interface compatibility; routing is
    decided by the ``model_id`` prefix alone.
    """
    if model_id.startswith("local/ollama/"):
        return "ollama"
    if model_id.startswith("local/lmstudio/"):
        return "lmstudio"
    # Everything else goes through OpenRouter (including mapped Claude models)
    return "openrouter"


class LLMAdapter:
    """Routes chat to OpenAI-compatible APIs and execution to the Claude Code CLI."""

    def __init__(
        self,
        default_model: str = "claude-sonnet-4.5",
        openrouter_key: str = "",
        ollama_url: str = "http://localhost:11434",
        lmstudio_url: str = "http://localhost:1234",
    ):
        """Store the selected model and provider endpoints.

        Trailing slashes are stripped from the local URLs so paths can be
        appended with a single ``/``.
        """
        self.current_model = default_model
        self.openrouter_key = openrouter_key
        self.ollama_url = ollama_url.rstrip("/")
        self.lmstudio_url = lmstudio_url.rstrip("/")
        self._openai_mod = None  # lazy import

    @property
    def provider(self) -> str:
        """Provider name for the currently selected chat model."""
        return _provider_for(self.current_model, self.openrouter_key)

    def switch_model(self, model_id: str) -> None:
        """Select ``model_id`` as the chat model for subsequent ``chat`` calls."""
        self.current_model = model_id
        log.info("Switched chat model to: %s (provider: %s)", model_id, self.provider)

    # ── Chat Brain (OpenAI-compatible only) ──

    def chat(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
        stream: bool = True,
    ) -> Generator[dict, None, None]:
        """Chat brain: routes through OpenAI-compatible APIs only.

        Yields chunks: {"type": "text", "content": "..."},
        {"type": "tool_use", ...} or {"type": "usage", ...}. Configuration
        problems (missing API key) are reported as a single text chunk
        rather than raised.
        """
        provider = self.provider
        model_id = self._resolve_model_id(provider)

        # If a Claude model ID was selected, map it to OpenRouter equivalent
        if model_id in CLAUDE_OPENROUTER_MAP:
            if not self.openrouter_key:
                yield {
                    "type": "text",
                    "content": (
                        "To chat with Claude models, you need an OpenRouter API key "
                        "(set OPENROUTER_API_KEY in .env). Alternatively, select a local "
                        "model from Ollama or LM Studio."
                    ),
                }
                return
            model_id = CLAUDE_OPENROUTER_MAP[model_id]
            provider = "openrouter"

        # Check if provider is available
        if provider == "openrouter" and not self.openrouter_key:
            yield {
                "type": "text",
                "content": (
                    "No API key configured. To use cloud models:\n"
                    "1. Get an OpenRouter API key at https://openrouter.ai/keys\n"
                    "2. Set OPENROUTER_API_KEY in your .env file\n\n"
                    "Or install Ollama (free, local) and pull a model:\n"
                    " ollama pull llama3.2"
                ),
            }
            return

        base_url, api_key = self._resolve_endpoint(provider)
        yield from self._chat_openai_sdk(messages, tools, stream, base_url, api_key, model_id)

    # ── Execution Brain (Claude Code CLI) ──

    def execute(
        self,
        prompt: str,
        system_prompt: str = "",
        working_dir: str | None = None,
        tools: str = "Bash,Read,Edit,Write,Glob,Grep",
        model: str | None = None,
        skip_permissions: bool = False,
        timeout: int = 2700,
    ) -> str:
        """Execution brain: calls Claude Code CLI with full tool access.

        Used for heartbeat checks, scheduled tasks, and delegated complex tasks.
        Returns the full result string (non-streaming). Failures are reported
        in the returned string (prefixed with "Error:" or "Execution error:")
        rather than raised.

        Args:
            prompt: Prompt text, piped to the CLI on stdin.
            system_prompt: If non-empty, passed via --system-prompt.
            working_dir: Working directory for the CLI process (None inherits).
            tools: Comma-separated Claude Code tool names (default: standard set).
            model: Override the CLI model (e.g. "claude-sonnet-4.5").
            skip_permissions: If True, append --dangerously-skip-permissions to
                the CLI invocation (used for automated pipelines).
            timeout: Max seconds to wait for CLI completion (default: 2700 / 45 min).
        """
        claude_bin = shutil.which("claude")
        if not claude_bin:
            return (
                "Error: `claude` CLI not found in PATH. "
                "Install Claude Code: npm install -g @anthropic-ai/claude-code"
            )

        # Pipe prompt through stdin to avoid Windows 8191-char command-line limit.
        cmd = [
            claude_bin,
            "-p",
            "--output-format",
            "json",
            "--tools",
            tools,
            "--allowedTools",
            tools,
        ]
        if model:
            cmd.extend(["--model", model])
        if system_prompt:
            cmd.extend(["--system-prompt", system_prompt])
        if skip_permissions:
            cmd.append("--dangerously-skip-permissions")

        log.debug("Execution brain cmd: %s", " ".join(cmd[:6]) + "...")

        # Strip CLAUDECODE env var so the subprocess doesn't think it's nested
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

        try:
            proc = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding="utf-8",
                # NOTE(review): presumably needed because the CLI is installed as
                # a .cmd shim on Windows — confirm before changing.
                shell=(sys.platform == "win32"),
                cwd=working_dir,
                env=env,
            )
        except FileNotFoundError:
            return (
                "Error: `claude` CLI not found. "
                "Install Claude Code: npm install -g @anthropic-ai/claude-code"
            )

        try:
            stdout, stderr = proc.communicate(input=prompt, timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed process and drain its pipes so no zombie or open
            # file descriptors are left behind. Bounded, because a surviving
            # grandchild (shell=True on Windows) may keep the pipes open.
            try:
                proc.communicate(timeout=_KILL_GRACE_SECONDS)
            except (subprocess.TimeoutExpired, OSError, ValueError):
                log.warning("Claude Code process did not exit cleanly after kill")
            minutes = timeout // 60
            return f"Error: Claude Code execution timed out after {minutes} minutes."

        if proc.returncode != 0:
            return f"Execution error: {stderr or 'unknown error'}"

        no_output = "(No output from execution brain)"
        try:
            result = json.loads(stdout)
        except json.JSONDecodeError:
            return stdout.strip() or no_output

        # Only a JSON object carries the result/is_error fields; anything else
        # is passed through as raw output.
        if not isinstance(result, dict):
            return stdout.strip() or no_output

        text = result.get("result", "")
        # Check the error flag first so a CLI-reported failure is never
        # returned as if it were a successful result.
        if result.get("is_error"):
            return f"Execution error: {text or 'unknown'}"
        return text or no_output

    def is_execution_brain_available(self) -> bool:
        """Check if the Claude Code CLI is available."""
        return shutil.which("claude") is not None

    # ── OpenAI-compatible SDK (OpenRouter / Ollama / LM Studio) ──

    def _chat_openai_sdk(
        self,
        messages: list[dict],
        tools: list[dict] | None,
        stream: bool,
        base_url: str,
        api_key: str,
        model_id: str,
    ) -> Generator[dict, None, None]:
        """Run one chat completion and yield text / tool_use / usage chunks.

        Retryable errors are retried up to two times with exponential backoff
        (1s, 2s), but only while nothing has been yielded yet, so the consumer
        never sees duplicated output. Any other failure yields one friendly
        error text chunk and ends the generator.
        """
        openai = self._get_openai()
        client = openai.OpenAI(base_url=base_url, api_key=api_key)

        kwargs: dict = {
            "model": model_id,
            "messages": messages,
            "stream": stream,
        }
        if stream:
            kwargs["stream_options"] = {"include_usage": True}
        if tools:
            kwargs["tools"] = tools

        max_retries = 2
        has_yielded = False

        for attempt in range(max_retries + 1):
            try:
                response = client.chat.completions.create(**kwargs)

                if stream:
                    tool_calls_accum: dict[int, dict] = {}
                    stream_usage = None

                    for chunk in response:
                        # Capture usage from the final stream chunk
                        if getattr(chunk, "usage", None):
                            stream_usage = chunk.usage

                        delta = chunk.choices[0].delta if chunk.choices else None
                        if not delta:
                            continue

                        if delta.content:
                            has_yielded = True
                            yield {"type": "text", "content": delta.content}

                        # Tool calls arrive as fragments keyed by index;
                        # stitch id, name and argument text back together.
                        for tc in delta.tool_calls or ():
                            entry = tool_calls_accum.setdefault(
                                tc.index, {"id": "", "name": "", "arguments": ""}
                            )
                            if tc.id:
                                entry["id"] = tc.id
                            if tc.function:
                                # Accept the name from whichever fragment carries it.
                                if tc.function.name and not entry["name"]:
                                    entry["name"] = tc.function.name
                                if tc.function.arguments:
                                    entry["arguments"] += tc.function.arguments

                    for _, call in sorted(tool_calls_accum.items()):
                        has_yielded = True
                        yield {
                            "type": "tool_use",
                            "id": call["id"],
                            "name": call["name"],
                            "input": _parse_tool_arguments(call["arguments"]),
                        }

                    # Yield usage chunk if available
                    if stream_usage:
                        yield _usage_chunk(model_id, stream_usage)
                else:
                    msg = response.choices[0].message
                    if msg.content:
                        has_yielded = True
                        yield {"type": "text", "content": msg.content}

                    for tc in msg.tool_calls or ():
                        has_yielded = True
                        yield {
                            "type": "tool_use",
                            "id": tc.id,
                            "name": tc.function.name,
                            "input": _parse_tool_arguments(tc.function.arguments),
                        }

                    # Yield usage chunk for non-streaming
                    usage = getattr(response, "usage", None)
                    if usage:
                        yield _usage_chunk(model_id, usage)

                # Success — break out of retry loop
                return

            except Exception as e:
                if not has_yielded and attempt < max_retries and _is_retryable_error(e):
                    wait = 2**attempt
                    log.warning(
                        "Retryable LLM error (attempt %d/%d), retrying in %ds: %s",
                        attempt + 1,
                        max_retries + 1,
                        wait,
                        e,
                    )
                    time.sleep(wait)
                    continue
                yield {"type": "text", "content": _friendly_error(e, self.provider)}
                # Stop here: without this the loop would re-issue the request
                # and emit the error (or replay streamed text) again.
                return

    # ── Helpers ──

    def _resolve_endpoint(self, provider: str) -> tuple[str, str]:
        """Return ``(base_url, api_key)`` for ``provider``.

        Unknown providers fall back to OpenRouter. Local providers get a
        fixed placeholder key string.
        """
        if provider == "ollama":
            return f"{self.ollama_url}/v1", "ollama"
        if provider == "lmstudio":
            return f"{self.lmstudio_url}/v1", "lm-studio"
        return "https://openrouter.ai/api/v1", self.openrouter_key or "sk-placeholder"

    def _resolve_model_id(self, provider: str) -> str:
        """Strip the ``local/<provider>/`` routing prefix from the current model."""
        model = self.current_model
        if provider == "ollama" and model.startswith("local/ollama/"):
            return model.removeprefix("local/ollama/")
        if provider == "lmstudio" and model.startswith("local/lmstudio/"):
            return model.removeprefix("local/lmstudio/")
        return model

    def _get_openai(self):
        """Import the ``openai`` package on first use and cache the module."""
        if self._openai_mod is None:
            import openai

            self._openai_mod = openai
        return self._openai_mod

    # ── Model Discovery ──

    def discover_local_models(self) -> list[ModelInfo]:
        """Probe Ollama and LM Studio for installed models (best effort).

        Each server is queried with a 3-second timeout. A server that is
        unreachable or returns unexpected data is skipped; models collected
        before the failure are kept.
        """
        models: list[ModelInfo] = []

        # Ollama
        try:
            r = httpx.get(f"{self.ollama_url}/api/tags", timeout=3)
            if r.status_code == 200:
                for m in r.json().get("models", []):
                    models.append(
                        ModelInfo(
                            id=f"local/ollama/{m['name']}",
                            name=f"[Ollama] {m['name']}",
                            provider="ollama",
                        )
                    )
        except Exception as e:
            # Deliberately best-effort: a missing local server is normal.
            log.debug("Ollama model discovery failed: %s", e)

        # LM Studio
        try:
            r = httpx.get(f"{self.lmstudio_url}/v1/models", timeout=3)
            if r.status_code == 200:
                for m in r.json().get("data", []):
                    models.append(
                        ModelInfo(
                            id=f"local/lmstudio/{m['id']}",
                            name=f"[LM Studio] {m['id']}",
                            provider="lmstudio",
                        )
                    )
        except Exception as e:
            # Deliberately best-effort: a missing local server is normal.
            log.debug("LM Studio model discovery failed: %s", e)

        return models

    def list_chat_models(self) -> list[ModelInfo]:
        """Return models available for the chat brain (no direct Claude SDK entries).

        OpenRouter models are listed only when an OpenRouter key is set;
        discovered local models are always appended.
        """
        models: list[ModelInfo] = []

        if self.openrouter_key:
            models.extend(
                [
                    # Anthropic (via OpenRouter — system prompts work correctly)
                    ModelInfo("anthropic/claude-sonnet-4.5", "Claude Sonnet 4.5", "openrouter"),
                    ModelInfo("anthropic/claude-opus-4.6", "Claude Opus 4.6", "openrouter"),
                    # Google
                    ModelInfo(
                        "google/gemini-3-flash-preview", "Gemini 3 Flash Preview", "openrouter"
                    ),
                    ModelInfo("google/gemini-2.5-flash", "Gemini 2.5 Flash", "openrouter"),
                    ModelInfo(
                        "google/gemini-2.5-flash-lite", "Gemini 2.5 Flash Lite", "openrouter"
                    ),
                    # OpenAI
                    ModelInfo("openai/gpt-5-nano", "GPT-5 Nano", "openrouter"),
                    ModelInfo("openai/gpt-4o-mini", "GPT-4o Mini", "openrouter"),
                    # DeepSeek / xAI / Others
                    ModelInfo("deepseek/deepseek-v3.2", "DeepSeek V3.2", "openrouter"),
                    ModelInfo("x-ai/grok-4.1-fast", "Grok 4.1 Fast", "openrouter"),
                    ModelInfo("moonshotai/kimi-k2.5", "Kimi K2.5", "openrouter"),
                    ModelInfo("minimax/minimax-m2.5", "MiniMax M2.5", "openrouter"),
                ]
            )

        models.extend(self.discover_local_models())
        return models

    def list_available_models(self) -> list[ModelInfo]:
        """Backwards-compatible alias for list_chat_models()."""
        return self.list_chat_models()


def _parse_tool_arguments(raw):
    """Decode a tool call's JSON argument string, falling back to ``{}``.

    Empty, missing (``None``) or malformed argument text yields ``{}`` so a
    tool call is still surfaced to the caller.
    """
    try:
        return json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return {}


def _usage_chunk(model_id: str, usage: object) -> dict:
    """Build a ``{"type": "usage", ...}`` chunk from an SDK usage object.

    Missing or ``None`` token counts are treated as 0.
    """
    pt = getattr(usage, "prompt_tokens", 0) or 0
    ct = getattr(usage, "completion_tokens", 0) or 0
    return {
        "type": "usage",
        "model": model_id,
        "prompt_tokens": pt,
        "completion_tokens": ct,
        "total_tokens": pt + ct,
        "estimated_cost": _estimate_cost(model_id, pt, ct),
    }


def _is_retryable_error(e: Exception) -> bool:
    """Return True for transient errors worth retrying (5xx, timeout, rate limit)."""
    # openai library exceptions, matched by class name so the package does not
    # have to be imported here
    if type(e).__name__ in (
        "APITimeoutError",
        "InternalServerError",
        "RateLimitError",
        "APIConnectionError",
    ):
        return True
    # Status-code based (works with openai.APIStatusError subclasses)
    status = getattr(e, "status_code", None)
    if not isinstance(status, int):
        return False
    return status >= 500 or status == 429


def _friendly_error(e: Exception, provider: str) -> str:
    """Map common LLM exceptions to plain-English messages.

    Matching uses the exception class name, its ``status_code`` attribute and
    substrings of its message. Unrecognized errors are logged with traceback
    and reported generically.
    """
    name = type(e).__name__
    text = str(e)
    if name == "AuthenticationError" or "401" in text:
        return f"Authentication failed for {provider}. Please check your API key."
    if name == "RateLimitError" or "429" in text:
        return f"Rate limited by {provider}. Please wait a moment and try again."
    if name in ("APITimeoutError", "APIConnectionError") or "timeout" in text.lower():
        return (
            f"Could not reach {provider} — the service may be down "
            "or your connection is interrupted."
        )
    if name == "InternalServerError" or (getattr(e, "status_code", None) or 0) >= 500:
        return f"{provider} returned a server error. Please try again shortly."
    # Generic fallback — still friendlier than a raw traceback
    log.error("LLM error (%s): %s", provider, e, exc_info=True)
    return f"Something went wrong talking to {provider}. Check the logs for details."