diff --git a/codec_compare.py b/codec_compare.py
new file mode 100644
index 0000000..3a26678
--- /dev/null
+++ b/codec_compare.py
@@ -0,0 +1,200 @@
+"""CODEC Compare — fan one prompt out across model tiers, collect, return
+labeled or blind.
+
+Sits directly on top of the rest of the stack — it reuses the canonical
+callers rather than re-implementing HTTP:
+  * OpenAI-compatible endpoints (local Qwen @ 8083, every Cookbook-served
+    model on its 811x port) → `codec_llm.call`
+  * cloud tiers (Gemini/Claude/GPT via the AVA proxy) → `codec_ava_client`
+
+Endpoint set = three canonical tiers + anything Cookbook is currently serving:
+  1. local          — the local Qwen (config llm_base_url / llm_model)
+  2. cloud-balanced — a mid cloud model via AVA (default gemini-2.5-flash)
+  3. cloud-pro      — a top cloud model via AVA (default gemini-2.5-pro)
+  + cookbook-<id>   — each healthy model from codec_cookbook.serve.list_served()
+
+The two cloud tiers + their model ids are overridable in
+~/.codec/config.json:compare.cloud_tiers (a list of {label, model}); the
+defaults above are grounded in codec_ava_client.choose_model's fast/balanced/pro
+map. The fan-out is concurrent, per-endpoint timed, and never lets one
+endpoint's failure sink the others.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import os
+import string
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional
+
+log = logging.getLogger("codec_compare")
+
+_CONFIG_PATH = os.path.expanduser("~/.codec/config.json")
+_DEFAULT_CLOUD_TIERS = [
+    {"label": "cloud-balanced", "model": "gemini-2.5-flash"},
+    {"label": "cloud-pro", "model": "gemini-2.5-pro"},
+]
+_MAX_TOKENS = 1024
+_PER_ENDPOINT_TIMEOUT_S = 60
+_MAX_WORKERS = 6
+
+
+def _load_cfg() -> dict:
+    """Read ~/.codec/config.json; return {} if it is missing or unusable.
+
+    "Unusable" covers unreadable files, invalid JSON, undecodable bytes
+    (UnicodeDecodeError is a ValueError, like json.JSONDecodeError) and a
+    top-level value that is not a JSON object — callers rely on `.get()`.
+    """
+    try:
+        with open(_CONFIG_PATH, encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        return {}
+    return data if isinstance(data, dict) else {}
+
+
+def _cloud_tiers(cfg: dict) -> list[dict]:
+    """Return the cloud tiers to compare as a list of {label, model} dicts.
+
+    Uses config `compare.cloud_tiers` when it yields at least one usable
+    entry, otherwise the built-in defaults. Entries that are not dicts or
+    have no model are dropped; a missing label falls back to the model id so
+    one sloppy entry cannot make default_endpoints() lose every cloud tier.
+    """
+    compare_cfg = cfg.get("compare")
+    tiers = compare_cfg.get("cloud_tiers") if isinstance(compare_cfg, dict) else None
+    if isinstance(tiers, list):
+        usable = [{**t, "label": t.get("label") or t["model"]}
+                  for t in tiers if isinstance(t, dict) and t.get("model")]
+        if usable:
+            return usable
+    return [dict(t) for t in _DEFAULT_CLOUD_TIERS]
+
+
+def _cookbook_endpoints() -> list[dict]:
+    """Every healthy Cookbook-served model as an OpenAI endpoint. Best-effort —
+    if Cookbook isn't installed / nothing is served, returns []."""
+    eps = []
+    try:
+        from codec_cookbook import serve
+        for r in serve.list_served():
+            port = r.get("port")
+            if not port:
+                continue
+            if r.get("pm2_status") not in (None, "online") or r.get("healthy") is False:
+                continue  # skip stopped/unhealthy
+            eps.append({
+                "label": f"cookbook-{r.get('id', port)}",
+                "kind": "openai",
+                "model": r.get("hf_repo") or str(r.get("id")),
+                "base_url": f"http://127.0.0.1:{port}/v1",
+                "tier": "cookbook",
+            })
+    except Exception as e:
+        log.debug("cookbook endpoint discovery skipped: %s", e)
+    return eps
+
+
+def default_endpoints() -> list[dict]:
+    """The canonical comparison set: local + available cloud tiers + Cookbook."""
+    cfg = _load_cfg()
+    eps: list[dict] = [{
+        "label": "local",
+        "kind": "openai",
+        "model": cfg.get("llm_model", "local-qwen"),
+        "base_url": cfg.get("llm_base_url", "http://localhost:8083/v1"),
+        "tier": "local",
+    }]
+    # cloud tiers via AVA — only when the license/proxy is actually ready
+    try:
+        import codec_ava_client
+        ava = codec_ava_client.load_config()
+        if ava and ava.is_ready():
+            for t in _cloud_tiers(cfg):
+                eps.append({"label": t["label"], "kind": "ava",
+                            "model": t["model"], "tier": "cloud"})
+    except Exception as e:
+        log.debug("AVA cloud tiers unavailable: %s", e)
+    eps.extend(_cookbook_endpoints())
+    return eps
+
+
+def _query_one(ep: dict, prompt: str, system: Optional[str], timeout: int) -> dict:
+    """Query a single endpoint. Never raises — failures are captured as
+    {ok: False, error}. Returns the endpoint dict enriched with the result."""
+    t0 = time.monotonic()
+    base = {k: ep.get(k) for k in ("label", "model", "tier")}
+    try:
+        if ep["kind"] == "ava":
+            import codec_ava_client
+            text = codec_ava_client.ava_chat_simple(
+                prompt, system=system, model=ep["model"],
+                max_tokens=_MAX_TOKENS, timeout=timeout)
+        else:  # openai-compatible (local + cookbook)
+            import codec_llm
+            messages = ([{"role": "system", "content": system}] if system else []) \
+                + [{"role": "user", "content": prompt}]
+            text = codec_llm.call(
+                messages, base_url=ep["base_url"], model=ep["model"],
+                max_tokens=_MAX_TOKENS, timeout=timeout, raise_on_error=True)
+        return {**base, "ok": True, "response": (text or "").strip(),
+                "elapsed_ms": round((time.monotonic() - t0) * 1000)}
+    except Exception as e:
+        # Some exceptions (e.g. a bare TimeoutError()) stringify to "" — fall
+        # back to the class name so the caller always gets a non-empty error.
+        return {**base, "ok": False, "error": (str(e) or type(e).__name__)[:300],
+                "elapsed_ms": round((time.monotonic() - t0) * 1000)}
+
+
+def _anon_tag(index: int) -> str:
+    """Blind display label for the index-th result: Model A … Model Z, then
+    Model AA, Model AB, … so labels stay unique past 26 endpoints."""
+    letters = ""
+    n = index + 1
+    while n > 0:
+        n, rem = divmod(n - 1, 26)
+        letters = string.ascii_uppercase[rem] + letters
+    return f"Model {letters}"
+
+
+def compare(prompt: str, *, endpoints: Optional[list[dict]] = None,
+            blind: bool = False, system: Optional[str] = None,
+            timeout: int = _PER_ENDPOINT_TIMEOUT_S,
+            max_workers: int = _MAX_WORKERS) -> dict:
+    """Fan `prompt` out across `endpoints` (default: default_endpoints())
+    concurrently and collect every reply.
+
+    Returns {prompt, blind, results:[{label|display, model, tier, ok, response|error,
+    elapsed_ms}], mapping?}. In blind mode each result's display label is
+    anonymized (Model A/B/…) and a `mapping` of anon→real is returned separately
+    so the caller decides whether/when to reveal it.
+    Blind results still carry their real label/model/tier keys next to
+    `display`, so a blind renderer must show `display` only.
+    """
+    if not prompt or not prompt.strip():
+        return {"prompt": "", "blind": blind, "results": [], "note": "empty prompt"}
+    eps = endpoints if endpoints is not None else default_endpoints()
+    if not eps:
+        return {"prompt": prompt[:200], "blind": blind, "results": [],
+                "note": "no endpoints available"}
+
+    workers = max(1, min(max_workers, len(eps)))
+    with ThreadPoolExecutor(max_workers=workers) as ex:
+        # ex.map preserves input order
+        results = list(ex.map(lambda e: _query_one(e, prompt, system, timeout), eps))
+
+    out = {"prompt": prompt[:200], "blind": blind, "results": results}
+    if blind:
+        anon = {}
+        for i, r in enumerate(results):
+            tag = _anon_tag(i)
+            anon[tag] = r["label"]
+            r["display"] = tag
+        out["mapping"] = anon
+    else:
+        for r in results:
+            r["display"] = r["label"]
+    return out
diff --git a/skills/.manifest.json b/skills/.manifest.json
index c3305b3..e65f07e 100644
--- a/skills/.manifest.json
+++ b/skills/.manifest.json
@@ -26,6 +26,7 @@
   "chrome_tabs.py": "bf971798a8455215655d37edb6bb326d908f4d33c0e3b232eb37d267fec68d7a",
   "clipboard.py": "f5ef9cc501fe38a3de95bf0b49896b928250c0e272060173668f6b195728d131",
   "clipboard_url_fetch.py": "c2733a92d6e99a0346b91c67bb70698e491be9570305377a82096a0ceb153488",
+  "compare.py": "f0ff94cfcdc3dd5a4f0dd88d641e042303982888a9838fb4b1781067a9c46839",
   "cookbook_download.py": "8fd9c8a5b82f8e910cc0721904b908f2d201908ff0ac6373994be922598c382c",
   "cookbook_list.py": "23c19b92742bfd88a801e74bd79e8bbd70f88d49ac1951f1c30e4857cc693a79",
   "cookbook_recommend.py": "66b09eac0510ca9ca1e19f5619ab10a167bda6e34d17be13bda77ba7ffe0b5e3",
diff --git a/skills/compare.py b/skills/compare.py
new file mode 100644
index 0000000..b3fd634
--- /dev/null
+++ b/skills/compare.py
@@ -0,0 +1,64 @@
+"""CODEC Skill: Compare one prompt across model tiers (labeled or blind)."""
+import re
+
+from codec_compare import compare
+
+SKILL_NAME = "compare"
+SKILL_DESCRIPTION = (
+    "Send one prompt to multiple model tiers at once — the local Qwen, the "
+    "cloud tiers (via AVA), and any model Cookbook is currently serving — and "
+    "show their answers side by side. Add 'blind' to hide which model is which."
+)
+SKILL_TAGS = ["compare", "models", "eval", "llm", "cookbook"]
+SKILL_TRIGGERS = [
+    "compare models", "blind compare", "compare across models", "ask all models",
+    "compare llms", "model compare", "compare prompt",
+]
+SKILL_MCP_EXPOSE = True  # query skill (no process/file mutation; same cost profile as chat)
+
+# Longest-first so "compare across models" wins over "compare".
+_PREFIXES = (
+    "blind compare across models", "compare across models", "blind compare",
+    "compare models", "ask all models", "compare llms", "model compare",
+    "compare prompt", "compare",
+)
+
+
+def _extract_prompt(task: str) -> str:
+    t = (task or "").strip()
+    low = t.lower()
+    for p in sorted(_PREFIXES, key=len, reverse=True):
+        if low.startswith(p):
+            t = t[len(p):].strip(" :->\n\t")
+            break
+    # drop a leading 'blind' keyword if it leaked into the prompt
+    return re.sub(r"^blind\s+", "", t, flags=re.I).strip()
+
+
+def run(task, app="", ctx=""):
+    blind = bool(re.search(r"\bblind\b", (task or "").lower()))
+    prompt = _extract_prompt(task)
+    if not prompt:
+        return ("What should I compare? e.g. "
+                "'compare models: explain quantum tunneling in one paragraph' "
+                "(prefix with 'blind' to hide the model identities).")
+
+    res = compare(prompt, blind=blind)
+    results = res.get("results", [])
+    if not results:
+        return f"No model endpoints available to compare ({res.get('note', 'none configured')})."
+
+    head = (f"Compared {len(results)} model{'s' if len(results) != 1 else ''}"
+            + (" — blind" if blind else "") + f" on: {res['prompt']}")
+    lines = [head]
+    for r in results:
+        label = r.get("display") or r.get("label")
+        meta = f"{r.get('elapsed_ms')}ms" + ("" if blind else f", {r.get('tier')}")
+        if r.get("ok"):
+            lines.append(f"\n### {label} ({meta})\n{r.get('response', '')}")
+        else:
+            lines.append(f"\n### {label} — ✗ {r.get('error', 'failed')} ({meta})")
+    if blind and res.get("mapping"):
+        key = " ".join(f"{k} = {v}" for k, v in res["mapping"].items())
+        lines.append(f"\n— Key (judge the answers first, then peek) —\n{key}")
+    return "\n".join(lines)
diff --git a/tests/test_compare.py b/tests/test_compare.py
new file mode 100644
index 0000000..1c6b686
--- /dev/null
+++ b/tests/test_compare.py
@@ -0,0 +1,217 @@
+"""CODEC Compare — model fan-out tests.
+
+All model callers (codec_llm.call, codec_ava_client.ava_chat_simple) and the
+Cookbook registry are mocked, so the suite runs offline + side-effect-free.
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from types import SimpleNamespace
+
+_REPO = Path(__file__).resolve().parents[1]
+if str(_REPO) not in sys.path:
+    sys.path.insert(0, str(_REPO))
+
+import codec_compare as cc  # noqa: E402
+
+
+def _eps():
+    return [
+        {"label": "local", "kind": "openai", "model": "qwen",
+         "base_url": "http://x/v1", "tier": "local"},
+        {"label": "cloud-pro", "kind": "ava", "model": "gemini-2.5-pro", "tier": "cloud"},
+    ]
+
+
+# ── _query_one ───────────────────────────────────────────────────────────────
+class TestQueryOne:
+    def test_openai_path(self, monkeypatch):
+        import codec_llm
+        monkeypatch.setattr(codec_llm, "call", lambda m, **k: f"hi from {k['model']}")
+        r = cc._query_one(_eps()[0], "p", None, 10)
+        assert r["ok"] and r["response"] == "hi from qwen"
+        assert r["tier"] == "local" and isinstance(r["elapsed_ms"], int)
+
+    def test_ava_path(self, monkeypatch):
+        import codec_ava_client
+        monkeypatch.setattr(codec_ava_client, "ava_chat_simple",
+                            lambda p, system=None, **k: f"cloud:{k['model']}")
+        r = cc._query_one(_eps()[1], "p", None, 10)
+        assert r["ok"] and r["response"] == "cloud:gemini-2.5-pro"
+
+    def test_error_is_captured_not_raised(self, monkeypatch):
+        import codec_llm
+        monkeypatch.setattr(codec_llm, "call",
+                            lambda m, **k: (_ for _ in ()).throw(RuntimeError("down")))
+        r = cc._query_one(_eps()[0], "p", None, 10)
+        assert r["ok"] is False and "down" in r["error"]
+
+    def test_empty_exception_message_falls_back_to_type(self, monkeypatch):
+        import codec_llm
+        monkeypatch.setattr(codec_llm, "call",
+                            lambda m, **k: (_ for _ in ()).throw(TimeoutError()))
+        r = cc._query_one(_eps()[0], "p", None, 10)
+        assert r["ok"] is False and r["error"] == "TimeoutError"
+
+    def test_system_prompt_threaded(self, monkeypatch):
+        import codec_llm
+        seen = {}
+        monkeypatch.setattr(codec_llm, "call",
+                            lambda m, **k: seen.update(msgs=m) or "ok")
+        cc._query_one(_eps()[0], "p", "be terse", 10)
+        assert seen["msgs"][0] == {"role": "system", "content": "be terse"}
+
+
+# ── compare fan-out ──────────────────────────────────────────────────────────
+class TestCompare:
+    def test_order_preserved_and_all_collected(self, monkeypatch):
+        monkeypatch.setattr(cc, "_query_one",
+                            lambda e, p, s, t: {"label": e["label"], "ok": True,
+                                                "response": e["label"], "elapsed_ms": 1})
+        out = cc.compare("hi", endpoints=_eps())
+        assert [r["label"] for r in out["results"]] == ["local", "cloud-pro"]
+        assert out["blind"] is False
+        assert all(r["display"] == r["label"] for r in out["results"])
+
+    def test_blind_anonymizes_and_maps(self, monkeypatch):
+        monkeypatch.setattr(cc, "_query_one",
+                            lambda e, p, s, t: {"label": e["label"], "ok": True,
+                                                "response": "x", "elapsed_ms": 1})
+        out = cc.compare("hi", endpoints=_eps(), blind=True)
+        assert [r["display"] for r in out["results"]] == ["Model A", "Model B"]
+        assert out["mapping"] == {"Model A": "local", "Model B": "cloud-pro"}
+
+    def test_blind_tags_stay_unique_past_26(self, monkeypatch):
+        monkeypatch.setattr(cc, "_query_one",
+                            lambda e, p, s, t: {"label": e["label"], "ok": True,
+                                                "response": "x", "elapsed_ms": 1})
+        eps = [{"label": f"m{i}", "kind": "openai", "model": "z",
+                "base_url": "http://y/v1", "tier": "cookbook"} for i in range(28)]
+        out = cc.compare("hi", endpoints=eps, blind=True, max_workers=4)
+        assert len(out["mapping"]) == 28
+        assert out["results"][26]["display"] == "Model AA"
+        assert out["mapping"]["Model AB"] == "m27"
+
+    def test_empty_prompt(self):
+        assert cc.compare(" ")["results"] == []
+
+    def test_no_endpoints_note(self):
+        out = cc.compare("hi", endpoints=[])
+        assert out["results"] == [] and "no endpoints" in out["note"]
+
+    def test_one_failure_does_not_sink_others(self, monkeypatch):
+        import codec_llm
+        monkeypatch.setattr(codec_llm, "call",
+                            lambda m, **k: "ok" if k["model"] == "qwen"
+                            else (_ for _ in ()).throw(RuntimeError("boom")))
+        eps = [_eps()[0], {"label": "b", "kind": "openai", "model": "z",
+                           "base_url": "http://y/v1", "tier": "cookbook"}]
+        out = cc.compare("hi", endpoints=eps)
+        assert [r["ok"] for r in out["results"]] == [True, False]
+
+
+# ── endpoint discovery ───────────────────────────────────────────────────────
+class TestDefaultEndpoints:
+    def test_local_always_present(self, monkeypatch):
+        monkeypatch.setattr(cc, "_load_cfg", lambda: {})
+        monkeypatch.setattr(cc, "_cookbook_endpoints", lambda: [])
+        monkeypatch.setitem(sys.modules, "codec_ava_client",
+                            SimpleNamespace(load_config=lambda: None))
+        eps = cc.default_endpoints()
+        assert eps[0]["tier"] == "local" and eps[0]["kind"] == "openai"
+
+    def test_cloud_tiers_only_when_ava_ready(self, monkeypatch):
+        monkeypatch.setattr(cc, "_load_cfg", lambda: {})
+        monkeypatch.setattr(cc, "_cookbook_endpoints", lambda: [])
+        ready = SimpleNamespace(is_ready=lambda: True)
+        monkeypatch.setitem(sys.modules, "codec_ava_client",
+                            SimpleNamespace(load_config=lambda: ready))
+        labels = [e["label"] for e in cc.default_endpoints()]
+        assert "cloud-balanced" in labels and "cloud-pro" in labels
+
+    def test_cloud_tiers_absent_when_not_ready(self, monkeypatch):
+        monkeypatch.setattr(cc, "_load_cfg", lambda: {})
+        monkeypatch.setattr(cc, "_cookbook_endpoints", lambda: [])
+        notready = SimpleNamespace(is_ready=lambda: False)
+        monkeypatch.setitem(sys.modules, "codec_ava_client",
+                            SimpleNamespace(load_config=lambda: notready))
+        assert all(e["tier"] != "cloud" for e in cc.default_endpoints())
+
+    def test_config_overrides_cloud_tiers(self, monkeypatch):
+        cfg = {"compare": {"cloud_tiers": [{"label": "claude", "model": "claude-3-5-sonnet"}]}}
+        monkeypatch.setattr(cc, "_load_cfg", lambda: cfg)
+        monkeypatch.setattr(cc, "_cookbook_endpoints", lambda: [])
+        ready = SimpleNamespace(is_ready=lambda: True)
+        monkeypatch.setitem(sys.modules, "codec_ava_client",
+                            SimpleNamespace(load_config=lambda: ready))
+        cloud = [e for e in cc.default_endpoints() if e["tier"] == "cloud"]
+        assert len(cloud) == 1 and cloud[0]["model"] == "claude-3-5-sonnet"
+
+    def test_malformed_cloud_tier_entries_tolerated(self):
+        cfg = {"compare": {"cloud_tiers": ["oops", {"label": "x"}, {"model": "m1"}]}}
+        assert cc._cloud_tiers(cfg) == [{"model": "m1", "label": "m1"}]
+        fallback = cc._cloud_tiers({"compare": {"cloud_tiers": [{}]}})
+        assert fallback == cc._DEFAULT_CLOUD_TIERS
+
+    def test_load_cfg_rejects_non_object_json(self, monkeypatch, tmp_path):
+        path = tmp_path / "config.json"
+        path.write_text("[1, 2]", encoding="utf-8")
+        monkeypatch.setattr(cc, "_CONFIG_PATH", str(path))
+        assert cc._load_cfg() == {}
+
+    def test_cookbook_endpoints_skip_unhealthy(self, monkeypatch):
+        served = [
+            {"id": "a", "port": 8112, "hf_repo": "r/a", "pm2_status": "online", "healthy": True},
+            {"id": "b", "port": 8113, "hf_repo": "r/b", "pm2_status": "stopped", "healthy": False},
+        ]
+        monkeypatch.setitem(sys.modules, "codec_cookbook",
+                            SimpleNamespace(serve=SimpleNamespace(list_served=lambda: served)))
+        # import path inside _cookbook_endpoints is `from codec_cookbook import serve`
+        import codec_cookbook  # noqa: F401
+        eps = cc._cookbook_endpoints()
+        labels = [e["label"] for e in eps]
+        assert labels == ["cookbook-a"]  # only the healthy/online one
+        assert eps[0]["base_url"] == "http://127.0.0.1:8112/v1"
+
+
+# ── skill ────────────────────────────────────────────────────────────────────
+class TestSkill:
+    def test_discovered_and_exposed(self):
+        import codec_dispatch
+        codec_dispatch.load_skills()
+        reg = codec_dispatch.registry
+        assert "compare" in reg.names()
+        assert reg.get_mcp_expose("compare") is True
+
+    def test_parses_prompt_and_formats_labeled(self, monkeypatch):
+        import skills.compare as sc
+        monkeypatch.setattr(sc, "compare", lambda prompt, blind=False: {
+            "prompt": prompt, "blind": blind,
+            "results": [{"label": "local", "display": "local", "tier": "local",
+                         "ok": True, "response": "42", "elapsed_ms": 10}]})
+        out = sc.run("compare models: meaning of life")
+        assert "meaning of life" in out and "local" in out and "42" in out
+
+    def test_blind_flag_detected_and_key_shown(self, monkeypatch):
+        import skills.compare as sc
+        monkeypatch.setattr(sc, "compare", lambda prompt, blind=False: {
+            "prompt": prompt, "blind": blind,
+            "results": [{"label": "local", "display": "Model A", "tier": "local",
+                         "ok": True, "response": "x", "elapsed_ms": 5}],
+            "mapping": {"Model A": "local"}})
+        out = sc.run("blind compare what is 2+2")
+        assert "Model A" in out and "Key" in out and "local" in out
+
+    def test_empty_prompt_asks(self):
+        import skills.compare as sc
+        assert "What should I compare" in sc.run("compare models")
+
+    def test_failure_rendered(self, monkeypatch):
+        import skills.compare as sc
+        monkeypatch.setattr(sc, "compare", lambda prompt, blind=False: {
+            "prompt": prompt, "blind": False,
+            "results": [{"label": "cloud-pro", "display": "cloud-pro", "tier": "cloud",
+                         "ok": False, "error": "license expired", "elapsed_ms": 3}]})
+        out = sc.run("compare models hello")
+        assert "✗" in out and "license expired" in out