diff --git a/AGENTS.md b/AGENTS.md index 667a6d6..276da4c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -56,6 +56,8 @@ docs/ API.md, MCP_HTTP_SETUP.md, CONTEXT_REPORT.md, desig Other engine modules (`codec_overlays`, `codec_metrics`, `codec_logging`, `codec_gdocs`, `codec_google_auth`, `codec_cdp`, `codec_llm_proxy`, `codec_retry`, `codec_alerts`, `codec_search`, `codec_textassist`, `codec_watcher`, `codec_watchdog`) are internal helpers — read them when you need them, but they're not part of the navigation surface for an agent making structural changes. (Keyboard handling — wake word, F13 toggle, F18 voice, double-tap — lives **inline in `codec.py`** in the `codec` PM2 process; the old standalone `codec_keyboard.py` was deleted as a dead duplicate per A-8.) +**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `think`-block strip, `choices/reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches; codec.py voice-reply + `codec_session.qwen_call` are done, streaming (`codec_llm.stream()`) + the rest are pending. + ## 3. Agent + Crew runtime CODEC has its own minimalist multi-agent runtime in `codec_agents.py`. **Zero dependency on CrewAI or LangChain** — it's self-contained, only depends on `requests` and `codec_skill_registry`. 
diff --git a/codec.py b/codec.py index a2ffeaa..2ae8960 100644 --- a/codec.py +++ b/codec.py @@ -22,7 +22,7 @@ # ── CONFIG (single source of truth: codec_config.py) ───────────────────────── from codec_config import ( cfg as _cfg, - QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS, QWEN_VISION_URL, QWEN_VISION_MODEL, + QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS, WHISPER_URL, TASK_QUEUE_FILE, DRAFT_TASK_FILE, SESSION_ALIVE, STREAMING, WAKE_WORD, WAKE_ENERGY, WAKE_CHUNK_SEC, WAKE_PHRASES, @@ -71,7 +71,7 @@ def _is_wake_utterance(text: str) -> bool: # ── SHARED (from codec_core.py — single source of truth) ───────────────────── import codec_core as _core from codec_core import ( - strip_think, is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations, + is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations, transcribe, speak_text, focused_app, get_text_dialog, terminal_session_exists, # A-14 (PR-3G): `close_session` import dropped — codec.py defines its own @@ -96,50 +96,16 @@ def _is_wake_utterance(text: str) -> bool: # safety gate AND plugin lifecycle hooks (run_with_hooks), both of which the # legacy path bypassed. -# ── VISION (Gemini Flash or local Qwen VL) ────────────────────────────────── -def _gemini_vision(img_b64, prompt, max_tokens=800): - """Call Gemini Flash vision API. 
Fast, reliable, free tier.""" - import requests - url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}" - payload = { - "contents": [{"parts": [ - {"inlineData": {"mimeType": "image/png", "data": img_b64}}, - {"text": prompt} - ]}], - "generationConfig": {"maxOutputTokens": max_tokens} - } - r = requests.post(url, json=payload, timeout=30) - if r.status_code == 200: - candidates = r.json().get("candidates", []) - if candidates: - parts = candidates[0].get("content", {}).get("parts", []) - if parts: - return parts[0].get("text", "").strip() - else: - print(f"[CODEC] Gemini error {r.status_code}: {r.text[:200]}") - return "" - -def _local_vision(img_b64, prompt, max_tokens=800): - """Call local Qwen VL vision API (fallback).""" - import requests - r = requests.post(f"{QWEN_VISION_URL}/chat/completions", - json={"model": QWEN_VISION_MODEL, - "messages": [{"role": "user", "content": [ - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}, - {"type": "text", "text": prompt} - ]}], "max_tokens": max_tokens}, timeout=60) - if r.status_code == 200: - return r.json()["choices"][0]["message"].get("content", "").strip() - return "" +# ── VISION (A-11, PR-3E: canonical helper in codec_vision) ────────────────── +# The Gemini-Flash → local-Qwen-VL fallback used to be hand-rolled here (and in +# codec_voice + codec_session). It now lives in codec_vision; this is a thin +# delegate kept for any caller of codec.vision_describe. +import codec_vision +import codec_llm # A-12: canonical chat/completions caller def vision_describe(img_b64, prompt="Read all visible text on this screen. Include app name, window title, and all message/content text. 
Output raw text only.", max_tokens=800): - """Route vision to Gemini or local based on config.""" - if VISION_PROVIDER == "gemini" and GEMINI_API_KEY: - result = _gemini_vision(img_b64, prompt, max_tokens) - if result: - return result - print("[CODEC] Gemini failed, falling back to local vision...") - return _local_vision(img_b64, prompt, max_tokens) + """Route vision to Gemini or local based on config (codec_vision).""" + return codec_vision.describe_sync(img_b64, prompt, mime="image/png", max_tokens=max_tokens) def screenshot_ctx(): try: @@ -444,55 +410,41 @@ def _post_skill_screenshot(): push(lambda: show_processing_overlay('Thinking...', 15000)) try: - import requests as _llm_req - headers = {} - if LLM_API_KEY: - headers["Authorization"] = f"Bearer {LLM_API_KEY}" - payload = { - "model": QWEN_MODEL, - "messages": llm_messages, - "max_tokens": 400, - "temperature": 0.7, - "chat_template_kwargs": {"enable_thinking": False}, - } - payload.update(LLM_KWARGS) - r = _llm_req.post(f"{QWEN_BASE_URL}/chat/completions", json=payload, headers=headers, timeout=120) - if r.status_code == 200: - data = r.json() - answer = data.get("choices", [{}])[0].get("message", {}).get("content", "") - answer = strip_think(answer).strip() - if answer: - print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}") - log_event("tts_speak", "open-codec", - f"TTS: {answer[:60]}", - extra={"text_len": len(answer)}) - # Add assistant response to session history - voice_session["messages"].append({"role": "assistant", "content": answer}) - voice_session["turn_count"] += 1 - # Save response to DB (A-20: via codec_core helper with - # WAL + busy_timeout — replaces the inline lock-prone - # sqlite3.connect that risked "database is locked" under - # concurrent agent-runner + voice writes). Never raises. 
- update_session_response(rid, answer[:500]) - # Save to shared memory (same store as Chat) - try: - cm = CodecMemory() - cm.save("voice", "user", task) - cm.save("voice", "assistant", answer) - except Exception as e: - log.warning(f"[CODEC] Memory save failed after LLM: {e}") - _last_tts_text = answer[:200] - speak_text(answer) - _safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"') - subprocess.Popen(["osascript", "-e", - f'display notification "{_safe_ans}" with title "CODEC"'], - stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - else: - print("[CODEC] Voice LLM returned empty response") - speak_text("Sorry, I didn't get a response.") + # A-12 (PR-3E): canonical codec_llm.call replaces the inline + # chat/completions POST + headers + enable_thinking + strip + + # choices parse. Returns the stripped answer, or "" on any failure + # (non-200 and empty now collapse to the same apology). + answer = codec_llm.call( + llm_messages, base_url=QWEN_BASE_URL, model=QWEN_MODEL, + api_key=LLM_API_KEY, max_tokens=400, temperature=0.7, + timeout=120, retries=1, extra_kwargs=LLM_KWARGS, + ) + if answer: + print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}") + log_event("tts_speak", "open-codec", + f"TTS: {answer[:60]}", + extra={"text_len": len(answer)}) + # Add assistant response to session history + voice_session["messages"].append({"role": "assistant", "content": answer}) + voice_session["turn_count"] += 1 + # Save response to DB (A-20: codec_core helper, WAL + busy_timeout). 
+ update_session_response(rid, answer[:500]) + # Save to shared memory (same store as Chat) + try: + cm = CodecMemory() + cm.save("voice", "user", task) + cm.save("voice", "assistant", answer) + except Exception as e: + log.warning(f"[CODEC] Memory save failed after LLM: {e}") + _last_tts_text = answer[:200] + speak_text(answer) + _safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"') + subprocess.Popen(["osascript", "-e", + f'display notification "{_safe_ans}" with title "CODEC"'], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) else: - print(f"[CODEC] Voice LLM error: {r.status_code} {r.text[:200]}") - speak_text("Sorry, the language model is not responding.") + print("[CODEC] Voice LLM returned no response") + speak_text("Sorry, I didn't get a response.") except Exception as e: log.error("Voice LLM call failed: %s", e) import traceback; traceback.print_exc() diff --git a/codec_llm.py b/codec_llm.py new file mode 100644 index 0000000..40760ff --- /dev/null +++ b/codec_llm.py @@ -0,0 +1,108 @@ +"""CODEC LLM call helper — the single canonical OpenAI-style chat/completions caller. + +A-12 (PR-3E): before this, ~45 sites hand-rolled the same `chat/completions` +POST — build headers (`Authorization: Bearer …`, `Content-Type`), assemble the +payload (`model`/`messages`/`max_tokens`/`temperature`/ +`chat_template_kwargs.enable_thinking=False`), parse `choices[0].message` +(content, with a `reasoning` fallback), and strip ``. A model +upgrade or API-shape fix then meant editing 20+ places. + +This module centralizes the **non-streaming** call. It is intentionally +config-agnostic — each caller passes its own `base_url` / `model` / `api_key` +/ tuning — so it's a pure "build payload → POST → parse" helper with no import +cycle into codec_config. (Streaming SSE + the remaining call sites are migrated +in later A-12 tranches; this PR covers the call() API + codec.py + codec_session.) 
+ +NOTE: `codec_llm_proxy` is a *priority queue* (semaphore), not an HTTP proxy — +orthogonal to this module. Callers that want prioritization still wrap the call +in `llm_queue_sync(...)`; behavior parity for the migrated sites means we do NOT +add queue acquisition here (none of them used it). +""" +from __future__ import annotations + +import logging +import re +import time +from typing import Any, Dict, List, Optional + +log = logging.getLogger("codec.llm") + +_THINK_RE = re.compile(r".*?", re.DOTALL) + + +def strip_think(text: str) -> str: + """Remove reasoning blocks and surrounding whitespace.""" + if not text: + return "" + return _THINK_RE.sub("", text).strip() + + +def extract_content(response_json: Dict[str, Any]) -> str: + """Pull the assistant text from an OpenAI-style response: prefer + `choices[0].message.content`, fall back to `.reasoning` (some local + servers put the answer there when content is empty). `` stripped. + Returns "" on any shape mismatch.""" + try: + msg = response_json["choices"][0]["message"] + except (KeyError, IndexError, TypeError): + return "" + content = (msg.get("content") or "").strip() + if content: + return strip_think(content) + reasoning = (msg.get("reasoning") or "").strip() + if reasoning: + return strip_think(reasoning) + return "" + + +def call( + messages: List[Dict[str, Any]], + *, + base_url: str, + model: str, + api_key: str = "", + max_tokens: int = 500, + temperature: float = 0.7, + timeout: float = 120.0, + retries: int = 1, + enable_thinking: bool = False, + extra_kwargs: Optional[Dict[str, Any]] = None, +) -> str: + """POST `messages` to `/chat/completions` and return the parsed, + ``-stripped assistant text (or "" on failure). + + `retries` includes the first attempt (retries=3 → up to 3 tries with + exponential 2**n backoff between them, matching codec_session.qwen_call). + Never raises — network/parse errors are logged and yield "". 
+ """ + import requests + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = "Bearer " + api_key + payload: Dict[str, Any] = { + "model": model, + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature, + "chat_template_kwargs": {"enable_thinking": enable_thinking}, + } + if extra_kwargs: + payload.update(extra_kwargs) + + attempts = max(1, retries) + url = base_url.rstrip("/") + "/chat/completions" + for attempt in range(attempts): + try: + r = requests.post(url, json=payload, headers=headers, timeout=timeout) + if r.status_code == 200: + resp = extract_content(r.json()) + if resp: + return resp + # 200 but empty/odd shape — don't retry, nothing more to get. + return "" + log.warning("LLM call %s returned %s: %s", url, r.status_code, r.text[:200]) + except Exception as e: + log.warning("LLM call attempt %d/%d failed: %s", attempt + 1, attempts, e) + if attempt < attempts - 1: + time.sleep(2 ** attempt) + return "" diff --git a/codec_session.py b/codec_session.py index db49329..342a35b 100644 --- a/codec_session.py +++ b/codec_session.py @@ -72,15 +72,9 @@ def strip_think(t): return re.sub(r".*?", "", t, flags=re.DOTALL).strip() -def extract_content(rj): - msg = rj["choices"][0]["message"] - c = msg.get("content", "").strip() - if c: - return strip_think(c) - r = msg.get("reasoning", "").strip() - if r: - return strip_think(r) - return "" +# A-12 (PR-3E): local `extract_content` removed — its only caller was `qwen_call`, +# now migrated to codec_llm.call (which owns the canonical content→reasoning +# extraction). `strip_think` above is kept; qwen_stream still uses it. 
def clean_resp(text): @@ -210,26 +204,12 @@ def screenshot_ctx(self): ib = base64.b64encode(f.read()).decode() os.unlink(tmp.name) print("[C] Reading screen...") - import requests - r = requests.post( - self.qwen_vision_url + "/chat/completions", - json={ - "model": self.qwen_vision_model, - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "data:image/png;base64," + ib}}, - {"type": "text", "text": "Read all visible text. Include app name and content. Raw text only."}, - ], - } - ], - "max_tokens": 800, - }, - timeout=120, - ) - if r.status_code == 200: - return r.json()["choices"][0]["message"].get("content", "")[:2000] + # A-11 (PR-3E): canonical vision helper. Was local-Qwen-VL only here; + # now gains the Gemini-Flash fallback for free (config-gated). + import codec_vision + return codec_vision.describe_sync( + ib, "Read all visible text. Include app name and content. Raw text only.", + mime="image/png", max_tokens=800)[:2000] except Exception as e: log.warning(f"Screenshot capture or vision analysis failed: {e}") return "" @@ -266,28 +246,14 @@ def speak(self, text): # ── LLM Calls ──────────────────────────────────────────────────────── def qwen_call(self, messages): - import requests - headers = {"Content-Type": "application/json"} - if self.llm_api_key: - headers["Authorization"] = "Bearer " + self.llm_api_key - payload = {"model": self.qwen_model, "messages": messages, "max_tokens": 500, "temperature": 0.5} - payload.update(self.llm_kwargs) - for attempt in range(3): - try: - r = requests.post( - self.qwen_base_url + "/chat/completions", - json=payload, - headers=headers, - timeout=90, - ) - if r.status_code == 200: - resp = extract_content(r.json()) - if resp: - return resp - except Exception as e: - log.warning(f"LLM API call attempt {attempt+1} failed: {e}") - time.sleep(2 ** attempt) - return "" + # A-12 (PR-3E): canonical codec_llm.call (3 retries + backoff, content→ + # reasoning extraction, strip) — was 
an inline chat/completions POST. + import codec_llm + return codec_llm.call( + messages, base_url=self.qwen_base_url, model=self.qwen_model, + api_key=self.llm_api_key, max_tokens=500, temperature=0.5, + timeout=90, retries=3, extra_kwargs=self.llm_kwargs, + ) def qwen_stream(self, messages): import requests diff --git a/codec_vision.py b/codec_vision.py new file mode 100644 index 0000000..318e7b0 --- /dev/null +++ b/codec_vision.py @@ -0,0 +1,147 @@ +"""CODEC Vision — the single canonical screen-vision helper (A-11, PR-3E). + +Before this, the Gemini-Flash → local-Qwen-VL fallback was hand-rolled in three +places with drifting shapes: `codec.py` (sync), `codec_voice._analyze_screenshot` +(async), and `codec_session.screenshot_ctx` (sync, local-only). A model upgrade +or vision-API fix meant editing all three. + +Canonical API: + describe_sync(image_b64, prompt, *, mime, max_tokens) -> str + await describe_async(image_b64, prompt, *, mime, max_tokens, http) -> str + +Both: try Gemini Flash first (when `vision_provider == "gemini"` and a key is +present), fall back to the local Qwen-VL `/chat/completions` endpoint. Return +the description text, or "" on failure. Config is read live from codec_config +(so provider/model/key changes + Keychain migration take effect without restart). +""" +from __future__ import annotations + +import logging +from typing import Any, Optional, Tuple + +log = logging.getLogger("codec.vision") + +_GEMINI_MODEL = "gemini-2.0-flash" + + +def _vision_config() -> Tuple[str, str, str, str]: + """(provider, gemini_key, local_url, local_model) read live from config. 
+ Falls back to safe defaults if codec_config can't be imported.""" + try: + from codec_config import cfg, QWEN_VISION_URL, QWEN_VISION_MODEL, get_gemini_api_key + gem = get_gemini_api_key() or "" + provider = cfg.get("vision_provider", "gemini" if gem else "local") + return provider, gem, QWEN_VISION_URL, QWEN_VISION_MODEL + except Exception as e: # pragma: no cover — defensive + log.warning("vision config unavailable: %s", e) + return "local", "", "http://localhost:8082/v1", "qwen-vl" + + +def _gemini_payload(image_b64: str, prompt: str, mime: str, max_tokens: int) -> dict: + return { + "contents": [{"parts": [ + {"inlineData": {"mimeType": mime, "data": image_b64}}, + {"text": prompt}, + ]}], + "generationConfig": {"maxOutputTokens": max_tokens}, + } + + +def _gemini_url(api_key: str) -> str: + return (f"https://generativelanguage.googleapis.com/v1beta/models/" + f"{_GEMINI_MODEL}:generateContent?key={api_key}") + + +def _parse_gemini(rj: dict) -> str: + try: + parts = rj.get("candidates", [])[0].get("content", {}).get("parts", []) + return (parts[0].get("text", "") if parts else "").strip() + except (IndexError, AttributeError, TypeError): + return "" + + +def _local_payload(image_b64: str, prompt: str, mime: str, model: str, max_tokens: int) -> dict: + return { + "model": model, + "messages": [{"role": "user", "content": [ + {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image_b64}"}}, + {"type": "text", "text": prompt}, + ]}], + "max_tokens": max_tokens, + } + + +def _parse_local(rj: dict) -> str: + try: + return (rj["choices"][0]["message"].get("content") or "").strip() + except (KeyError, IndexError, TypeError): + return "" + + +def describe_sync(image_b64: str, prompt: str, *, mime: str = "image/png", + max_tokens: int = 800, timeout: float = 120.0) -> str: + """Synchronous (requests) vision describe. 
Gemini Flash → local Qwen-VL.""" + import requests + provider, gem_key, local_url, local_model = _vision_config() + + if provider == "gemini" and gem_key: + try: + r = requests.post(_gemini_url(gem_key), + json=_gemini_payload(image_b64, prompt, mime, max_tokens), + timeout=min(timeout, 30.0)) + if r.status_code == 200: + txt = _parse_gemini(r.json()) + if txt: + return txt + log.info("Gemini vision %s; falling back to local", r.status_code) + except Exception as e: + log.info("Gemini vision error (%s); falling back to local", e) + + try: + r = requests.post(local_url.rstrip("/") + "/chat/completions", + json=_local_payload(image_b64, prompt, mime, local_model, max_tokens), + headers={"Content-Type": "application/json"}, timeout=timeout) + if r.status_code == 200: + return _parse_local(r.json()) + log.warning("Local vision returned %s: %s", r.status_code, r.text[:200]) + except Exception as e: + log.warning("Local vision error: %s", e) + return "" + + +async def describe_async(image_b64: str, prompt: str, *, mime: str = "image/jpeg", + max_tokens: int = 500, timeout: float = 120.0, + http: Optional[Any] = None) -> str: + """Async (httpx) vision describe. Gemini Flash → local Qwen-VL. Reuses the + caller's httpx client if given (e.g. 
VoicePipeline._http), else makes one.""" + import httpx + provider, gem_key, local_url, local_model = _vision_config() + own_client = http is None + client = http or httpx.AsyncClient(timeout=timeout) + try: + if provider == "gemini" and gem_key: + try: + r = await client.post(_gemini_url(gem_key), + json=_gemini_payload(image_b64, prompt, mime, max_tokens), + timeout=min(timeout, 30.0)) + if r.status_code == 200: + txt = _parse_gemini(r.json()) + if txt: + return txt + log.info("Gemini vision %s; falling back to local", r.status_code) + except Exception as e: + log.info("Gemini vision error (%s); falling back to local", e) + + try: + r = await client.post(local_url.rstrip("/") + "/chat/completions", + json=_local_payload(image_b64, prompt, mime, local_model, max_tokens), + headers={"Content-Type": "application/json"}, timeout=timeout) + if r.status_code == 200: + return _parse_local(r.json()) + log.warning("Local vision returned %s: %s", r.status_code, r.text[:200]) + except Exception as e: + log.warning("Local vision error: %s", e) + return "" + finally: + if own_client: + await client.aclose() diff --git a/codec_voice.py b/codec_voice.py index 9f24251..d02de9a 100644 --- a/codec_voice.py +++ b/codec_voice.py @@ -666,54 +666,12 @@ async def _analyze_screenshot(self, image_b64: str, user_text: str) -> str: "Focus on the main content, app, or task visible. " "Be specific about text, UI elements, and what the user appears to be working on." 
) - # Try Gemini Flash first (fast, reliable) - if VISION_PROVIDER == "gemini" and GEMINI_API_KEY: - try: - url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}" - payload = { - "contents": [{"parts": [ - {"inlineData": {"mimeType": "image/jpeg", "data": image_b64}}, - {"text": prompt} - ]}], - "generationConfig": {"maxOutputTokens": 500} - } - print("[Voice] Sending to Gemini Flash vision...") - r = await self._http.post(url, json=payload, timeout=30.0) - if r.status_code == 200: - candidates = r.json().get("candidates", []) - if candidates: - parts = candidates[0].get("content", {}).get("parts", []) - if parts: - result = parts[0].get("text", "").strip() - if result: - print(f"[Voice] Gemini vision OK: {len(result)} chars") - return result - print(f"[Voice] Gemini failed ({r.status_code}), falling back to local...") - except Exception as e: - print(f"[Voice] Gemini error: {e}, falling back to local...") - - # Fallback: local Qwen VL - payload = { - "model": VISION_MODEL, - "messages": [{"role": "user", "content": [ - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}, - {"type": "text", "text": prompt}, - ]}], - "max_tokens": 500, - "temperature": 0.7, - } - try: - r = await self._http.post( - VISION_URL, json=payload, - headers={"Content-Type": "application/json"}, - timeout=120.0, - ) - if r.status_code == 200: - return r.json()["choices"][0]["message"]["content"].strip() - print(f"[Voice] Vision model returned {r.status_code}: {r.text[:200]}") - except Exception as e: - print(f"[Voice] Vision analysis error: {e}") - return "" + # A-11 (PR-3E): canonical vision helper (Gemini Flash → local Qwen-VL). + # Reuses this pipeline's httpx client. Was an inline duplicate of the + # same fallback logic in codec.py + codec_session. 
+ import codec_vision + return await codec_vision.describe_async( + image_b64, prompt, mime="image/jpeg", max_tokens=500, http=self._http) async def generate_response(self, user_text: str): self.messages.append({"role": "user", "content": user_text}) diff --git a/docs/PR3E-LLM-VISION-DEDUP-DESIGN.md b/docs/PR3E-LLM-VISION-DEDUP-DESIGN.md new file mode 100644 index 0000000..92a5829 --- /dev/null +++ b/docs/PR3E-LLM-VISION-DEDUP-DESIGN.md @@ -0,0 +1,117 @@ +# PR-3E — LLM-call + vision dedup (DESIGN) + +**Status:** IMPLEMENTED — **Option 2** chosen (A-11 vision dedup + A-12 canonical `codec_llm` API + first chat tranche). See §8 for what actually shipped. +**Findings:** A-11 (vision dup, MEDIUM) + A-12 (51→45 `chat/completions` sites, MEDIUM, audit-flagged **large**). +**Wave:** 3. This is the **hottest code path in the repo** (every feature calls an LLM), so it gets design-first + a phased plan. + +--- + +## 1. Reality check (what the trace found) + +- **`codec_llm_proxy.py` is NOT a proxy.** It's a priority *queue* (semaphore) — its own docstring: *"Does NOT proxy HTTP — callers still make their own requests."* So A-12's "the module already exists, just add `call()`/`stream()`" is **inaccurate**: there is no call/stream helper to reuse. A-12 means **building a new canonical call API** (which uses the queue internally). +- **45 `chat/completions` sites** (was 51; some removed in earlier PRs) across **three shapes**: sync `requests`, async `httpx`, and streaming SSE — with copy-pasted headers, `Authorization: Bearer`, `enable_thinking=False`, `` stripping, and `choices[0].message.content`/`.reasoning` parsing. +- **A-11 vision = 3 divergent impls:** + - `codec.py` `vision_describe`/`_gemini_vision`/`_local_vision` — **sync** (`requests`), Gemini-flash → local-Qwen-VL fallback, PNG. + - `codec_voice._analyze_screenshot` — **async** (`httpx`), Gemini → local fallback, JPEG. 
+ - `codec_session.screenshot_ctx` — sync, local-Qwen-VL **only** (no Gemini), PNG, with inline screencapture. + +## 2. Why this is high-risk + +These are the call paths behind voice, chat, vision, agents, bridges. A subtle +regression in payload shape, `think`-block stripping, streaming chunk parsing, +timeout, or error handling silently degrades a core feature. Blast radius = +everything. So: **small, behavior-parity tranches with mocked-HTTP tests that +assert payload/response equivalence — never a 45-site big-bang.** + +## 3. Recommended plan — split A-11 from A-12, phase A-12 + +The audit lumps A-11 + A-12 as "PR-3E," but they're independent and A-12 is +"large." Recommended: + +### This PR (PR-3E) — **A-11 vision dedup only** (contained, ~3 consumers) +- New **`codec_vision.py`**: the single canonical vision helper. + - `describe_sync(image_b64, prompt, *, mime="image/png", max_tokens=800) -> str` + - `async describe_async(image_b64, prompt, *, mime="image/jpeg", max_tokens=500, http=None) -> str` + - Both: Gemini-flash (if `VISION_PROVIDER=="gemini"` and key present) → local-Qwen-VL fallback, reading config (`vision_base_url`, `vision_model`, `get_gemini_api_key`). One place to change the model / provider / API shape. +- Migrate the 3 consumers to delegate: + - `codec.py`: `vision_describe` → `codec_vision.describe_sync`; drop `_gemini_vision`/`_local_vision`. + - `codec_voice._analyze_screenshot` → `await codec_vision.describe_async(..., http=self._http)`. + - `codec_session.screenshot_ctx` → `codec_vision.describe_sync` (gains Gemini fallback it lacked — a minor *improvement*, behaviorally a superset; flagged in the PR). +- **Tests:** mock HTTP; assert Gemini-first + local-fallback, payload shapes, mime handling, empty-on-failure. ~8 tests. +- **Risk:** medium-low (vision is less hot than chat; 3 well-understood sites). Behavior parity except session gaining the Gemini fallback (documented). 
+ +### Follow-on (PR-3E-2+, separate design) — **A-12 chat/completions** +- Build **`codec_llm.py`**: `call(messages, *, model, temperature, max_tokens, priority, **kw) -> str` (sync) + `stream(...)` (SSE generator) + an async variant. Centralizes headers, `enable_thinking`, `think`-block strip, `choices/reasoning` parse, queue-slot acquisition, timeouts, error shape. +- Migrate the 45 sites **in small tranches by subsystem**, each its own PR with parity tests: e.g. (1) codec.py + codec_session, (2) dashboard, (3) voice, (4) agents/agent_plan/agent_runner, (5) bridges (telegram/imessage), (6) misc (compaction/self_improve/watcher/textassist/dictate). Each tranche is independently revertable. +- This is deliberately **not** in this PR — 45 hot-path sites in one diff is unreviewable + high-risk. + +## 4. API / schema changes +- New module `codec_vision.py` (this PR). No on-disk schema, no config changes + (reuses existing `vision_*` config keys + `get_gemini_api_key`). +- `codec.py` loses `_gemini_vision`/`_local_vision` (internal); `vision_describe` + kept as a thin delegate for any external caller. +- (A-12's `codec_llm.py` is a later PR.) + +## 5. Test plan (this PR — A-11) +- New `tests/test_vision_dedup.py`: + - `describe_sync`: Gemini path returns text; Gemini failure → local fallback; + both fail → `""`; correct payload shape per provider; mime respected. + - `describe_async`: same matrix with a mocked httpx client. + - Source invariants: codec.py no longer defines `_gemini_vision`/`_local_vision`; + voice + session call `codec_vision`. +- Regression: full suite (expect the 23 known failures, zero new). No `skills/` + touched → no manifest regen. +- Manual (Mac Studio): voice "look at my screen" + a chat screenshot still + describe correctly via both providers. + +## 6. Risk + rollback +- **Blast radius (this PR):** 3 files edited + 1 new module. Vision only — chat + paths untouched. +- **Rollback:** single-commit revert restores the inline impls. 
No persistent + state touched. +- A-12 risk is deferred to its own phased PRs (each small + revertable). + +## 7. Open question for you (Mickael) +**Q: scope of PR-3E?** +- **Option 1 (recommended):** PR-3E = **A-11 vision dedup only**, now. A-12 + (chat/completions) becomes its own phased effort with a separate design doc + (build `codec_llm.call/stream` + migrate sites tranche-by-tranche). Keeps every + PR reviewable + low-risk on the hottest path. +- **Option 2:** PR-3E = A-11 **+** A-12's canonical `codec_llm` API **+** the + first chat tranche (codec.py + codec_session). Bigger, riskier single PR. +- **Option 3:** Do A-12 API first (no A-11 yet). + +I recommend **Option 1**. Pick one and I'll implement + open the PR +(chat-review-then-merge — hot path). + +> **Decision: Option 2.** Mickael chose A-11 + A-12-API + first chat tranche in +> one PR. Implemented as §8 below. + +--- + +## 8. Implementation (shipped — Option 2) + +### New modules +- **`codec_vision.py` (A-11)** — single canonical screen-vision helper. + - `describe_sync(image_b64, prompt, *, mime="image/png", max_tokens=800, timeout=120.0) -> str` + - `async describe_async(image_b64, prompt, *, mime="image/jpeg", max_tokens=500, timeout=120.0, http=None) -> str` + - Both: Gemini-flash (`gemini-2.0-flash`, when `vision_provider=="gemini"` + key present) → local-Qwen-VL `/chat/completions` fallback; return `""` on total failure. + - `_vision_config()` reads provider/key/url/model **live** from `codec_config` each call (so provider/model/Keychain changes take effect without restart); safe defaults if `codec_config` can't import. + - `describe_async` reuses the caller's httpx client when passed (`http=self._http`), else makes + closes its own. +- **`codec_llm.py` (A-12 — canonical call API)** — config-agnostic (no `codec_config` import → no import cycle). + - `strip_think(text)` — drops `think` blocks (DOTALL) + trims. 
+ - `extract_content(response_json)` — `choices[0].message.content` → `.reasoning` fallback, `think`-block-stripped, `""` on malformed shape. + - `call(messages, *, base_url, model, api_key="", max_tokens=500, temperature=0.7, timeout=120.0, retries=1, enable_thinking=False, extra_kwargs=None) -> str` — builds headers (`Bearer` only when `api_key`), payload (`model`/`messages`/`max_tokens`/`temperature`/`chat_template_kwargs.enable_thinking` + merged `extra_kwargs`), POSTs to `base_url.rstrip("/")+"/chat/completions"`, retries with `2**attempt` backoff, returns extracted+stripped text or `""` (never raises). + +### Migrated sites (this PR) +- **`codec.py`** — `vision_describe` → `codec_vision.describe_sync` (deleted `_gemini_vision`/`_local_vision`); voice-reply chat block in `_dispatch_inner` → `codec_llm.call`. Removed now-unused imports (`QWEN_VISION_URL`, `QWEN_VISION_MODEL`, `strip_think`). +- **`codec_voice.py`** — `_analyze_screenshot` → `await codec_vision.describe_async(..., http=self._http)`. (Module-level `VISION_PROVIDER`/`GEMINI_API_KEY` retained — still used by observer transport logic; `VISION_URL`/`VISION_MODEL` now vestigial, cleanup deferred to the voice A-12 tranche.) +- **`codec_session.py`** — `screenshot_ctx` → `codec_vision.describe_sync` (**gains** the Gemini fallback it previously lacked — a documented behavioral superset); `qwen_call` → `codec_llm.call` (retries=3). + +### Deliberately deferred (follow-on tranches, each its own PR + design) +- `codec_session.qwen_stream` (SSE streaming) — needs a `codec_llm.stream()` generator; not in this PR. +- The remaining ~40 `chat/completions` sites (dashboard, voice generate_response, agents/agent_plan/agent_runner, bridges, compaction/self_improve/watcher/textassist/dictate) — migrated tranche-by-tranche per §3. 
+ +### Tests +- **`tests/test_llm_vision_dedup.py`** — 19 tests: `strip_think`/`extract_content` matrix; `codec_llm.call` success / no-key-omits-auth / retries-then-empty / exception-returns-empty; `codec_vision.describe_sync` gemini-first / gemini→local fallback / local-only-when-provider-local / both-fail-empty; `describe_async` gemini + fallback (driven via `asyncio.run` + a fake httpx client — no `pytest-asyncio` dependency); source-level migration invariants (codec.py/voice/session call the canonical helpers, inline impls gone). +- Full suite: **23 known-baseline failures, zero new.** No `skills/` touched → no manifest regen. diff --git a/docs/audits/PHASE-1-CODE-QUALITY.md b/docs/audits/PHASE-1-CODE-QUALITY.md index b6113f1..1c16749 100644 --- a/docs/audits/PHASE-1-CODE-QUALITY.md +++ b/docs/audits/PHASE-1-CODE-QUALITY.md @@ -113,12 +113,16 @@ Both scan `SKILLS_DIR` independently, so a skill file is loaded twice in differe **Effort:** small ### A-11 — `vision_describe` / `_gemini_vision` / `_local_vision` duplicated between codec.py and codec_voice.py [MEDIUM] + +> **Closed by PR-3E** (Option 2). New canonical **`codec_vision.py`** (`describe_sync` + `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`). All three consumers now delegate: `codec.py:vision_describe` (deleted `_gemini_vision`/`_local_vision`), `codec_voice._analyze_screenshot` (async, reuses `self._http`), `codec_session.screenshot_ctx` (now **gains** the Gemini fallback it previously lacked — documented behavioral superset). A model/provider/API change is now a one-file edit. Pinned by `tests/test_llm_vision_dedup.py` (describe_sync/async provider matrix + source-invariant checks). See `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md` §8. 
**Location:** `codec.py:69-111` (full implementation) and `codec_voice.py:659-714` (`_analyze_screenshot` — different shape but same Gemini Flash + local Qwen VL fallback pattern, same hardcoded `gemini-2.0-flash` model name and OpenAI vision-message shape). `codec_session.py:202-238` has yet another inline `screenshot_ctx` + vision call. **Impact:** When the user upgrades Gemini model, switches vision provider, or fixes a vision-API regression, they have to touch 3 places. Investor-grade: violates "single source of truth" principle stated in CLAUDE.md §10 and codec_core.py docstring. **Recommended fix:** Move `vision_describe` + provider routing into `codec_core.py` (or a new `codec_vision.py`) as the single canonical helper. Update `codec_voice.py._analyze_screenshot` to call it; update `codec_session.py:screenshot_ctx` to call it. **Effort:** medium ### A-12 — 51 separate `chat/completions` HTTP call sites with copy-pasted payload shapes [MEDIUM] + +> **First tranche closed by PR-3E** (Option 2); remainder phased. The audit's premise that `codec_llm_proxy.py` already has a `call()`/`stream()` to reuse was **inaccurate** — that module is a priority *queue*, not an HTTP caller. PR-3E instead built the genuinely-new canonical **`codec_llm.py`** (`call()` non-streaming, plus `strip_think`/`extract_content`: headers, `Bearer` auth, `chat_template_kwargs.enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). **Migrated this PR:** `codec.py` voice-reply chat + `codec_session.qwen_call`. **Deferred (each its own tranche/PR + design):** `codec_session.qwen_stream` (needs `codec_llm.stream()` SSE generator) + the remaining ~40 sites (dashboard, voice `generate_response`, agents/agent_plan/agent_runner, telegram/imessage bridges, compaction/self_improve/watcher/textassist/dictate). Pinned by `tests/test_llm_vision_dedup.py`. See `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md` §3 (phased plan) + §8 (shipped). 
**Location:** Sample sites: `codec.py:702`, `codec_dashboard.py:980,1076,1215`, `codec_voice.py:180,196,208,213`, `codec_session.py:215,278,307`, `codec_agents.py:51`, `codec_agent_plan.py:239`, `codec_agent_runner.py:148`, `codec_compaction.py:78`, `codec_self_improve.py:238`, `codec_telegram.py:471,508`, `codec_imessage.py:341,391`, `codec_textassist.py:33`, `codec_dictate.py:492`, `codec_watcher.py:86,182`. Total: 51 occurrences via `grep -rn "chat/completions"`. **Impact:** Each site repeats: headers build, `Authorization: Bearer {api_key}` formatting, `{Content-Type: application/json}`, payload assembly with `chat_template_kwargs.enable_thinking=False`, `<think>` stripping, `try/except` for `r.json()` shape (`choices[0].message.content` or `.reasoning` fallback). Many also re-implement streaming SSE parsing. When the Qwen-3.6 upgrade landed, this is exactly the kind of change that needs to be applied in 20+ places. **Recommended fix:** Add `codec_llm_proxy.call(messages, **kwargs)` and `codec_llm_proxy.stream(messages, **kwargs)` as the single canonical API (the module already exists at `codec_llm_proxy.py`, only 130 LOC, only used by codec_voice + codec_agents). Migrate all 51 sites over the course of the Phase 1 hardening. As a first step, just covering the 5 sites in codec.py + codec_dashboard.py + codec_session.py would remove ~80% of the most-edited duplication. diff --git a/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md b/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md index 6c1ed9b..99ccb88 100644 --- a/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md +++ b/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md @@ -242,7 +242,7 @@ Mirror the Intake Phase 3 wave pattern. 
7 waves planned; sizes are PR-counts, NO - PR-3C: A-16 + A-17 + A-21 — wire `WAKE_PHRASES` (deduped homophone keywords + length-guarded phrase match in a testable `_is_wake_utterance`) + wire `draft_keywords` into `codec_core.is_draft` + remove dead `AGENT_NAME` constant ✅ (branch `fix/pr3c-wire-config-knobs`; 13 tests; zero net-new ruff; full suite 1338 passing). **A-4 (skill-loader unification) deliberately split out → its own PR**: it refactors the LIVE multi-file skill-dispatch path (`codec_core.load_skills` is called from codec.py + dashboard ×2 + voice + agent_runner), needs a careful voice-path test pass, and doesn't belong bundled with these contained config-wiring fixes. - A-4: skill-loader unification ✅ (branch `fix/pr3-a4-skill-loader-unification`, design-first per §11 → `docs/A4-SKILL-LOADER-UNIFICATION-DESIGN.md`). Deleted legacy `codec_core.{loaded_skills,load_skills,run_skill}`; codec.py + cortex_skills now use canonical `codec_dispatch` registry. Closed a real **security gap** (legacy path skipped the PR-1A AST gate) + a **hooks bypass** (voice path now fires run_with_hooks). Option A: `custom_triggers.json` now honored everywhere via SkillRegistry. 10 tests; full suite 1376 passing. - PR-3D: A-5 + A-6 + A-7 — extract helpers from the 3 monolithic functions (`_dispatch_inner`, `chat_completion`, `Agent.run`) -- PR-3E: A-11 + A-12 — unify vision + 51-site `chat/completions` through `codec_llm_proxy` +- PR-3E: A-11 + A-12 — unify vision + `chat/completions` ✅ (branch `fix/pr3e-llm-vision-dedup`, design-first per §11 → `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md`; **Option 2** chosen by Mickael). **A-11 fully closed**: new `codec_vision.py` (sync+async, Gemini→local fallback, live config); all 3 consumers (codec.py/voice/session) delegate; session gains a Gemini fallback it lacked. 
**A-12 first tranche**: discovered `codec_llm_proxy` is a *queue*, not an HTTP caller — built genuinely-new `codec_llm.py` (`call()` + `strip_think`/`extract_content`, retry, never-raises) and migrated codec.py voice-reply chat + `codec_session.qwen_call`. **Deferred to phased follow-ons**: `qwen_stream` SSE (needs `codec_llm.stream()`) + ~40 remaining sites (dashboard/voice/agents/bridges/misc), each its own tranche. 19 tests (`tests/test_llm_vision_dedup.py`); full suite zero new failures. - PR-3F (optional, large): A-19 — bridge unification (iMessage + Telegram → `BridgeRouter`) - PR-3G: small misc ✅ (branch `fix/pr3g-small-misc-cleanup`) — closed A-9 (DISABLED overlay, ~90 LOC), A-10 (run_session_module, 33 LOC + orphan `import sys`), A-14 (close_session shadow import), A-18 (9 unused Pydantic models + dead typing import). A-13 (dashboard pattern blocker) verified **already closed by PR-2C**. 6 regression tests; zero net-new ruff (net −); full suite 1344 passing. **Deferred from this batch (each needs its own focused PR):** A-8 (codec_keyboard.py 398 LOC — verify-first delete-or-migrate decision), A-15 (config_version — additive migration feature touching `load_config`), A-20 (inline sqlite in the live dispatch path — reliability fix needing a CodecMemory method). - A-15: config schema versioning ✅ (branch `fix/pr3-a15-config-versioning`; `CONFIG_SCHEMA_VERSION=1` + migration ladder + idempotent atomic write-back in `load_config`; never creates-on-missing or overwrites-corrupt; 12 tests; zero net-new ruff; full suite 1356 passing). diff --git a/tests/test_llm_vision_dedup.py b/tests/test_llm_vision_dedup.py new file mode 100644 index 0000000..ff33677 --- /dev/null +++ b/tests/test_llm_vision_dedup.py @@ -0,0 +1,262 @@ +"""Tests for PR-3E — LLM-call (A-12) + vision (A-11) dedup. + +- codec_llm.call: the canonical chat/completions caller (headers, payload, + enable_thinking, strip, content→reasoning extraction, retries). 
+- codec_vision.describe_sync/_async: canonical Gemini-Flash → local-Qwen-VL. +- First-tranche migrations: codec.py (vision + voice chat), codec_session + (vision + qwen_call), codec_voice._analyze_screenshot. + +Reference: docs/PR3E-LLM-VISION-DEDUP-DESIGN.md (Option 2). +""" +from __future__ import annotations + +import asyncio +import sys +from pathlib import Path + +REPO = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO)) + +import codec_llm # noqa: E402 +import codec_vision # noqa: E402 + + +# ── helpers ────────────────────────────────────────────────────────────────── + + +class _Resp: + def __init__(self, status, payload=None, text=""): + self.status_code = status + self._payload = payload or {} + self.text = text + + def json(self): + return self._payload + + +def _msg(content=None, reasoning=None): + m = {} + if content is not None: + m["content"] = content + if reasoning is not None: + m["reasoning"] = reasoning + return {"choices": [{"message": m}]} + + +# ── codec_llm.extract_content + strip_think ────────────────────────────────── + + +def test_strip_think(): + assert codec_llm.strip_think("<think>plan</think>answer") == "answer" + assert codec_llm.strip_think(" hi ") == "hi" + assert codec_llm.strip_think("") == "" + + +def test_extract_content_prefers_content(): + assert codec_llm.extract_content(_msg(content="hello")) == "hello" + + +def test_extract_content_reasoning_fallback(): + assert codec_llm.extract_content(_msg(content="", reasoning="fallback")) == "fallback" + + +def test_extract_content_strips_think(): + assert codec_llm.extract_content(_msg(content="<think>x</think>real")) == "real" + + +def test_extract_content_bad_shape_returns_empty(): + assert codec_llm.extract_content({}) == "" + assert codec_llm.extract_content({"choices": []}) == "" + + +# ── codec_llm.call ─────────────────────────────────────────────────────────── + + +def test_call_success(monkeypatch): + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None): + 
captured["url"] = url + captured["json"] = json + captured["headers"] = headers + return _Resp(200, _msg(content="42")) + + import requests + monkeypatch.setattr(requests, "post", fake_post) + out = codec_llm.call([{"role": "user", "content": "q"}], + base_url="http://x/v1", model="qwen", api_key="k", + max_tokens=400, temperature=0.7, extra_kwargs={"top_p": 0.9}) + assert out == "42" + assert captured["url"] == "http://x/v1/chat/completions" + assert captured["headers"]["Authorization"] == "Bearer k" + p = captured["json"] + assert p["model"] == "qwen" + assert p["max_tokens"] == 400 and p["temperature"] == 0.7 + assert p["chat_template_kwargs"] == {"enable_thinking": False} + assert p["top_p"] == 0.9 # extra_kwargs merged + + +def test_call_no_api_key_omits_auth(monkeypatch): + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None): + captured["headers"] = headers + return _Resp(200, _msg(content="ok")) + + import requests + monkeypatch.setattr(requests, "post", fake_post) + codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1", model="m") + assert "Authorization" not in captured["headers"] + + +def test_call_retries_then_empty(monkeypatch): + calls = {"n": 0} + + def fake_post(url, json=None, headers=None, timeout=None): + calls["n"] += 1 + return _Resp(500, text="err") + + import requests + monkeypatch.setattr(requests, "post", fake_post) + monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None) # no real backoff + out = codec_llm.call([{"role": "user", "content": "q"}], + base_url="http://x/v1", model="m", retries=3) + assert out == "" + assert calls["n"] == 3 # all attempts used + + +def test_call_exception_returns_empty(monkeypatch): + def fake_post(*a, **k): + raise ConnectionError("down") + + import requests + monkeypatch.setattr(requests, "post", fake_post) + monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None) + assert codec_llm.call([{"role": "user", "content": "q"}], + base_url="http://x/v1", 
model="m", retries=2) == "" + + +# ── codec_vision.describe_sync ─────────────────────────────────────────────── + + +def test_describe_sync_gemini_first(monkeypatch): + monkeypatch.setattr(codec_vision, "_vision_config", + lambda: ("gemini", "gemkey", "http://local/v1", "qwen-vl")) + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None): + captured["url"] = url + # Gemini response shape + return _Resp(200, {"candidates": [{"content": {"parts": [{"text": "a chart"}]}}]}) + + import requests + monkeypatch.setattr(requests, "post", fake_post) + out = codec_vision.describe_sync("b64", "what is this?", mime="image/png") + assert out == "a chart" + assert "generativelanguage.googleapis.com" in captured["url"] + + +def test_describe_sync_falls_back_to_local(monkeypatch): + monkeypatch.setattr(codec_vision, "_vision_config", + lambda: ("gemini", "gemkey", "http://local/v1", "qwen-vl")) + seen = [] + + def fake_post(url, json=None, headers=None, timeout=None): + seen.append(url) + if "googleapis" in url: + return _Resp(500, text="gemini down") + return _Resp(200, _msg(content="local says hi")) + + import requests + monkeypatch.setattr(requests, "post", fake_post) + out = codec_vision.describe_sync("b64", "p") + assert out == "local says hi" + assert any("googleapis" in u for u in seen) and any("local/v1" in u for u in seen) + + +def test_describe_sync_local_only_when_provider_local(monkeypatch): + monkeypatch.setattr(codec_vision, "_vision_config", + lambda: ("local", "", "http://local/v1", "qwen-vl")) + seen = [] + + def fake_post(url, json=None, headers=None, timeout=None): + seen.append(url) + return _Resp(200, _msg(content="local")) + + import requests + monkeypatch.setattr(requests, "post", fake_post) + out = codec_vision.describe_sync("b64", "p") + assert out == "local" + assert not any("googleapis" in u for u in seen) # Gemini never tried + + +def test_describe_sync_both_fail_returns_empty(monkeypatch): + monkeypatch.setattr(codec_vision, 
"_vision_config", + lambda: ("gemini", "k", "http://local/v1", "m")) + import requests + monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(500, text="x")) + assert codec_vision.describe_sync("b64", "p") == "" + + +# ── codec_vision.describe_async ────────────────────────────────────────────── + + +class _FakeAsyncClient: + def __init__(self, handler): + self._handler = handler + + async def post(self, url, json=None, headers=None, timeout=None): + return self._handler(url, json) + + +def test_describe_async_gemini(monkeypatch): + monkeypatch.setattr(codec_vision, "_vision_config", + lambda: ("gemini", "k", "http://local/v1", "m")) + + def handler(url, json): + return _Resp(200, {"candidates": [{"content": {"parts": [{"text": "async vision"}]}}]}) + + out = asyncio.run(codec_vision.describe_async("b64", "p", http=_FakeAsyncClient(handler))) + assert out == "async vision" + + +def test_describe_async_fallback(monkeypatch): + monkeypatch.setattr(codec_vision, "_vision_config", + lambda: ("gemini", "k", "http://local/v1", "m")) + + def handler(url, json): + if "googleapis" in url: + return _Resp(500) + return _Resp(200, _msg(content="local async")) + + out = asyncio.run(codec_vision.describe_async("b64", "p", http=_FakeAsyncClient(handler))) + assert out == "local async" + + +# ── source-level migration invariants ─────────────────────────────────────── + + +def test_codec_vision_is_single_source(): + src = (REPO / "codec.py").read_text() + assert "def _gemini_vision" not in src and "def _local_vision" not in src + assert "codec_vision.describe_sync" in src + + +def test_codec_chat_uses_codec_llm(): + src = (REPO / "codec.py").read_text() + assert "codec_llm.call(" in src + # No inline chat/completions POST left in _dispatch_inner's LLM block + assert 'f"{QWEN_BASE_URL}/chat/completions"' not in src + + +def test_voice_uses_codec_vision(): + src = (REPO / "codec_voice.py").read_text() + assert "codec_vision.describe_async" in src + # Inline gemini URL gone 
from the analyze path + assert "generativelanguage.googleapis.com" not in src + + +def test_session_uses_canonical_helpers(): + src = (REPO / "codec_session.py").read_text() + assert "codec_llm.call(" in src + assert "codec_vision.describe_sync" in src