AVADSA25 · AVADSA25 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -56,6 +56,8 @@ docs/                        API.md, MCP_HTTP_SETUP.md, CONTEXT_REPORT.md, desig
 
 Other engine modules (`codec_overlays`, `codec_metrics`, `codec_logging`, `codec_gdocs`, `codec_google_auth`, `codec_cdp`, `codec_llm_proxy`, `codec_retry`, `codec_alerts`, `codec_search`, `codec_textassist`, `codec_watcher`, `codec_watchdog`) are internal helpers — read them when you need them, but they're not part of the navigation surface for an agent making structural changes. (Keyboard handling — wake word, F13 toggle, F18 voice, double-tap — lives **inline in `codec.py`** in the `codec` PM2 process; the old standalone `codec_keyboard.py` was deleted as a dead duplicate per A-8.)
 
+**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches; codec.py voice-reply + `codec_session.qwen_call` are done, streaming (`codec_llm.stream()`) + the rest are pending.
+
 ## 3. Agent + Crew runtime
 
 CODEC has its own minimalist multi-agent runtime in `codec_agents.py`. **Zero dependency on CrewAI or LangChain** — it's self-contained, only depends on `requests` and `codec_skill_registry`.

diff --git a/codec.py b/codec.py
@@ -22,7 +22,7 @@
 # ── CONFIG (single source of truth: codec_config.py) ─────────────────────────
 from codec_config import (
     cfg as _cfg,
-    QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS, QWEN_VISION_URL, QWEN_VISION_MODEL,
+    QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS,
     WHISPER_URL,
     TASK_QUEUE_FILE, DRAFT_TASK_FILE, SESSION_ALIVE, STREAMING, WAKE_WORD, WAKE_ENERGY, WAKE_CHUNK_SEC,
     WAKE_PHRASES,
@@ -71,7 +71,7 @@ def _is_wake_utterance(text: str) -> bool:
 # ─��� SHARED (from codec_core.py — single source of truth) ─────────────────────
 import codec_core as _core
 from codec_core import (
-    strip_think, is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations,
+    is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations,
     transcribe, speak_text, focused_app, get_text_dialog,
     terminal_session_exists,
     # A-14 (PR-3G): `close_session` import dropped — codec.py defines its own
@@ -96,50 +96,16 @@ def _is_wake_utterance(text: str) -> bool:
 # safety gate AND plugin lifecycle hooks (run_with_hooks), both of which the
 # legacy path bypassed.
 
-# ── VISION (Gemini Flash or local Qwen VL) ──────────────────────────────────
-def _gemini_vision(img_b64, prompt, max_tokens=800):
-    """Call Gemini Flash vision API. Fast, reliable, free tier."""
-    import requests
-    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"
-    payload = {
-        "contents": [{"parts": [
-            {"inlineData": {"mimeType": "image/png", "data": img_b64}},
-            {"text": prompt}
-        ]}],
-        "generationConfig": {"maxOutputTokens": max_tokens}
-    }
-    r = requests.post(url, json=payload, timeout=30)
-    if r.status_code == 200:
-        candidates = r.json().get("candidates", [])
-        if candidates:
-            parts = candidates[0].get("content", {}).get("parts", [])
-            if parts:
-                return parts[0].get("text", "").strip()
-    else:
-        print(f"[CODEC] Gemini error {r.status_code}: {r.text[:200]}")
-    return ""
-
-def _local_vision(img_b64, prompt, max_tokens=800):
-    """Call local Qwen VL vision API (fallback)."""
-    import requests
-    r = requests.post(f"{QWEN_VISION_URL}/chat/completions",
-        json={"model": QWEN_VISION_MODEL,
-            "messages": [{"role": "user", "content": [
-                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
-                {"type": "text", "text": prompt}
-            ]}], "max_tokens": max_tokens}, timeout=60)
-    if r.status_code == 200:
-        return r.json()["choices"][0]["message"].get("content", "").strip()
-    return ""
+# ── VISION (A-11, PR-3E: canonical helper in codec_vision) ──────────────────
+# The Gemini-Flash → local-Qwen-VL fallback used to be hand-rolled here (and in
+# codec_voice + codec_session). It now lives in codec_vision; this is a thin
+# delegate kept for any caller of codec.vision_describe.
+import codec_vision
+import codec_llm  # A-12: canonical chat/completions caller
 
 def vision_describe(img_b64, prompt="Read all visible text on this screen. Include app name, window title, and all message/content text. Output raw text only.", max_tokens=800):
-    """Route vision to Gemini or local based on config."""
-    if VISION_PROVIDER == "gemini" and GEMINI_API_KEY:
-        result = _gemini_vision(img_b64, prompt, max_tokens)
-        if result:
-            return result
-        print("[CODEC] Gemini failed, falling back to local vision...")
-    return _local_vision(img_b64, prompt, max_tokens)
+    """Route vision to Gemini or local based on config (codec_vision)."""
+    return codec_vision.describe_sync(img_b64, prompt, mime="image/png", max_tokens=max_tokens)
 
 def screenshot_ctx():
     try:
@@ -444,55 +410,41 @@ def _post_skill_screenshot():
 
     push(lambda: show_processing_overlay('Thinking...', 15000))
     try:
-        import requests as _llm_req
-        headers = {}
-        if LLM_API_KEY:
-            headers["Authorization"] = f"Bearer {LLM_API_KEY}"
-        payload = {
-            "model": QWEN_MODEL,
-            "messages": llm_messages,
-            "max_tokens": 400,
-            "temperature": 0.7,
-            "chat_template_kwargs": {"enable_thinking": False},
-        }
-        payload.update(LLM_KWARGS)
-        r = _llm_req.post(f"{QWEN_BASE_URL}/chat/completions", json=payload, headers=headers, timeout=120)
-        if r.status_code == 200:
-            data = r.json()
-            answer = data.get("choices", [{}])[0].get("message", {}).get("content", "")
-            answer = strip_think(answer).strip()
-            if answer:
-                print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}")
-                log_event("tts_speak", "open-codec",
-                          f"TTS: {answer[:60]}",
-                          extra={"text_len": len(answer)})
-                # Add assistant response to session history
-                voice_session["messages"].append({"role": "assistant", "content": answer})
-                voice_session["turn_count"] += 1
-                # Save response to DB (A-20: via codec_core helper with
-                # WAL + busy_timeout — replaces the inline lock-prone
-                # sqlite3.connect that risked "database is locked" under
-                # concurrent agent-runner + voice writes). Never raises.
-                update_session_response(rid, answer[:500])
-                # Save to shared memory (same store as Chat)
-                try:
-                    cm = CodecMemory()
-                    cm.save("voice", "user", task)
-                    cm.save("voice", "assistant", answer)
-                except Exception as e:
-                    log.warning(f"[CODEC] Memory save failed after LLM: {e}")
-                _last_tts_text = answer[:200]
-                speak_text(answer)
-                _safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"')
-                subprocess.Popen(["osascript", "-e",
-                    f'display notification "{_safe_ans}" with title "CODEC"'],
-                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-            else:
-                print("[CODEC] Voice LLM returned empty response")
-                speak_text("Sorry, I didn't get a response.")
+        # A-12 (PR-3E): canonical codec_llm.call replaces the inline
+        # chat/completions POST + headers + enable_thinking + <think> strip +
+        # choices parse. Returns the stripped answer, or "" on any failure
+        # (non-200 and empty now collapse to the same apology).
+        answer = codec_llm.call(
+            llm_messages, base_url=QWEN_BASE_URL, model=QWEN_MODEL,
+            api_key=LLM_API_KEY, max_tokens=400, temperature=0.7,
+            timeout=120, retries=1, extra_kwargs=LLM_KWARGS,
+        )
+        if answer:
+            print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}")
+            log_event("tts_speak", "open-codec",
+                      f"TTS: {answer[:60]}",
+                      extra={"text_len": len(answer)})
+            # Add assistant response to session history
+            voice_session["messages"].append({"role": "assistant", "content": answer})
+            voice_session["turn_count"] += 1
+            # Save response to DB (A-20: codec_core helper, WAL + busy_timeout).
+            update_session_response(rid, answer[:500])
+            # Save to shared memory (same store as Chat)
+            try:
+                cm = CodecMemory()
+                cm.save("voice", "user", task)
+                cm.save("voice", "assistant", answer)
+            except Exception as e:
+                log.warning(f"[CODEC] Memory save failed after LLM: {e}")
+            _last_tts_text = answer[:200]
+            speak_text(answer)
+            _safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"')
+            subprocess.Popen(["osascript", "-e",
+                f'display notification "{_safe_ans}" with title "CODEC"'],
+                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
         else:
-            print(f"[CODEC] Voice LLM error: {r.status_code} {r.text[:200]}")
-            speak_text("Sorry, the language model is not responding.")
+            print("[CODEC] Voice LLM returned no response")
+            speak_text("Sorry, I didn't get a response.")
     except Exception as e:
         log.error("Voice LLM call failed: %s", e)
         import traceback; traceback.print_exc()

diff --git a/codec_llm.py b/codec_llm.py
@@ -0,0 +1,108 @@
+"""CODEC LLM call helper — the single canonical OpenAI-style chat/completions caller.
+
+A-12 (PR-3E): before this, ~45 sites hand-rolled the same `chat/completions`
+POST — build headers (`Authorization: Bearer …`, `Content-Type`), assemble the
+payload (`model`/`messages`/`max_tokens`/`temperature`/
+`chat_template_kwargs.enable_thinking=False`), parse `choices[0].message`
+(content, with a `reasoning` fallback), and strip `<think>…</think>`. A model
+upgrade or API-shape fix then meant editing 20+ places.
+
+This module centralizes the **non-streaming** call. It is intentionally
+config-agnostic — each caller passes its own `base_url` / `model` / `api_key`
+/ tuning — so it's a pure "build payload → POST → parse" helper with no import
+cycle into codec_config. (Streaming SSE + the remaining call sites are migrated
+in later A-12 tranches; this PR covers the call() API + codec.py + codec_session.)
+
+NOTE: `codec_llm_proxy` is a *priority queue* (semaphore), not an HTTP proxy —
+orthogonal to this module. Callers that want prioritization still wrap the call
+in `llm_queue_sync(...)`; behavior parity for the migrated sites means we do NOT
+add queue acquisition here (none of them used it).
+"""
+from __future__ import annotations
+
+import logging
+import re
+import time
+from typing import Any, Dict, List, Optional
+
+log = logging.getLogger("codec.llm")
+
+_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
+
+
+def strip_think(text: str) -> str:
+    """Remove <think>…</think> reasoning blocks and surrounding whitespace."""
+    if not text:
+        return ""
+    return _THINK_RE.sub("", text).strip()
+
+
+def extract_content(response_json: Dict[str, Any]) -> str:
+    """Pull the assistant text from an OpenAI-style response: prefer
+    `choices[0].message.content`, fall back to `.reasoning` (some local
+    servers put the answer there when content is empty). `<think>` stripped.
+    Returns "" on any shape mismatch."""
+    try:
+        msg = response_json["choices"][0]["message"]
+    except (KeyError, IndexError, TypeError):
+        return ""
+    content = (msg.get("content") or "").strip()
+    if content:
+        return strip_think(content)
+    reasoning = (msg.get("reasoning") or "").strip()
+    if reasoning:
+        return strip_think(reasoning)
+    return ""
+
+
+def call(
+    messages: List[Dict[str, Any]],
+    *,
+    base_url: str,
+    model: str,
+    api_key: str = "",
+    max_tokens: int = 500,
+    temperature: float = 0.7,
+    timeout: float = 120.0,
+    retries: int = 1,
+    enable_thinking: bool = False,
+    extra_kwargs: Optional[Dict[str, Any]] = None,
+) -> str:
+    """POST `messages` to `<base_url>/chat/completions` and return the parsed,
+    `<think>`-stripped assistant text (or "" on failure).
+
+    `retries` includes the first attempt (retries=3 → up to 3 tries with
+    exponential 2**n backoff between them, matching codec_session.qwen_call).
+    Never raises — network/parse errors are logged and yield "".
+    """
+    import requests
+    headers = {"Content-Type": "application/json"}
+    if api_key:
+        headers["Authorization"] = "Bearer " + api_key
+    payload: Dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "chat_template_kwargs": {"enable_thinking": enable_thinking},
+    }
+    if extra_kwargs:
+        payload.update(extra_kwargs)
+
+    attempts = max(1, retries)
+    url = base_url.rstrip("/") + "/chat/completions"
+    for attempt in range(attempts):
+        try:
+            r = requests.post(url, json=payload, headers=headers, timeout=timeout)
+            if r.status_code == 200:
+                resp = extract_content(r.json())
+                if resp:
+                    return resp
+                # 200 but empty/odd shape — don't retry, nothing more to get.
+                return ""
+            log.warning("LLM call %s returned %s: %s", url, r.status_code, r.text[:200])
+        except Exception as e:
+            log.warning("LLM call attempt %d/%d failed: %s", attempt + 1, attempts, e)
+            if attempt < attempts - 1:
+                time.sleep(2 ** attempt)
+    return ""
diff --git a/codec_session.py b/codec_session.py
@@ -72,15 +72,9 @@ def strip_think(t):
     return re.sub(r"<think>.*?</think>", "", t, flags=re.DOTALL).strip()
 
 
-def extract_content(rj):
-    msg = rj["choices"][0]["message"]
-    c = msg.get("content", "").strip()
-    if c:
-        return strip_think(c)
-    r = msg.get("reasoning", "").strip()
-    if r:
-        return strip_think(r)
-    return ""
+# A-12 (PR-3E): local `extract_content` removed — its only caller was `qwen_call`,
+# now migrated to codec_llm.call (which owns the canonical content→reasoning
+# extraction). `strip_think` above is kept; qwen_stream still uses it.
 
 
 def clean_resp(text):
@@ -210,26 +204,12 @@ def screenshot_ctx(self):
                 ib = base64.b64encode(f.read()).decode()
             os.unlink(tmp.name)
             print("[C] Reading screen...")
-            import requests
-            r = requests.post(
-                self.qwen_vision_url + "/chat/completions",
-                json={
-                    "model": self.qwen_vision_model,
-                    "messages": [
-                        {
-                            "role": "user",
-                            "content": [
-                                {"type": "image_url", "image_url": {"url": "data:image/png;base64," + ib}},
-                                {"type": "text", "text": "Read all visible text. Include app name and content. Raw text only."},
-                            ],
-                        }
-                    ],
-                    "max_tokens": 800,
-                },
-                timeout=120,
-            )
-            if r.status_code == 200:
-                return r.json()["choices"][0]["message"].get("content", "")[:2000]
+            # A-11 (PR-3E): canonical vision helper. Was local-Qwen-VL only here;
+            # now gains the Gemini-Flash fallback for free (config-gated).
+            import codec_vision
+            return codec_vision.describe_sync(
+                ib, "Read all visible text. Include app name and content. Raw text only.",
+                mime="image/png", max_tokens=800)[:2000]
         except Exception as e:
             log.warning(f"Screenshot capture or vision analysis failed: {e}")
         return ""
@@ -266,28 +246,14 @@ def speak(self, text):
     # ── LLM Calls ────────────────────────────────────────────────────────
 
     def qwen_call(self, messages):
-        import requests
-        headers = {"Content-Type": "application/json"}
-        if self.llm_api_key:
-            headers["Authorization"] = "Bearer " + self.llm_api_key
-        payload = {"model": self.qwen_model, "messages": messages, "max_tokens": 500, "temperature": 0.5}
-        payload.update(self.llm_kwargs)
-        for attempt in range(3):
-            try:
-                r = requests.post(
-                    self.qwen_base_url + "/chat/completions",
-                    json=payload,
-                    headers=headers,
-                    timeout=90,
-                )
-                if r.status_code == 200:
-                    resp = extract_content(r.json())
-                    if resp:
-                        return resp
-            except Exception as e:
-                log.warning(f"LLM API call attempt {attempt+1} failed: {e}")
-                time.sleep(2 ** attempt)
-        return ""
+        # A-12 (PR-3E): canonical codec_llm.call (3 retries + backoff, content→
+        # reasoning extraction, <think> strip) — was an inline chat/completions POST.
+        import codec_llm
+        return codec_llm.call(
+            messages, base_url=self.qwen_base_url, model=self.qwen_model,
+            api_key=self.llm_api_key, max_tokens=500, temperature=0.5,
+            timeout=90, retries=3, extra_kwargs=self.llm_kwargs,
+        )
 
     def qwen_stream(self, messages):
         import requests