diff --git a/AGENTS.md b/AGENTS.md
index 667a6d6..276da4c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -56,6 +56,8 @@ docs/                        API.md, MCP_HTTP_SETUP.md, CONTEXT_REPORT.md, desig
 
 Other engine modules (`codec_overlays`, `codec_metrics`, `codec_logging`, `codec_gdocs`, `codec_google_auth`, `codec_cdp`, `codec_llm_proxy`, `codec_retry`, `codec_alerts`, `codec_search`, `codec_textassist`, `codec_watcher`, `codec_watchdog`) are internal helpers — read them when you need them, but they're not part of the navigation surface for an agent making structural changes. (Keyboard handling — wake word, F13 toggle, F18 voice, double-tap — lives **inline in `codec.py`** in the `codec` PM2 process; the old standalone `codec_keyboard.py` was deleted as a dead duplicate per A-8.)
 
+**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices[0].message` content→`reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches; the `codec.py` voice reply + `codec_session.qwen_call` are done; streaming (a planned `codec_llm.stream()`, not yet written) and the remaining sites are pending.
+
 ## 3. Agent + Crew runtime
 
 CODEC has its own minimalist multi-agent runtime in `codec_agents.py`. **Zero dependency on CrewAI or LangChain** — it's self-contained, only depends on `requests` and `codec_skill_registry`.
diff --git a/codec.py b/codec.py
index a2ffeaa..2ae8960 100644
--- a/codec.py
+++ b/codec.py
@@ -22,7 +22,7 @@
 # ── CONFIG (single source of truth: codec_config.py) ─────────────────────────
 from codec_config import (
     cfg as _cfg,
-    QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS, QWEN_VISION_URL, QWEN_VISION_MODEL,
+    QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS,
     WHISPER_URL,
     TASK_QUEUE_FILE, DRAFT_TASK_FILE, SESSION_ALIVE, STREAMING, WAKE_WORD, WAKE_ENERGY, WAKE_CHUNK_SEC,
     WAKE_PHRASES,
@@ -71,7 +71,7 @@ def _is_wake_utterance(text: str) -> bool:
 # ── SHARED (from codec_core.py — single source of truth) ─────────────────────
 import codec_core as _core
 from codec_core import (
-    strip_think, is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations,
+    is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations,
     transcribe, speak_text, focused_app, get_text_dialog,
     terminal_session_exists,
     # A-14 (PR-3G): `close_session` import dropped — codec.py defines its own
@@ -96,50 +96,16 @@ def _is_wake_utterance(text: str) -> bool:
 # safety gate AND plugin lifecycle hooks (run_with_hooks), both of which the
 # legacy path bypassed.
 
-# ── VISION (Gemini Flash or local Qwen VL) ──────────────────────────────────
-def _gemini_vision(img_b64, prompt, max_tokens=800):
-    """Call Gemini Flash vision API. Fast, reliable, free tier."""
-    import requests
-    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"
-    payload = {
-        "contents": [{"parts": [
-            {"inlineData": {"mimeType": "image/png", "data": img_b64}},
-            {"text": prompt}
-        ]}],
-        "generationConfig": {"maxOutputTokens": max_tokens}
-    }
-    r = requests.post(url, json=payload, timeout=30)
-    if r.status_code == 200:
-        candidates = r.json().get("candidates", [])
-        if candidates:
-            parts = candidates[0].get("content", {}).get("parts", [])
-            if parts:
-                return parts[0].get("text", "").strip()
-    else:
-        print(f"[CODEC] Gemini error {r.status_code}: {r.text[:200]}")
-    return ""
-
-def _local_vision(img_b64, prompt, max_tokens=800):
-    """Call local Qwen VL vision API (fallback)."""
-    import requests
-    r = requests.post(f"{QWEN_VISION_URL}/chat/completions",
-        json={"model": QWEN_VISION_MODEL,
-            "messages": [{"role": "user", "content": [
-                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
-                {"type": "text", "text": prompt}
-            ]}], "max_tokens": max_tokens}, timeout=60)
-    if r.status_code == 200:
-        return r.json()["choices"][0]["message"].get("content", "").strip()
-    return ""
+# ── VISION (A-11, PR-3E: canonical helper in codec_vision) ──────────────────
+# The Gemini-Flash → local-Qwen-VL fallback used to be hand-rolled here (and in
+# codec_voice + codec_session). It now lives in codec_vision; this is a thin
+# delegate kept for any caller of codec.vision_describe.
+import codec_vision
+import codec_llm  # A-12: canonical chat/completions caller
 
 def vision_describe(img_b64, prompt="Read all visible text on this screen. Include app name, window title, and all message/content text. Output raw text only.", max_tokens=800):
-    """Route vision to Gemini or local based on config."""
-    if VISION_PROVIDER == "gemini" and GEMINI_API_KEY:
-        result = _gemini_vision(img_b64, prompt, max_tokens)
-        if result:
-            return result
-        print("[CODEC] Gemini failed, falling back to local vision...")
-    return _local_vision(img_b64, prompt, max_tokens)
+    """Route vision to Gemini or local based on config (codec_vision)."""
+    return codec_vision.describe_sync(img_b64, prompt, mime="image/png", max_tokens=max_tokens)
 
 def screenshot_ctx():
     try:
@@ -444,55 +410,41 @@ def _post_skill_screenshot():
 
     push(lambda: show_processing_overlay('Thinking...', 15000))
     try:
-        import requests as _llm_req
-        headers = {}
-        if LLM_API_KEY:
-            headers["Authorization"] = f"Bearer {LLM_API_KEY}"
-        payload = {
-            "model": QWEN_MODEL,
-            "messages": llm_messages,
-            "max_tokens": 400,
-            "temperature": 0.7,
-            "chat_template_kwargs": {"enable_thinking": False},
-        }
-        payload.update(LLM_KWARGS)
-        r = _llm_req.post(f"{QWEN_BASE_URL}/chat/completions", json=payload, headers=headers, timeout=120)
-        if r.status_code == 200:
-            data = r.json()
-            answer = data.get("choices", [{}])[0].get("message", {}).get("content", "")
-            answer = strip_think(answer).strip()
-            if answer:
-                print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}")
-                log_event("tts_speak", "open-codec",
-                          f"TTS: {answer[:60]}",
-                          extra={"text_len": len(answer)})
-                # Add assistant response to session history
-                voice_session["messages"].append({"role": "assistant", "content": answer})
-                voice_session["turn_count"] += 1
-                # Save response to DB (A-20: via codec_core helper with
-                # WAL + busy_timeout — replaces the inline lock-prone
-                # sqlite3.connect that risked "database is locked" under
-                # concurrent agent-runner + voice writes). Never raises.
-                update_session_response(rid, answer[:500])
-                # Save to shared memory (same store as Chat)
-                try:
-                    cm = CodecMemory()
-                    cm.save("voice", "user", task)
-                    cm.save("voice", "assistant", answer)
-                except Exception as e:
-                    log.warning(f"[CODEC] Memory save failed after LLM: {e}")
-                _last_tts_text = answer[:200]
-                speak_text(answer)
-                _safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"')
-                subprocess.Popen(["osascript", "-e",
-                    f'display notification "{_safe_ans}" with title "CODEC"'],
-                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-            else:
-                print("[CODEC] Voice LLM returned empty response")
-                speak_text("Sorry, I didn't get a response.")
+        # A-12 (PR-3E): canonical codec_llm.call replaces the inline
+        # chat/completions POST + headers + enable_thinking + <think> strip +
+        # choices parse. Returns the stripped answer, or "" on any failure
+        # (non-200 and empty now collapse to the same apology).
+        answer = codec_llm.call(
+            llm_messages, base_url=QWEN_BASE_URL, model=QWEN_MODEL,
+            api_key=LLM_API_KEY, max_tokens=400, temperature=0.7,
+            timeout=120, retries=1, extra_kwargs=LLM_KWARGS,
+        )
+        if answer:
+            print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}")
+            log_event("tts_speak", "open-codec",
+                      f"TTS: {answer[:60]}",
+                      extra={"text_len": len(answer)})
+            # Add assistant response to session history
+            voice_session["messages"].append({"role": "assistant", "content": answer})
+            voice_session["turn_count"] += 1
+            # Save response to DB (A-20: codec_core helper, WAL + busy_timeout).
+            update_session_response(rid, answer[:500])
+            # Save to shared memory (same store as Chat)
+            try:
+                cm = CodecMemory()
+                cm.save("voice", "user", task)
+                cm.save("voice", "assistant", answer)
+            except Exception as e:
+                log.warning(f"[CODEC] Memory save failed after LLM: {e}")
+            _last_tts_text = answer[:200]
+            speak_text(answer)
+            _safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"')
+            subprocess.Popen(["osascript", "-e",
+                f'display notification "{_safe_ans}" with title "CODEC"'],
+                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
         else:
-            print(f"[CODEC] Voice LLM error: {r.status_code} {r.text[:200]}")
-            speak_text("Sorry, the language model is not responding.")
+            print("[CODEC] Voice LLM returned no response")
+            speak_text("Sorry, I didn't get a response.")
     except Exception as e:
         log.error("Voice LLM call failed: %s", e)
         import traceback; traceback.print_exc()
diff --git a/codec_llm.py b/codec_llm.py
new file mode 100644
index 0000000..40760ff
--- /dev/null
+++ b/codec_llm.py
@@ -0,0 +1,108 @@
+"""CODEC LLM call helper — the single canonical OpenAI-style chat/completions caller.
+
+A-12 (PR-3E): before this, ~45 sites hand-rolled the same `chat/completions`
+POST — build headers (`Authorization: Bearer …`, `Content-Type`), assemble the
+payload (`model`/`messages`/`max_tokens`/`temperature`/
+`chat_template_kwargs.enable_thinking=False`), parse `choices[0].message`
+(content, with a `reasoning` fallback), and strip `<think>…</think>`. A model
+upgrade or API-shape fix then meant editing 20+ places.
+
+This module centralizes the **non-streaming** call. It is intentionally
+config-agnostic — each caller passes its own `base_url` / `model` / `api_key`
+/ tuning — so it's a pure "build payload → POST → parse" helper with no import
+cycle into codec_config. (Streaming SSE + the remaining call sites are migrated
+in later A-12 tranches; this PR covers the call() API + codec.py + codec_session.)
+
+NOTE: `codec_llm_proxy` is a *priority queue* (semaphore), not an HTTP proxy —
+orthogonal to this module. Callers that want prioritization still wrap the call
+in `llm_queue_sync(...)`; behavior parity for the migrated sites means we do NOT
+add queue acquisition here (none of them used it).
+"""
+from __future__ import annotations
+
+import logging
+import re
+import time
+from typing import Any, Dict, List, Optional
+
+log = logging.getLogger("codec.llm")
+
+_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
+
+
+def strip_think(text: str) -> str:
+    """Remove <think>…</think> reasoning blocks and surrounding whitespace."""
+    if not text:
+        return ""
+    return _THINK_RE.sub("", text).strip()
+
+
+def extract_content(response_json: Dict[str, Any]) -> str:
+    """Pull the assistant text from an OpenAI-style response: prefer
+    `choices[0].message.content`, fall back to `.reasoning` (some local
+    servers put the answer there when content is empty). `<think>` stripped.
+    Returns "" on any shape mismatch."""
+    try:
+        msg = response_json["choices"][0]["message"]
+    except (KeyError, IndexError, TypeError):
+        return ""
+    content = (msg.get("content") or "").strip()
+    if content:
+        return strip_think(content)
+    reasoning = (msg.get("reasoning") or "").strip()
+    if reasoning:
+        return strip_think(reasoning)
+    return ""
+
+
+def call(
+    messages: List[Dict[str, Any]],
+    *,
+    base_url: str,
+    model: str,
+    api_key: str = "",
+    max_tokens: int = 500,
+    temperature: float = 0.7,
+    timeout: float = 120.0,
+    retries: int = 1,
+    enable_thinking: bool = False,
+    extra_kwargs: Optional[Dict[str, Any]] = None,
+) -> str:
+    """POST `messages` to `<base_url>/chat/completions` and return the parsed,
+    `<think>`-stripped assistant text (or "" on failure).
+
+    `retries` counts the first attempt (retries=3 → up to 3 tries). Raised network/parse
+    errors are logged and retried after a 2**n-second sleep; a non-200 reply is retried at
+    once; a 200 with no usable text returns "" without retrying. Never raises.
+    """
+    import requests
+    headers = {"Content-Type": "application/json"}
+    if api_key:
+        headers["Authorization"] = "Bearer " + api_key
+    payload: Dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "chat_template_kwargs": {"enable_thinking": enable_thinking},
+    }
+    if extra_kwargs:
+        payload.update(extra_kwargs)
+
+    attempts = max(1, retries)
+    url = base_url.rstrip("/") + "/chat/completions"
+    for attempt in range(attempts):
+        try:
+            r = requests.post(url, json=payload, headers=headers, timeout=timeout)
+            if r.status_code == 200:
+                resp = extract_content(r.json())
+                if resp:
+                    return resp
+                # 200 but empty/odd shape — don't retry, nothing more to get.
+                return ""
+            log.warning("LLM call %s returned %s: %s", url, r.status_code, r.text[:200])
+        except Exception as e:
+            log.warning("LLM call attempt %d/%d failed: %s", attempt + 1, attempts, e)
+            if attempt < attempts - 1:
+                time.sleep(2 ** attempt)
+    return ""
diff --git a/codec_session.py b/codec_session.py
index db49329..342a35b 100644
--- a/codec_session.py
+++ b/codec_session.py
@@ -72,15 +72,9 @@ def strip_think(t):
     return re.sub(r"<think>.*?</think>", "", t, flags=re.DOTALL).strip()
 
 
-def extract_content(rj):
-    msg = rj["choices"][0]["message"]
-    c = msg.get("content", "").strip()
-    if c:
-        return strip_think(c)
-    r = msg.get("reasoning", "").strip()
-    if r:
-        return strip_think(r)
-    return ""
+# A-12 (PR-3E): local `extract_content` removed — its only caller was `qwen_call`,
+# now migrated to codec_llm.call (which owns the canonical content→reasoning
+# extraction). `strip_think` above is kept; qwen_stream still uses it.
 
 
 def clean_resp(text):
@@ -210,26 +204,12 @@ def screenshot_ctx(self):
                 ib = base64.b64encode(f.read()).decode()
             os.unlink(tmp.name)
             print("[C] Reading screen...")
-            import requests
-            r = requests.post(
-                self.qwen_vision_url + "/chat/completions",
-                json={
-                    "model": self.qwen_vision_model,
-                    "messages": [
-                        {
-                            "role": "user",
-                            "content": [
-                                {"type": "image_url", "image_url": {"url": "data:image/png;base64," + ib}},
-                                {"type": "text", "text": "Read all visible text. Include app name and content. Raw text only."},
-                            ],
-                        }
-                    ],
-                    "max_tokens": 800,
-                },
-                timeout=120,
-            )
-            if r.status_code == 200:
-                return r.json()["choices"][0]["message"].get("content", "")[:2000]
+            # A-11 (PR-3E): canonical vision helper. Was local-Qwen-VL only here; now
+            # tries Gemini Flash first when config enables it, local Qwen-VL as fallback.
+            import codec_vision
+            return codec_vision.describe_sync(
+                ib, "Read all visible text. Include app name and content. Raw text only.",
+                mime="image/png", max_tokens=800)[:2000]
         except Exception as e:
             log.warning(f"Screenshot capture or vision analysis failed: {e}")
         return ""
@@ -266,28 +246,14 @@ def speak(self, text):
     # ── LLM Calls ────────────────────────────────────────────────────────
 
     def qwen_call(self, messages):
-        import requests
-        headers = {"Content-Type": "application/json"}
-        if self.llm_api_key:
-            headers["Authorization"] = "Bearer " + self.llm_api_key
-        payload = {"model": self.qwen_model, "messages": messages, "max_tokens": 500, "temperature": 0.5}
-        payload.update(self.llm_kwargs)
-        for attempt in range(3):
-            try:
-                r = requests.post(
-                    self.qwen_base_url + "/chat/completions",
-                    json=payload,
-                    headers=headers,
-                    timeout=90,
-                )
-                if r.status_code == 200:
-                    resp = extract_content(r.json())
-                    if resp:
-                        return resp
-            except Exception as e:
-                log.warning(f"LLM API call attempt {attempt+1} failed: {e}")
-                time.sleep(2 ** attempt)
-        return ""
+        # A-12 (PR-3E): canonical codec_llm.call (up to 3 tries, content→reasoning, <think>
+        # strip). Unlike the old inline POST: sends enable_thinking=False, no retry on empty 200.
+        import codec_llm
+        return codec_llm.call(
+            messages, base_url=self.qwen_base_url, model=self.qwen_model,
+            api_key=self.llm_api_key, max_tokens=500, temperature=0.5,
+            timeout=90, retries=3, extra_kwargs=self.llm_kwargs,
+        )
 
     def qwen_stream(self, messages):
         import requests
diff --git a/codec_vision.py b/codec_vision.py
new file mode 100644
index 0000000..318e7b0
--- /dev/null
+++ b/codec_vision.py
@@ -0,0 +1,147 @@
+"""CODEC Vision — the single canonical screen-vision helper (A-11, PR-3E).
+
+Before this, the Gemini-Flash → local-Qwen-VL fallback was hand-rolled in three
+places with drifting shapes: `codec.py` (sync), `codec_voice._analyze_screenshot`
+(async), and `codec_session.screenshot_ctx` (sync, local-only). A model upgrade
+or vision-API fix meant editing all three.
+
+Canonical API:
+    describe_sync(image_b64, prompt, *, mime, max_tokens)        -> str
+    await describe_async(image_b64, prompt, *, mime, max_tokens, http) -> str
+
+Both: try Gemini Flash first (when `vision_provider == "gemini"` and a key is
+present), fall back to the local Qwen-VL `/chat/completions` endpoint. Return
+the description text, or "" on failure. Config is read live from codec_config
+(so provider/model/key changes + Keychain migration take effect without restart).
+"""
+from __future__ import annotations
+
+import logging
+from typing import Any, Optional, Tuple
+
+log = logging.getLogger("codec.vision")
+
+_GEMINI_MODEL = "gemini-2.0-flash"
+
+
+def _vision_config() -> Tuple[str, str, str, str]:
+    """(provider, gemini_key, local_url, local_model) read live from config.
+    Falls back to safe defaults if codec_config can't be imported."""
+    try:
+        from codec_config import cfg, QWEN_VISION_URL, QWEN_VISION_MODEL, get_gemini_api_key
+        gem = get_gemini_api_key() or ""
+        provider = cfg.get("vision_provider", "gemini" if gem else "local")
+        return provider, gem, QWEN_VISION_URL, QWEN_VISION_MODEL
+    except Exception as e:  # pragma: no cover — defensive
+        log.warning("vision config unavailable: %s", e)
+        return "local", "", "http://localhost:8082/v1", "qwen-vl"
+
+
+def _gemini_payload(image_b64: str, prompt: str, mime: str, max_tokens: int) -> dict:
+    return {
+        "contents": [{"parts": [
+            {"inlineData": {"mimeType": mime, "data": image_b64}},
+            {"text": prompt},
+        ]}],
+        "generationConfig": {"maxOutputTokens": max_tokens},
+    }
+
+
+def _gemini_url(api_key: str) -> str:
+    return (f"https://generativelanguage.googleapis.com/v1beta/models/"
+            f"{_GEMINI_MODEL}:generateContent?key={api_key}")
+
+
+def _parse_gemini(rj: dict) -> str:
+    try:
+        parts = rj.get("candidates", [])[0].get("content", {}).get("parts", [])
+        return (parts[0].get("text", "") if parts else "").strip()
+    except (IndexError, AttributeError, TypeError):
+        return ""
+
+
+def _local_payload(image_b64: str, prompt: str, mime: str, model: str, max_tokens: int) -> dict:
+    return {
+        "model": model,
+        "messages": [{"role": "user", "content": [
+            {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image_b64}"}},
+            {"type": "text", "text": prompt},
+        ]}],
+        "max_tokens": max_tokens,
+    }
+
+
+def _parse_local(rj: dict) -> str:
+    try:
+        return (rj["choices"][0]["message"].get("content") or "").strip()
+    except (KeyError, IndexError, TypeError):
+        return ""
+
+
+def describe_sync(image_b64: str, prompt: str, *, mime: str = "image/png",
+                  max_tokens: int = 800, timeout: float = 120.0) -> str:
+    """Synchronous (requests) vision describe. Gemini Flash → local Qwen-VL."""
+    import requests
+    provider, gem_key, local_url, local_model = _vision_config()
+
+    if provider == "gemini" and gem_key:
+        try:
+            r = requests.post(_gemini_url(gem_key),
+                              json=_gemini_payload(image_b64, prompt, mime, max_tokens),
+                              timeout=min(timeout, 30.0))
+            if r.status_code == 200:
+                txt = _parse_gemini(r.json())
+                if txt:
+                    return txt
+            log.info("Gemini vision %s; falling back to local", r.status_code)
+        except Exception as e:
+            log.info("Gemini vision error (%s); falling back to local", e)
+
+    try:
+        r = requests.post(local_url.rstrip("/") + "/chat/completions",
+                          json=_local_payload(image_b64, prompt, mime, local_model, max_tokens),
+                          headers={"Content-Type": "application/json"}, timeout=timeout)
+        if r.status_code == 200:
+            return _parse_local(r.json())
+        log.warning("Local vision returned %s: %s", r.status_code, r.text[:200])
+    except Exception as e:
+        log.warning("Local vision error: %s", e)
+    return ""
+
+
+async def describe_async(image_b64: str, prompt: str, *, mime: str = "image/jpeg",
+                         max_tokens: int = 500, timeout: float = 120.0,
+                         http: Optional[Any] = None) -> str:
+    """Async (httpx) vision describe. Gemini Flash → local Qwen-VL. Reuses the
+    caller's httpx client if given (e.g. VoicePipeline._http), else makes one."""
+    import httpx
+    provider, gem_key, local_url, local_model = _vision_config()
+    own_client = http is None
+    client = http or httpx.AsyncClient(timeout=timeout)
+    try:
+        if provider == "gemini" and gem_key:
+            try:
+                r = await client.post(_gemini_url(gem_key),
+                                      json=_gemini_payload(image_b64, prompt, mime, max_tokens),
+                                      timeout=min(timeout, 30.0))
+                if r.status_code == 200:
+                    txt = _parse_gemini(r.json())
+                    if txt:
+                        return txt
+                log.info("Gemini vision %s; falling back to local", r.status_code)
+            except Exception as e:
+                log.info("Gemini vision error (%s); falling back to local", e)
+
+        try:
+            r = await client.post(local_url.rstrip("/") + "/chat/completions",
+                                  json=_local_payload(image_b64, prompt, mime, local_model, max_tokens),
+                                  headers={"Content-Type": "application/json"}, timeout=timeout)
+            if r.status_code == 200:
+                return _parse_local(r.json())
+            log.warning("Local vision returned %s: %s", r.status_code, r.text[:200])
+        except Exception as e:
+            log.warning("Local vision error: %s", e)
+        return ""
+    finally:
+        if own_client:
+            await client.aclose()
diff --git a/codec_voice.py b/codec_voice.py
index 9f24251..d02de9a 100644
--- a/codec_voice.py
+++ b/codec_voice.py
@@ -666,54 +666,12 @@ async def _analyze_screenshot(self, image_b64: str, user_text: str) -> str:
             "Focus on the main content, app, or task visible. "
             "Be specific about text, UI elements, and what the user appears to be working on."
         )
-        # Try Gemini Flash first (fast, reliable)
-        if VISION_PROVIDER == "gemini" and GEMINI_API_KEY:
-            try:
-                url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"
-                payload = {
-                    "contents": [{"parts": [
-                        {"inlineData": {"mimeType": "image/jpeg", "data": image_b64}},
-                        {"text": prompt}
-                    ]}],
-                    "generationConfig": {"maxOutputTokens": 500}
-                }
-                print("[Voice] Sending to Gemini Flash vision...")
-                r = await self._http.post(url, json=payload, timeout=30.0)
-                if r.status_code == 200:
-                    candidates = r.json().get("candidates", [])
-                    if candidates:
-                        parts = candidates[0].get("content", {}).get("parts", [])
-                        if parts:
-                            result = parts[0].get("text", "").strip()
-                            if result:
-                                print(f"[Voice] Gemini vision OK: {len(result)} chars")
-                                return result
-                print(f"[Voice] Gemini failed ({r.status_code}), falling back to local...")
-            except Exception as e:
-                print(f"[Voice] Gemini error: {e}, falling back to local...")
-
-        # Fallback: local Qwen VL
-        payload = {
-            "model": VISION_MODEL,
-            "messages": [{"role": "user", "content": [
-                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
-                {"type": "text", "text": prompt},
-            ]}],
-            "max_tokens": 500,
-            "temperature": 0.7,
-        }
-        try:
-            r = await self._http.post(
-                VISION_URL, json=payload,
-                headers={"Content-Type": "application/json"},
-                timeout=120.0,
-            )
-            if r.status_code == 200:
-                return r.json()["choices"][0]["message"]["content"].strip()
-            print(f"[Voice] Vision model returned {r.status_code}: {r.text[:200]}")
-        except Exception as e:
-            print(f"[Voice] Vision analysis error: {e}")
-        return ""
+        # A-11 (PR-3E): canonical vision helper (Gemini Flash → local Qwen-VL).
+        # Reuses this pipeline's httpx client. Was an inline duplicate of the
+        # same fallback logic in codec.py + codec_session.
+        import codec_vision
+        return await codec_vision.describe_async(
+            image_b64, prompt, mime="image/jpeg", max_tokens=500, http=self._http)
 
     async def generate_response(self, user_text: str):
         self.messages.append({"role": "user", "content": user_text})
diff --git a/docs/PR3E-LLM-VISION-DEDUP-DESIGN.md b/docs/PR3E-LLM-VISION-DEDUP-DESIGN.md
new file mode 100644
index 0000000..92a5829
--- /dev/null
+++ b/docs/PR3E-LLM-VISION-DEDUP-DESIGN.md
@@ -0,0 +1,117 @@
+# PR-3E — LLM-call + vision dedup (DESIGN)
+
+**Status:** IMPLEMENTED — **Option 2** chosen (A-11 vision dedup + A-12 canonical `codec_llm` API + first chat tranche). See §8 for what actually shipped.
+**Findings:** A-11 (vision dup, MEDIUM) + A-12 (51→45 `chat/completions` sites, MEDIUM, audit-flagged **large**).
+**Wave:** 3. This is the **hottest code path in the repo** (every feature calls an LLM), so it gets design-first + a phased plan.
+
+---
+
+## 1. Reality check (what the trace found)
+
+- **`codec_llm_proxy.py` is NOT a proxy.** It's a priority *queue* (semaphore) — its own docstring: *"Does NOT proxy HTTP — callers still make their own requests."* So A-12's "the module already exists, just add `call()`/`stream()`" is **inaccurate**: there is no call/stream helper to reuse. A-12 means **building a new canonical call API** (which uses the queue internally).
+- **45 `chat/completions` sites** (was 51; some removed in earlier PRs) across **three shapes**: sync `requests`, async `httpx`, and streaming SSE — with copy-pasted headers, `Authorization: Bearer`, `enable_thinking=False`, `<think>` stripping, and `choices[0].message.content`/`.reasoning` parsing.
+- **A-11 vision = 3 divergent impls:**
+  - `codec.py` `vision_describe`/`_gemini_vision`/`_local_vision` — **sync** (`requests`), Gemini-flash → local-Qwen-VL fallback, PNG.
+  - `codec_voice._analyze_screenshot` — **async** (`httpx`), Gemini → local fallback, JPEG.
+  - `codec_session.screenshot_ctx` — **sync**, local-Qwen-VL **only** (no Gemini), PNG, with inline screencapture.
+
+## 2. Why this is high-risk
+
+These are the call paths behind voice, chat, vision, agents, bridges. A subtle
+regression in payload shape, `<think>` stripping, streaming chunk parsing,
+timeout, or error handling silently degrades a core feature. Blast radius =
+everything. So: **small, behavior-parity tranches with mocked-HTTP tests that
+assert payload/response equivalence — never a 45-site big-bang.**
+
+## 3. Recommended plan — split A-11 from A-12, phase A-12
+
+The audit lumps A-11 + A-12 as "PR-3E," but they're independent and A-12 is
+"large." Recommended:
+
+### This PR (PR-3E), as originally proposed — **A-11 vision dedup only** (contained, ~3 consumers; shipped scope is Option 2, see §8)
+- New **`codec_vision.py`**: the single canonical vision helper.
+  - `describe_sync(image_b64, prompt, *, mime="image/png", max_tokens=800) -> str`
+  - `async describe_async(image_b64, prompt, *, mime="image/jpeg", max_tokens=500, http=None) -> str`
+  - Both: Gemini-flash (if `VISION_PROVIDER=="gemini"` and key present) → local-Qwen-VL fallback, reading config (`vision_base_url`, `vision_model`, `get_gemini_api_key`). One place to change the model / provider / API shape.
+- Migrate the 3 consumers to delegate:
+  - `codec.py`: `vision_describe` → `codec_vision.describe_sync`; drop `_gemini_vision`/`_local_vision`.
+  - `codec_voice._analyze_screenshot` → `await codec_vision.describe_async(..., http=self._http)`.
+  - `codec_session.screenshot_ctx` → `codec_vision.describe_sync` (gains Gemini fallback it lacked — a minor *improvement*, behaviorally a superset; flagged in the PR).
+- **Tests:** mock HTTP; assert Gemini-first + local-fallback, payload shapes, mime handling, empty-on-failure. ~8 tests.
+- **Risk:** medium-low (vision is less hot than chat; 3 well-understood sites). Behavior parity except session gaining the Gemini fallback (documented).
+
+### Follow-on (PR-3E-2+, separate design) — **A-12 chat/completions**
+- Build **`codec_llm.py`**: `call(messages, *, model, temperature, max_tokens, priority, **kw) -> str` (sync) + `stream(...)` (SSE generator) + an async variant. Centralizes headers, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, queue-slot acquisition, timeouts, error shape.
+- Migrate the 45 sites **in small tranches by subsystem**, each its own PR with parity tests: e.g. (1) codec.py + codec_session, (2) dashboard, (3) voice, (4) agents/agent_plan/agent_runner, (5) bridges (telegram/imessage), (6) misc (compaction/self_improve/watcher/textassist/dictate). Each tranche is independently revertable.
+- The full 45-site migration is deliberately **not** in this PR — 45 hot-path sites in one diff is unreviewable + high-risk.
+
+## 4. API / schema changes
+- New module `codec_vision.py` (this PR). No on-disk schema, no config changes
+  (reuses existing `vision_*` config keys + `get_gemini_api_key`).
+- `codec.py` loses `_gemini_vision`/`_local_vision` (internal); `vision_describe`
+  kept as a thin delegate for any external caller.
+- (As proposed, A-12's `codec_llm.py` was a later PR; under the chosen Option 2 it shipped in this PR — see §8.)
+
+## 5. Test plan (this PR — A-11)
+- New `tests/test_vision_dedup.py` (shipped as `tests/test_llm_vision_dedup.py`, which also covers `codec_llm` — see §8):
+  - `describe_sync`: Gemini path returns text; Gemini failure → local fallback;
+    both fail → `""`; correct payload shape per provider; mime respected.
+  - `describe_async`: same matrix with a mocked httpx client.
+  - Source invariants: codec.py no longer defines `_gemini_vision`/`_local_vision`;
+    voice + session call `codec_vision`.
+- Regression: full suite (expect the 23 known failures, zero new). No `skills/`
+  touched → no manifest regen.
+- Manual (Mac Studio): voice "look at my screen" + a chat screenshot still
+  describe correctly via both providers.
+
+## 6. Risk + rollback
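+- **Blast radius (this PR, as proposed under Option 1):** 3 files edited + 1 new module. Vision only — chat
+  paths untouched. (Option 2, as shipped, also adds `codec_llm.py` and migrates two chat call sites — see §8.)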
+- **Rollback:** single-commit revert restores the inline impls. No persistent
+  state touched.
+- A-12 risk is deferred to its own phased PRs (each small + revertable).
+
+## 7. Open question for you (Mickael)
+**Q: scope of PR-3E?**
+- **Option 1 (recommended):** PR-3E = **A-11 vision dedup only**, now. A-12
+  (chat/completions) becomes its own phased effort with a separate design doc
+  (build `codec_llm.call/stream` + migrate sites tranche-by-tranche). Keeps every
+  PR reviewable + low-risk on the hottest path.
+- **Option 2:** PR-3E = A-11 **+** A-12's canonical `codec_llm` API **+** the
+  first chat tranche (codec.py + codec_session). Bigger, riskier single PR.
+- **Option 3:** Do A-12 API first (no A-11 yet).
+
+I recommend **Option 1**. Pick one and I'll implement + open the PR
+(chat-review-then-merge — hot path).
+
+> **Decision: Option 2.** Mickael chose A-11 + A-12-API + first chat tranche in
+> one PR. Implemented as §8 below.
+
+---
+
+## 8. Implementation (shipped — Option 2)
+
+### New modules
+- **`codec_vision.py` (A-11)** — single canonical screen-vision helper.
+  - `describe_sync(image_b64, prompt, *, mime="image/png", max_tokens=800, timeout=120.0) -> str`
+  - `async describe_async(image_b64, prompt, *, mime="image/jpeg", max_tokens=500, timeout=120.0, http=None) -> str`
+  - Both: Gemini-flash (`gemini-2.0-flash`, when `vision_provider=="gemini"` + key present) → local-Qwen-VL `/chat/completions` fallback; return `""` on total failure.
+  - `_vision_config()` reads provider/key/url/model **live** from `codec_config` each call (so provider/model/Keychain changes take effect without restart); safe defaults if `codec_config` can't import.
+  - `describe_async` reuses the caller's httpx client when passed (`http=self._http`), else makes + closes its own.
+- **`codec_llm.py` (A-12 — canonical call API)** — config-agnostic (no `codec_config` import → no import cycle).
+  - `strip_think(text)` — drops `<think>…</think>` (DOTALL) + trims.
+  - `extract_content(response_json)` — `choices[0].message.content` → `.reasoning` fallback, `<think>`-stripped, `""` on malformed shape.
+  - `call(messages, *, base_url, model, api_key="", max_tokens=500, temperature=0.7, timeout=120.0, retries=1, enable_thinking=False, extra_kwargs=None) -> str` — builds headers (`Bearer` only when `api_key`), payload (`model`/`messages`/`max_tokens`/`temperature`/`chat_template_kwargs.enable_thinking` + merged `extra_kwargs`), POSTs to `base_url.rstrip("/")+"/chat/completions"`, retries with `2**attempt` backoff, returns extracted+stripped text or `""` (never raises).
+
+### Migrated sites (this PR)
+- **`codec.py`** — `vision_describe` → `codec_vision.describe_sync` (deleted `_gemini_vision`/`_local_vision`); voice-reply chat block in `_dispatch_inner` → `codec_llm.call`. Removed now-unused imports (`QWEN_VISION_URL`, `QWEN_VISION_MODEL`, `strip_think`).
+- **`codec_voice.py`** — `_analyze_screenshot` → `await codec_vision.describe_async(..., http=self._http)`. (Module-level `VISION_PROVIDER`/`GEMINI_API_KEY` retained — still used by observer transport logic; `VISION_URL`/`VISION_MODEL` now vestigial, cleanup deferred to the voice A-12 tranche.)
+- **`codec_session.py`** — `screenshot_ctx` → `codec_vision.describe_sync` (**gains** the Gemini fallback it previously lacked — a documented behavioral superset); `qwen_call` → `codec_llm.call` (retries=3).
+
+### Deliberately deferred (follow-on tranches, each its own PR + design)
+- `codec_session.qwen_stream` (SSE streaming) — needs a `codec_llm.stream()` generator; not in this PR.
+- The remaining ~40 `chat/completions` sites (dashboard, voice generate_response, agents/agent_plan/agent_runner, bridges, compaction/self_improve/watcher/textassist/dictate) — migrated tranche-by-tranche per §3.
+
+### Tests
+- **`tests/test_llm_vision_dedup.py`** — 19 tests: `strip_think`/`extract_content` matrix; `codec_llm.call` success / no-key-omits-auth / retries-then-empty / exception-returns-empty; `codec_vision.describe_sync` gemini-first / gemini→local fallback / local-only-when-provider-local / both-fail-empty; `describe_async` gemini + fallback (driven via `asyncio.run` + a fake httpx client — no `pytest-asyncio` dependency); source-level migration invariants (codec.py/voice/session call the canonical helpers, inline impls gone).
+- Full suite: **23 known-baseline failures, zero new.** No `skills/` touched → no manifest regen.
diff --git a/docs/audits/PHASE-1-CODE-QUALITY.md b/docs/audits/PHASE-1-CODE-QUALITY.md
index b6113f1..1c16749 100644
--- a/docs/audits/PHASE-1-CODE-QUALITY.md
+++ b/docs/audits/PHASE-1-CODE-QUALITY.md
@@ -113,12 +113,16 @@ Both scan `SKILLS_DIR` independently, so a skill file is loaded twice in differe
 **Effort:** small
 
 ### A-11 — `vision_describe` / `_gemini_vision` / `_local_vision` duplicated between codec.py and codec_voice.py [MEDIUM]
+> **Closed by PR-3E** (Option 2). New canonical **`codec_vision.py`** (`describe_sync` + `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`). All three consumers now delegate: `codec.py:vision_describe` (deleted `_gemini_vision`/`_local_vision`), `codec_voice._analyze_screenshot` (async, reuses `self._http`), `codec_session.screenshot_ctx` (now **gains** the Gemini fallback it previously lacked — documented behavioral superset). A model/provider/API change is now a one-file edit. Pinned by `tests/test_llm_vision_dedup.py` (describe_sync/async provider matrix + source-invariant checks). See `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md` §8.
+
 **Location:** `codec.py:69-111` (full implementation) and `codec_voice.py:659-714` (`_analyze_screenshot` — different shape but same Gemini Flash + local Qwen VL fallback pattern, same hardcoded `gemini-2.0-flash` model name and OpenAI vision-message shape). `codec_session.py:202-238` has yet another inline `screenshot_ctx` + vision call.
 **Impact:** When the user upgrades Gemini model, switches vision provider, or fixes a vision-API regression, they have to touch 3 places. Investor-grade: violates "single source of truth" principle stated in CLAUDE.md §10 and codec_core.py docstring.
 **Recommended fix:** Move `vision_describe` + provider routing into `codec_core.py` (or a new `codec_vision.py`) as the single canonical helper. Update `codec_voice.py._analyze_screenshot` to call it; update `codec_session.py:screenshot_ctx` to call it.
 **Effort:** medium
 
 ### A-12 — 51 separate `chat/completions` HTTP call sites with copy-pasted payload shapes [MEDIUM]
+> **First tranche closed by PR-3E** (Option 2); remainder phased. The audit's premise that `codec_llm_proxy.py` already has a `call()`/`stream()` to reuse was **inaccurate** — that module is a priority *queue*, not an HTTP caller. PR-3E instead built the genuinely-new canonical **`codec_llm.py`** (`call()` non-streaming, plus `strip_think`/`extract_content`: headers, `Bearer` auth, `chat_template_kwargs.enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). **Migrated this PR:** `codec.py` voice-reply chat + `codec_session.qwen_call`. **Deferred (each its own tranche/PR + design):** `codec_session.qwen_stream` (needs `codec_llm.stream()` SSE generator) + the remaining ~40 sites (dashboard, voice `generate_response`, agents/agent_plan/agent_runner, telegram/imessage bridges, compaction/self_improve/watcher/textassist/dictate). Pinned by `tests/test_llm_vision_dedup.py`. See `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md` §3 (phased plan) + §8 (shipped).
+
 **Location:** Sample sites: `codec.py:702`, `codec_dashboard.py:980,1076,1215`, `codec_voice.py:180,196,208,213`, `codec_session.py:215,278,307`, `codec_agents.py:51`, `codec_agent_plan.py:239`, `codec_agent_runner.py:148`, `codec_compaction.py:78`, `codec_self_improve.py:238`, `codec_telegram.py:471,508`, `codec_imessage.py:341,391`, `codec_textassist.py:33`, `codec_dictate.py:492`, `codec_watcher.py:86,182`. Total: 51 occurrences via `grep -rn "chat/completions"`.
 **Impact:** Each site repeats: headers build, `Authorization: Bearer {api_key}` formatting, `{Content-Type: application/json}`, payload assembly with `chat_template_kwargs.enable_thinking=False`, `<think>` stripping, `try/except` for `r.json()` shape (`choices[0].message.content` or `.reasoning` fallback). Many also re-implement streaming SSE parsing. When the Qwen-3.6 upgrade landed, this is exactly the kind of change that needs to be applied in 20+ places.
 **Recommended fix:** Add `codec_llm_proxy.call(messages, **kwargs)` and `codec_llm_proxy.stream(messages, **kwargs)` as the single canonical API (the module already exists at `codec_llm_proxy.py`, only 130 LOC, only used by codec_voice + codec_agents). Migrate all 51 sites over the course of the Phase 1 hardening. As a first step, just covering the 5 sites in codec.py + codec_dashboard.py + codec_session.py would remove ~80% of the most-edited duplication.
diff --git a/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md b/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md
index 6c1ed9b..99ccb88 100644
--- a/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md
+++ b/docs/audits/PHASE-1-CONSOLIDATED-TRIAGE.md
@@ -242,7 +242,7 @@ Mirror the Intake Phase 3 wave pattern. 7 waves planned; sizes are PR-counts, NO
 - PR-3C: A-16 + A-17 + A-21 — wire `WAKE_PHRASES` (deduped homophone keywords + length-guarded phrase match in a testable `_is_wake_utterance`) + wire `draft_keywords` into `codec_core.is_draft` + remove dead `AGENT_NAME` constant ✅ (branch `fix/pr3c-wire-config-knobs`; 13 tests; zero net-new ruff; full suite 1338 passing). **A-4 (skill-loader unification) deliberately split out → its own PR**: it refactors the LIVE multi-file skill-dispatch path (`codec_core.load_skills` is called from codec.py + dashboard ×2 + voice + agent_runner), needs a careful voice-path test pass, and doesn't belong bundled with these contained config-wiring fixes.
 - A-4: skill-loader unification ✅ (branch `fix/pr3-a4-skill-loader-unification`, design-first per §11 → `docs/A4-SKILL-LOADER-UNIFICATION-DESIGN.md`). Deleted legacy `codec_core.{loaded_skills,load_skills,run_skill}`; codec.py + cortex_skills now use canonical `codec_dispatch` registry. Closed a real **security gap** (legacy path skipped the PR-1A AST gate) + a **hooks bypass** (voice path now fires run_with_hooks). Option A: `custom_triggers.json` now honored everywhere via SkillRegistry. 10 tests; full suite 1376 passing.
 - PR-3D: A-5 + A-6 + A-7 — extract helpers from the 3 monolithic functions (`_dispatch_inner`, `chat_completion`, `Agent.run`)
-- PR-3E: A-11 + A-12 — unify vision + 51-site `chat/completions` through `codec_llm_proxy`
+- PR-3E: A-11 + A-12 — unify vision + `chat/completions` ✅ (branch `fix/pr3e-llm-vision-dedup`, design-first per §11 → `docs/PR3E-LLM-VISION-DEDUP-DESIGN.md`; **Option 2** chosen by Mickael). **A-11 fully closed**: new `codec_vision.py` (sync+async, Gemini→local fallback, live config); all 3 consumers (codec.py/voice/session) delegate; session gains a Gemini fallback it lacked. **A-12 first tranche**: discovered `codec_llm_proxy` is a *queue*, not an HTTP caller — built genuinely-new `codec_llm.py` (`call()` + `strip_think`/`extract_content`, retry, never-raises) and migrated codec.py voice-reply chat + `codec_session.qwen_call`. **Deferred to phased follow-ons**: `qwen_stream` SSE (needs `codec_llm.stream()`) + ~40 remaining sites (dashboard/voice/agents/bridges/misc), each its own tranche. 19 tests (`tests/test_llm_vision_dedup.py`); full suite zero new failures.
 - PR-3F (optional, large): A-19 — bridge unification (iMessage + Telegram → `BridgeRouter`)
 - PR-3G: small misc ✅ (branch `fix/pr3g-small-misc-cleanup`) — closed A-9 (DISABLED overlay, ~90 LOC), A-10 (run_session_module, 33 LOC + orphan `import sys`), A-14 (close_session shadow import), A-18 (9 unused Pydantic models + dead typing import). A-13 (dashboard pattern blocker) verified **already closed by PR-2C**. 6 regression tests; zero net-new ruff (net −); full suite 1344 passing. **Deferred from this batch (each needs its own focused PR):** A-8 (codec_keyboard.py 398 LOC — verify-first delete-or-migrate decision), A-15 (config_version — additive migration feature touching `load_config`), A-20 (inline sqlite in the live dispatch path — reliability fix needing a CodecMemory method).
 - A-15: config schema versioning ✅ (branch `fix/pr3-a15-config-versioning`; `CONFIG_SCHEMA_VERSION=1` + migration ladder + idempotent atomic write-back in `load_config`; never creates-on-missing or overwrites-corrupt; 12 tests; zero net-new ruff; full suite 1356 passing).
diff --git a/tests/test_llm_vision_dedup.py b/tests/test_llm_vision_dedup.py
new file mode 100644
index 0000000..ff33677
--- /dev/null
+++ b/tests/test_llm_vision_dedup.py
@@ -0,0 +1,262 @@
+"""Tests for PR-3E — LLM-call (A-12) + vision (A-11) dedup.
+
+- codec_llm.call: the canonical chat/completions caller (headers, payload,
+  enable_thinking, <think> strip, content→reasoning extraction, retries).
+- codec_vision.describe_sync/_async: canonical Gemini-Flash → local-Qwen-VL.
+- First-tranche migrations: codec.py (vision + voice chat), codec_session
+  (vision + qwen_call), codec_voice._analyze_screenshot.
+
+Reference: docs/PR3E-LLM-VISION-DEDUP-DESIGN.md (Option 2).
+"""
+from __future__ import annotations
+
+import asyncio
+import sys
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO))
+
+import codec_llm  # noqa: E402
+import codec_vision  # noqa: E402
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+
+class _Resp:  # minimal fake HTTP response: exposes status_code, text and json()
+    def __init__(self, status, payload=None, text=""):
+        self.status_code = status
+        self._payload = payload or {}  # a missing/falsy payload becomes an empty dict
+        self.text = text
+
+    def json(self):  # returns the canned payload given at construction
+        return self._payload
+
+
+def _msg(content=None, reasoning=None):  # builds a {"choices": [{"message": ...}]} dict
+    m = {}
+    if content is not None:  # each key is only present when explicitly provided
+        m["content"] = content
+    if reasoning is not None:
+        m["reasoning"] = reasoning
+    return {"choices": [{"message": m}]}
+
+
+# ── codec_llm.extract_content + strip_think ──────────────────────────────────
+
+
+def test_strip_think():  # <think> block removed, whitespace trimmed, "" stays ""
+    assert codec_llm.strip_think("<think>plan</think>answer") == "answer"
+    assert codec_llm.strip_think("  hi  ") == "hi"
+    assert codec_llm.strip_think("") == ""
+
+
+def test_extract_content_prefers_content():  # non-empty content is returned as-is
+    assert codec_llm.extract_content(_msg(content="hello")) == "hello"
+
+
+def test_extract_content_reasoning_fallback():  # empty content → reasoning is used
+    assert codec_llm.extract_content(_msg(content="", reasoning="fallback")) == "fallback"
+
+
+def test_extract_content_strips_think():  # <think> block is stripped from content
+    assert codec_llm.extract_content(_msg(content="<think>x</think>real")) == "real"
+
+
+def test_extract_content_bad_shape_returns_empty():  # malformed response → ""
+    assert codec_llm.extract_content({}) == ""  # no "choices" key at all
+    assert codec_llm.extract_content({"choices": []}) == ""  # empty choices list
+
+
+# ── codec_llm.call ───────────────────────────────────────────────────────────
+
+
+def test_call_success(monkeypatch):  # happy path: pins URL, auth header and payload
+    captured = {}
+
+    def fake_post(url, json=None, headers=None, timeout=None):  # records the request
+        captured["url"] = url
+        captured["json"] = json
+        captured["headers"] = headers
+        return _Resp(200, _msg(content="42"))
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    out = codec_llm.call([{"role": "user", "content": "q"}],
+                         base_url="http://x/v1", model="qwen", api_key="k",
+                         max_tokens=400, temperature=0.7, extra_kwargs={"top_p": 0.9})
+    assert out == "42"
+    assert captured["url"] == "http://x/v1/chat/completions"
+    assert captured["headers"]["Authorization"] == "Bearer k"
+    p = captured["json"]
+    assert p["model"] == "qwen"
+    assert p["max_tokens"] == 400 and p["temperature"] == 0.7
+    assert p["chat_template_kwargs"] == {"enable_thinking": False}  # off when not requested
+    assert p["top_p"] == 0.9  # extra_kwargs merged
+
+
+def test_call_no_api_key_omits_auth(monkeypatch):  # no api_key → no Authorization header
+    captured = {}
+
+    def fake_post(url, json=None, headers=None, timeout=None):  # records headers only
+        captured["headers"] = headers
+        return _Resp(200, _msg(content="ok"))
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    codec_llm.call([{"role": "user", "content": "q"}], base_url="http://x/v1", model="m")
+    assert "Authorization" not in captured["headers"]
+
+
+def test_call_retries_then_empty(monkeypatch):  # HTTP 500 on every attempt → ""
+    calls = {"n": 0}
+
+    def fake_post(url, json=None, headers=None, timeout=None):  # counts attempts
+        calls["n"] += 1
+        return _Resp(500, text="err")
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None)  # no real backoff
+    out = codec_llm.call([{"role": "user", "content": "q"}],
+                         base_url="http://x/v1", model="m", retries=3)
+    assert out == ""
+    assert calls["n"] == 3  # all attempts used
+
+
+def test_call_exception_returns_empty(monkeypatch):  # transport error → "" (no raise)
+    def fake_post(*a, **k):  # every POST fails at the transport level
+        raise ConnectionError("down")
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    monkeypatch.setattr(codec_llm.time, "sleep", lambda *_: None)  # skip backoff waits
+    assert codec_llm.call([{"role": "user", "content": "q"}],
+                          base_url="http://x/v1", model="m", retries=2) == ""
+
+
+# ── codec_vision.describe_sync ───────────────────────────────────────────────
+
+
+def test_describe_sync_gemini_first(monkeypatch):  # Gemini-shaped 200 is parsed + returned
+    monkeypatch.setattr(codec_vision, "_vision_config",
+                        lambda: ("gemini", "gemkey", "http://local/v1", "qwen-vl"))
+    captured = {}
+
+    def fake_post(url, json=None, headers=None, timeout=None):  # records the URL hit
+        captured["url"] = url
+        # Gemini response shape
+        return _Resp(200, {"candidates": [{"content": {"parts": [{"text": "a chart"}]}}]})
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    out = codec_vision.describe_sync("b64", "what is this?", mime="image/png")
+    assert out == "a chart"
+    assert "generativelanguage.googleapis.com" in captured["url"]
+
+
+def test_describe_sync_falls_back_to_local(monkeypatch):  # Gemini 500 → local answers
+    monkeypatch.setattr(codec_vision, "_vision_config",
+                        lambda: ("gemini", "gemkey", "http://local/v1", "qwen-vl"))
+    seen = []
+
+    def fake_post(url, json=None, headers=None, timeout=None):  # routes on the URL
+        seen.append(url)
+        if "googleapis" in url:
+            return _Resp(500, text="gemini down")
+        return _Resp(200, _msg(content="local says hi"))
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    out = codec_vision.describe_sync("b64", "p")
+    assert out == "local says hi"
+    assert any("googleapis" in u for u in seen) and any("local/v1" in u for u in seen)
+
+
+def test_describe_sync_local_only_when_provider_local(monkeypatch):  # Gemini is skipped
+    monkeypatch.setattr(codec_vision, "_vision_config",
+                        lambda: ("local", "", "http://local/v1", "qwen-vl"))
+    seen = []
+
+    def fake_post(url, json=None, headers=None, timeout=None):  # logs every URL hit
+        seen.append(url)
+        return _Resp(200, _msg(content="local"))
+
+    import requests
+    monkeypatch.setattr(requests, "post", fake_post)
+    out = codec_vision.describe_sync("b64", "p")
+    assert out == "local"
+    assert not any("googleapis" in u for u in seen)  # Gemini never tried
+
+
+def test_describe_sync_both_fail_returns_empty(monkeypatch):  # all POSTs 500 → ""
+    monkeypatch.setattr(codec_vision, "_vision_config",
+                        lambda: ("gemini", "k", "http://local/v1", "m"))
+    import requests
+    monkeypatch.setattr(requests, "post", lambda *a, **k: _Resp(500, text="x"))
+    assert codec_vision.describe_sync("b64", "p") == ""
+
+
+# ── codec_vision.describe_async ──────────────────────────────────────────────
+
+
+class _FakeAsyncClient:  # async client stub; post() defers to a sync handler
+    def __init__(self, handler):
+        self._handler = handler
+
+    async def post(self, url, json=None, headers=None, timeout=None):
+        return self._handler(url, json)  # headers/timeout are accepted but not forwarded
+
+
+def test_describe_async_gemini(monkeypatch):  # Gemini-shaped 200 via the injected client
+    monkeypatch.setattr(codec_vision, "_vision_config",
+                        lambda: ("gemini", "k", "http://local/v1", "m"))
+
+    def handler(url, json):  # always answers with a Gemini-shaped body
+        return _Resp(200, {"candidates": [{"content": {"parts": [{"text": "async vision"}]}}]})
+
+    out = asyncio.run(codec_vision.describe_async("b64", "p", http=_FakeAsyncClient(handler)))
+    assert out == "async vision"
+
+
+def test_describe_async_fallback(monkeypatch):  # Gemini 500 → local endpoint answers
+    monkeypatch.setattr(codec_vision, "_vision_config",
+                        lambda: ("gemini", "k", "http://local/v1", "m"))
+
+    def handler(url, json):  # routes on the URL: googleapis fails, anything else succeeds
+        if "googleapis" in url:
+            return _Resp(500)
+        return _Resp(200, _msg(content="local async"))
+
+    out = asyncio.run(codec_vision.describe_async("b64", "p", http=_FakeAsyncClient(handler)))
+    assert out == "local async"
+
+
+# ── source-level migration invariants ───────────────────────────────────────
+
+
+def test_codec_vision_is_single_source():  # codec.py must delegate to codec_vision
+    src = (REPO / "codec.py").read_text(encoding="utf-8")  # file has non-ASCII comments
+    assert "def _gemini_vision" not in src and "def _local_vision" not in src
+    assert "codec_vision.describe_sync" in src
+
+
+def test_codec_chat_uses_codec_llm():  # codec.py chat goes through codec_llm.call
+    src = (REPO / "codec.py").read_text(encoding="utf-8")  # file has non-ASCII comments
+    assert "codec_llm.call(" in src
+    # No inline chat/completions POST left in _dispatch_inner's LLM block
+    assert 'f"{QWEN_BASE_URL}/chat/completions"' not in src
+
+
+def test_voice_uses_codec_vision():  # codec_voice.py must call describe_async
+    src = (REPO / "codec_voice.py").read_text(encoding="utf-8")  # locale-independent decode
+    assert "codec_vision.describe_async" in src
+    # Inline gemini URL gone from the analyze path
+    assert "generativelanguage.googleapis.com" not in src
+
+
+def test_session_uses_canonical_helpers():  # codec_session.py uses both canonical helpers
+    src = (REPO / "codec_session.py").read_text(encoding="utf-8")  # locale-independent decode
+    assert "codec_llm.call(" in src
+    assert "codec_vision.describe_sync" in src