Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ docs/ API.md, MCP_HTTP_SETUP.md, CONTEXT_REPORT.md, desig

Other engine modules (`codec_overlays`, `codec_metrics`, `codec_logging`, `codec_gdocs`, `codec_google_auth`, `codec_cdp`, `codec_llm_proxy`, `codec_retry`, `codec_alerts`, `codec_search`, `codec_textassist`, `codec_watcher`, `codec_watchdog`) are internal helpers — read them when you need them, but they're not part of the navigation surface for an agent making structural changes. (Keyboard handling — wake word, F13 toggle, F18 voice, double-tap — lives **inline in `codec.py`** in the `codec` PM2 process; the old standalone `codec_keyboard.py` was deleted as a dead duplicate per A-8.)

**Canonical LLM + vision helpers (PR-3E, A-11/A-12).** `codec_vision.py` is the SINGLE source for screen-vision (`describe_sync` / `describe_async`, Gemini-flash → local-Qwen-VL fallback, config read live from `codec_config`) — used by `codec.py`, `codec_voice`, `codec_session`. `codec_llm.py` is the canonical chat/completions caller (`call()` + `strip_think`/`extract_content` — headers, Bearer auth, `enable_thinking`, `<think>` strip, `choices/reasoning` parse, retry+backoff, never-raises). NOTE: `codec_llm_proxy.py` is a priority *queue* (semaphore), NOT an HTTP caller — don't confuse the two. A-12 is migrating the ~45 inline `chat/completions` sites onto `codec_llm` in phased tranches; codec.py voice-reply + `codec_session.qwen_call` are done, streaming (`codec_llm.stream()`) + the rest are pending.

## 3. Agent + Crew runtime

CODEC has its own minimalist multi-agent runtime in `codec_agents.py`. **Zero dependency on CrewAI or LangChain** — it's self-contained, only depends on `requests` and `codec_skill_registry`.
Expand Down
136 changes: 44 additions & 92 deletions codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# ── CONFIG (single source of truth: codec_config.py) ─────────────────────────
from codec_config import (
cfg as _cfg,
QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS, QWEN_VISION_URL, QWEN_VISION_MODEL,
QWEN_BASE_URL, QWEN_MODEL, LLM_API_KEY, LLM_KWARGS,
WHISPER_URL,
TASK_QUEUE_FILE, DRAFT_TASK_FILE, SESSION_ALIVE, STREAMING, WAKE_WORD, WAKE_ENERGY, WAKE_CHUNK_SEC,
WAKE_PHRASES,
Expand Down Expand Up @@ -71,7 +71,7 @@ def _is_wake_utterance(text: str) -> bool:
# ─��� SHARED (from codec_core.py — single source of truth) ─────────────────────
import codec_core as _core
from codec_core import (
strip_think, is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations,
is_draft, init_db, save_task, update_session_response, get_memory, get_recent_conversations,
transcribe, speak_text, focused_app, get_text_dialog,
terminal_session_exists,
# A-14 (PR-3G): `close_session` import dropped — codec.py defines its own
Expand All @@ -96,50 +96,16 @@ def _is_wake_utterance(text: str) -> bool:
# safety gate AND plugin lifecycle hooks (run_with_hooks), both of which the
# legacy path bypassed.

# ── VISION (Gemini Flash or local Qwen VL) ──────────────────────────────────
def _gemini_vision(img_b64, prompt, max_tokens=800):
"""Call Gemini Flash vision API. Fast, reliable, free tier."""
import requests
url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"
payload = {
"contents": [{"parts": [
{"inlineData": {"mimeType": "image/png", "data": img_b64}},
{"text": prompt}
]}],
"generationConfig": {"maxOutputTokens": max_tokens}
}
r = requests.post(url, json=payload, timeout=30)
if r.status_code == 200:
candidates = r.json().get("candidates", [])
if candidates:
parts = candidates[0].get("content", {}).get("parts", [])
if parts:
return parts[0].get("text", "").strip()
else:
print(f"[CODEC] Gemini error {r.status_code}: {r.text[:200]}")
return ""

def _local_vision(img_b64, prompt, max_tokens=800):
"""Call local Qwen VL vision API (fallback)."""
import requests
r = requests.post(f"{QWEN_VISION_URL}/chat/completions",
json={"model": QWEN_VISION_MODEL,
"messages": [{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
{"type": "text", "text": prompt}
]}], "max_tokens": max_tokens}, timeout=60)
if r.status_code == 200:
return r.json()["choices"][0]["message"].get("content", "").strip()
return ""
# ── VISION (A-11, PR-3E: canonical helper in codec_vision) ──────────────────
# The Gemini-Flash → local-Qwen-VL fallback used to be hand-rolled here (and in
# codec_voice + codec_session). It now lives in codec_vision; this is a thin
# delegate kept for any caller of codec.vision_describe.
import codec_vision
import codec_llm # A-12: canonical chat/completions caller

def vision_describe(img_b64, prompt="Read all visible text on this screen. Include app name, window title, and all message/content text. Output raw text only.", max_tokens=800):
"""Route vision to Gemini or local based on config."""
if VISION_PROVIDER == "gemini" and GEMINI_API_KEY:
result = _gemini_vision(img_b64, prompt, max_tokens)
if result:
return result
print("[CODEC] Gemini failed, falling back to local vision...")
return _local_vision(img_b64, prompt, max_tokens)
"""Route vision to Gemini or local based on config (codec_vision)."""
return codec_vision.describe_sync(img_b64, prompt, mime="image/png", max_tokens=max_tokens)

def screenshot_ctx():
try:
Expand Down Expand Up @@ -444,55 +410,41 @@ def _post_skill_screenshot():

push(lambda: show_processing_overlay('Thinking...', 15000))
try:
import requests as _llm_req
headers = {}
if LLM_API_KEY:
headers["Authorization"] = f"Bearer {LLM_API_KEY}"
payload = {
"model": QWEN_MODEL,
"messages": llm_messages,
"max_tokens": 400,
"temperature": 0.7,
"chat_template_kwargs": {"enable_thinking": False},
}
payload.update(LLM_KWARGS)
r = _llm_req.post(f"{QWEN_BASE_URL}/chat/completions", json=payload, headers=headers, timeout=120)
if r.status_code == 200:
data = r.json()
answer = data.get("choices", [{}])[0].get("message", {}).get("content", "")
answer = strip_think(answer).strip()
if answer:
print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}")
log_event("tts_speak", "open-codec",
f"TTS: {answer[:60]}",
extra={"text_len": len(answer)})
# Add assistant response to session history
voice_session["messages"].append({"role": "assistant", "content": answer})
voice_session["turn_count"] += 1
# Save response to DB (A-20: via codec_core helper with
# WAL + busy_timeout — replaces the inline lock-prone
# sqlite3.connect that risked "database is locked" under
# concurrent agent-runner + voice writes). Never raises.
update_session_response(rid, answer[:500])
# Save to shared memory (same store as Chat)
try:
cm = CodecMemory()
cm.save("voice", "user", task)
cm.save("voice", "assistant", answer)
except Exception as e:
log.warning(f"[CODEC] Memory save failed after LLM: {e}")
_last_tts_text = answer[:200]
speak_text(answer)
_safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"')
subprocess.Popen(["osascript", "-e",
f'display notification "{_safe_ans}" with title "CODEC"'],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
else:
print("[CODEC] Voice LLM returned empty response")
speak_text("Sorry, I didn't get a response.")
# A-12 (PR-3E): canonical codec_llm.call replaces the inline
# chat/completions POST + headers + enable_thinking + <think> strip +
# choices parse. Returns the stripped answer, or "" on any failure
# (non-200 and empty now collapse to the same apology).
answer = codec_llm.call(
llm_messages, base_url=QWEN_BASE_URL, model=QWEN_MODEL,
api_key=LLM_API_KEY, max_tokens=400, temperature=0.7,
timeout=120, retries=1, extra_kwargs=LLM_KWARGS,
)
if answer:
print(f"[CODEC] Voice reply (turn {voice_session['turn_count']+1}): {answer[:120]}")
log_event("tts_speak", "open-codec",
f"TTS: {answer[:60]}",
extra={"text_len": len(answer)})
# Add assistant response to session history
voice_session["messages"].append({"role": "assistant", "content": answer})
voice_session["turn_count"] += 1
# Save response to DB (A-20: codec_core helper, WAL + busy_timeout).
update_session_response(rid, answer[:500])
# Save to shared memory (same store as Chat)
try:
cm = CodecMemory()
cm.save("voice", "user", task)
cm.save("voice", "assistant", answer)
except Exception as e:
log.warning(f"[CODEC] Memory save failed after LLM: {e}")
_last_tts_text = answer[:200]
speak_text(answer)
_safe_ans = answer[:80].replace('\\', '\\\\').replace('"', '\\"')
subprocess.Popen(["osascript", "-e",
f'display notification "{_safe_ans}" with title "CODEC"'],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
else:
print(f"[CODEC] Voice LLM error: {r.status_code} {r.text[:200]}")
speak_text("Sorry, the language model is not responding.")
print("[CODEC] Voice LLM returned no response")
speak_text("Sorry, I didn't get a response.")
except Exception as e:
log.error("Voice LLM call failed: %s", e)
import traceback; traceback.print_exc()
Expand Down
108 changes: 108 additions & 0 deletions codec_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""CODEC LLM call helper — the single canonical OpenAI-style chat/completions caller.

A-12 (PR-3E): before this, ~45 sites hand-rolled the same `chat/completions`
POST — build headers (`Authorization: Bearer …`, `Content-Type`), assemble the
payload (`model`/`messages`/`max_tokens`/`temperature`/
`chat_template_kwargs.enable_thinking=False`), parse `choices[0].message`
(content, with a `reasoning` fallback), and strip `<think>…</think>`. A model
upgrade or API-shape fix then meant editing 20+ places.

This module centralizes the **non-streaming** call. It is intentionally
config-agnostic — each caller passes its own `base_url` / `model` / `api_key`
/ tuning — so it's a pure "build payload → POST → parse" helper with no import
cycle into codec_config. (Streaming SSE + the remaining call sites are migrated
in later A-12 tranches; this PR covers the call() API + codec.py + codec_session.)

NOTE: `codec_llm_proxy` is a *priority queue* (semaphore), not an HTTP proxy —
orthogonal to this module. Callers that want prioritization still wrap the call
in `llm_queue_sync(...)`; behavior parity for the migrated sites means we do NOT
add queue acquisition here (none of them used it).
"""
from __future__ import annotations

import logging
import re
import time
from typing import Any, Dict, List, Optional

log = logging.getLogger("codec.llm")

_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)


def strip_think(text: str) -> str:
"""Remove <think>…</think> reasoning blocks and surrounding whitespace."""
if not text:
return ""
return _THINK_RE.sub("", text).strip()


def extract_content(response_json: Dict[str, Any]) -> str:
"""Pull the assistant text from an OpenAI-style response: prefer
`choices[0].message.content`, fall back to `.reasoning` (some local
servers put the answer there when content is empty). `<think>` stripped.
Returns "" on any shape mismatch."""
try:
msg = response_json["choices"][0]["message"]
except (KeyError, IndexError, TypeError):
return ""
content = (msg.get("content") or "").strip()
if content:
return strip_think(content)
reasoning = (msg.get("reasoning") or "").strip()
if reasoning:
return strip_think(reasoning)
return ""


def call(
messages: List[Dict[str, Any]],
*,
base_url: str,
model: str,
api_key: str = "",
max_tokens: int = 500,
temperature: float = 0.7,
timeout: float = 120.0,
retries: int = 1,
enable_thinking: bool = False,
extra_kwargs: Optional[Dict[str, Any]] = None,
) -> str:
"""POST `messages` to `<base_url>/chat/completions` and return the parsed,
`<think>`-stripped assistant text (or "" on failure).

`retries` includes the first attempt (retries=3 → up to 3 tries with
exponential 2**n backoff between them, matching codec_session.qwen_call).
Never raises — network/parse errors are logged and yield "".
"""
import requests
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = "Bearer " + api_key
payload: Dict[str, Any] = {
"model": model,
"messages": messages,
"max_tokens": max_tokens,
"temperature": temperature,
"chat_template_kwargs": {"enable_thinking": enable_thinking},
}
if extra_kwargs:
payload.update(extra_kwargs)

attempts = max(1, retries)
url = base_url.rstrip("/") + "/chat/completions"
for attempt in range(attempts):
try:
r = requests.post(url, json=payload, headers=headers, timeout=timeout)
if r.status_code == 200:
resp = extract_content(r.json())
if resp:
return resp
# 200 but empty/odd shape — don't retry, nothing more to get.
return ""
log.warning("LLM call %s returned %s: %s", url, r.status_code, r.text[:200])
except Exception as e:
log.warning("LLM call attempt %d/%d failed: %s", attempt + 1, attempts, e)
if attempt < attempts - 1:
time.sleep(2 ** attempt)
return ""
68 changes: 17 additions & 51 deletions codec_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,9 @@ def strip_think(t):
return re.sub(r"<think>.*?</think>", "", t, flags=re.DOTALL).strip()


def extract_content(rj):
msg = rj["choices"][0]["message"]
c = msg.get("content", "").strip()
if c:
return strip_think(c)
r = msg.get("reasoning", "").strip()
if r:
return strip_think(r)
return ""
# A-12 (PR-3E): local `extract_content` removed — its only caller was `qwen_call`,
# now migrated to codec_llm.call (which owns the canonical content→reasoning
# extraction). `strip_think` above is kept; qwen_stream still uses it.


def clean_resp(text):
Expand Down Expand Up @@ -210,26 +204,12 @@ def screenshot_ctx(self):
ib = base64.b64encode(f.read()).decode()
os.unlink(tmp.name)
print("[C] Reading screen...")
import requests
r = requests.post(
self.qwen_vision_url + "/chat/completions",
json={
"model": self.qwen_vision_model,
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64," + ib}},
{"type": "text", "text": "Read all visible text. Include app name and content. Raw text only."},
],
}
],
"max_tokens": 800,
},
timeout=120,
)
if r.status_code == 200:
return r.json()["choices"][0]["message"].get("content", "")[:2000]
# A-11 (PR-3E): canonical vision helper. Was local-Qwen-VL only here;
# now gains the Gemini-Flash fallback for free (config-gated).
import codec_vision
return codec_vision.describe_sync(
ib, "Read all visible text. Include app name and content. Raw text only.",
mime="image/png", max_tokens=800)[:2000]
except Exception as e:
log.warning(f"Screenshot capture or vision analysis failed: {e}")
return ""
Expand Down Expand Up @@ -266,28 +246,14 @@ def speak(self, text):
# ── LLM Calls ────────────────────────────────────────────────────────

def qwen_call(self, messages):
import requests
headers = {"Content-Type": "application/json"}
if self.llm_api_key:
headers["Authorization"] = "Bearer " + self.llm_api_key
payload = {"model": self.qwen_model, "messages": messages, "max_tokens": 500, "temperature": 0.5}
payload.update(self.llm_kwargs)
for attempt in range(3):
try:
r = requests.post(
self.qwen_base_url + "/chat/completions",
json=payload,
headers=headers,
timeout=90,
)
if r.status_code == 200:
resp = extract_content(r.json())
if resp:
return resp
except Exception as e:
log.warning(f"LLM API call attempt {attempt+1} failed: {e}")
time.sleep(2 ** attempt)
return ""
# A-12 (PR-3E): canonical codec_llm.call (3 retries + backoff, content→
# reasoning extraction, <think> strip) — was an inline chat/completions POST.
import codec_llm
return codec_llm.call(
messages, base_url=self.qwen_base_url, model=self.qwen_model,
api_key=self.llm_api_key, max_tokens=500, temperature=0.5,
timeout=90, retries=3, extra_kwargs=self.llm_kwargs,
)

def qwen_stream(self, messages):
import requests
Expand Down
Loading
Loading