BackendStack21 · jkyberneees · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -15,12 +15,49 @@ RUN go mod download
 COPY . .
 RUN CGO_ENABLED=0 go build -ldflags "-s -w" -o /out/odek ./cmd/odek
 
+# ---- whisper stage ----
+# Build whisper.cpp's CLI and fetch the `tiny` model so the `transcribe` tool
+# (and Telegram voice auto-transcription) work out of the box — no host
+# install, no first-run model download. Same alpine base as the runtime stage
+# so the musl ABI matches; OpenMP is off so the only runtime dep is libstdc++,
+# and GGML_NATIVE is off so the binary is not tuned (-march=native) to the
+# build host's CPU and keeps running on the machines that pull the image.
+# Build args: WHISPER_MODEL (tiny | base | small | medium; size and RAM grow
+# accordingly), WHISPER_VERSION (pinned release; bump deliberately, not master).
+FROM alpine:latest AS whisper
+ARG WHISPER_MODEL=tiny
+ARG WHISPER_VERSION=v1.8.6
+RUN apk add --no-cache git cmake make g++ musl-dev curl
+RUN git clone --depth 1 --branch "${WHISPER_VERSION}" https://github.com/ggerganov/whisper.cpp /whisper
+WORKDIR /whisper
+RUN cmake -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DBUILD_SHARED_LIBS=OFF \
+      -DGGML_NATIVE=OFF \
+      -DGGML_OPENMP=OFF \
+      -DWHISPER_BUILD_TESTS=OFF -DWHISPER_BUILD_EXAMPLES=ON \
+ && cmake --build build -j "$(nproc)" --target whisper-cli
+# Fetch the ggml model into a fixed image path (NOT under ~/.odek, which the
+# Telegram compose profiles bind-mount over — that would hide a model baked
+# there). The runtime config points transcription.models_dir at this path.
+RUN mkdir -p /models \
+ && curl -fsSL -o "/models/ggml-${WHISPER_MODEL}.bin" \
+      "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${WHISPER_MODEL}.bin"
+
 # ---- runtime stage ----
 FROM alpine:latest
 # Tooling the agent commonly needs inside the sandbox container.
 # Trim or extend this list to taste. git + the GitHub CLI (`gh`, from the
 # Alpine community repo) are included so the agent can clone/PR/release.
-RUN apk add --no-cache ca-certificates git github-cli bash coreutils curl jq
+# ffmpeg converts Telegram's OGG/Opus voice notes to WAV for whisper.cpp;
+# libstdc++ is the only shared lib the bundled whisper-cli needs at runtime.
+RUN apk add --no-cache ca-certificates git github-cli bash coreutils curl jq ffmpeg libstdc++
+
+# Bundle the whisper CLI + model from the whisper stage so `transcribe` works
+# with zero setup. whisper-cli lands on PATH; the model goes to a stable image
+# path that the runtime config (transcription.models_dir) points at.
+COPY --from=whisper /whisper/build/bin/whisper-cli /usr/local/bin/whisper-cli
+COPY --from=whisper /models/ /usr/local/share/whisper/models/
 
 # ── Adding extra dependencies the agent can use ──────────────────────────
 # The agent runs shell commands INSIDE this image, so any runtime or CLI it

diff --git a/docker/README.md b/docker/README.md
@@ -134,6 +134,22 @@ start (non-zero exit, "another schedule daemon is already running") when the bot
 holds it. In the reverse order (daemon up first), the bot's embedded scheduler
 just defers silently.
 
+## Voice transcription (out of the box)
+
+The image **bundles whisper.cpp's CLI and the `tiny` ggml model**, plus `ffmpeg`
+for OGG/Opus → WAV conversion — so the `transcribe` tool and Telegram voice
+auto-transcription work with zero setup. No host install, no first-run download.
+
+- The model ships at `/usr/local/share/whisper/models/ggml-tiny.bin`, and both
+  `config.restricted.json` and `config.godmode.json` point
+  `transcription.models_dir` there. (It lives outside `~/.odek` on purpose — the
+  Telegram profiles bind-mount `./.odek`, which would otherwise shadow it.)
+- Send the bot a voice note → it's transcribed locally and handed to the agent
+  as text. `auto_transcribe` is on by default in the bundled configs.
+- Want a more accurate (larger) model? Rebuild with
+  `--build-arg WHISPER_MODEL=base` (or `small` / `medium`) — this bakes in
+  `ggml-base.bin` instead — and set the `model` field in the config to match.
+
 ## Verify the profiles differ
 
 - **Restricted**: ask it to `rm -rf` everything in `/workspace` → denied, never runs.

diff --git a/docker/config.godmode.json b/docker/config.godmode.json
@@ -7,6 +7,11 @@
   "skills": {
     "verbose": true
   },
+  "transcription": {
+    "model": "tiny",
+    "auto_transcribe": true,
+    "models_dir": "/usr/local/share/whisper/models"
+  },
   "dangerous": {
     "action": "allow",
     "non_interactive": "allow"

diff --git a/docker/config.restricted.json b/docker/config.restricted.json
@@ -7,6 +7,11 @@
   "skills": {
     "verbose": true
   },
+  "transcription": {
+    "model": "tiny",
+    "auto_transcribe": true,
+    "models_dir": "/usr/local/share/whisper/models"
+  },
   "dangerous": {
     "non_interactive": "deny",
     "classes": {

diff --git a/docs/TELEGRAM.md b/docs/TELEGRAM.md
@@ -312,6 +312,8 @@ Voice message received → DownloadVoice (OGG Opus to disk)
 
 **Fallback:** If auto-transcribe fails (ffmpeg unavailable, corrupt audio, whisper error), the agent receives the file path with a suggestion to use the `transcribe()` tool manually.
 
+**Docker:** the official image bundles the whisper.cpp CLI, the `tiny` model, and ffmpeg, with `auto_transcribe` enabled in the shipped configs — so voice transcription works out of the box with no host install. See [../docker/README.md](../docker/README.md#voice-transcription-out-of-the-box).
+
 ### Tool Progress (Narrator)
 
 Tool progress shows what the agent is doing in real time. Controlled by the `tool_progress` config field (independent from `interaction_mode`):