diff --git a/docker/Dockerfile b/docker/Dockerfile
index 64c1092..b610567 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,12 +15,59 @@ RUN go mod download
 COPY . .
 RUN CGO_ENABLED=0 go build -ldflags "-s -w" -o /out/odek ./cmd/odek
 
+# ---- whisper stage ----
+# Build whisper.cpp's CLI and fetch the `tiny` model so the `transcribe` tool
+# (and Telegram voice auto-transcription) work out of the box — no host
+# install, no first-run model download. Same alpine base as the runtime stage
+# so the musl ABI matches; OpenMP is disabled to keep the runtime dependency
+# surface down to just libstdc++. To ship a different model, override the build
+# arg: `--build-arg WHISPER_MODEL=base` (tiny | base | small | medium) — size
+# and RAM grow accordingly. WHISPER_VERSION pins the whisper.cpp source to a
+# release tag — bump it deliberately rather than tracking master. The tag pins
+# the source only: `alpine:latest` and the model URL (`resolve/main`) are
+# floating references, so use WHISPER_MODEL_SHA256 to lock the model file.
+FROM alpine:latest AS whisper
+ARG WHISPER_MODEL=tiny
+ARG WHISPER_VERSION=v1.8.6
+RUN apk add --no-cache git cmake make g++ musl-dev curl
+RUN git clone --depth 1 --branch "${WHISPER_VERSION}" https://github.com/ggerganov/whisper.cpp /whisper
+WORKDIR /whisper
+RUN cmake -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DGGML_OPENMP=OFF \
+    -DWHISPER_BUILD_TESTS=OFF \
+    -DWHISPER_BUILD_EXAMPLES=ON \
+    && cmake --build build -j "$(nproc)" --target whisper-cli
+# Fetch the ggml model into a fixed image path (NOT under ~/.odek, which the
+# Telegram compose profiles bind-mount over — that would hide a model baked
+# there). The runtime config points transcription.models_dir at this path.
+# The URL follows the Hugging Face `main` ref rather than a fixed revision, so
+# pass `--build-arg WHISPER_MODEL_SHA256=<hex digest>` to fail the build when
+# the download is not the expected file. It defaults to empty (check skipped)
+# because the digest differs for every WHISPER_MODEL value.
+ARG WHISPER_MODEL_SHA256=""
+RUN mkdir -p /models \
+    && curl -fsSL -o "/models/ggml-${WHISPER_MODEL}.bin" \
+        "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${WHISPER_MODEL}.bin" \
+    && if [ -n "${WHISPER_MODEL_SHA256}" ]; then \
+        echo "${WHISPER_MODEL_SHA256}  /models/ggml-${WHISPER_MODEL}.bin" | sha256sum -c -; \
+    fi
+
 # ---- runtime stage ----
 FROM alpine:latest
 # Tooling the agent commonly needs inside the sandbox container.
 # Trim or extend this list to taste. git + the GitHub CLI (`gh`, from the
 # Alpine community repo) are included so the agent can clone/PR/release.
-RUN apk add --no-cache ca-certificates git github-cli bash coreutils curl jq
+# ffmpeg converts Telegram's OGG/Opus voice notes to WAV for whisper.cpp;
+# libstdc++ is the only shared lib the bundled whisper-cli needs at runtime.
+RUN apk add --no-cache ca-certificates git github-cli bash coreutils curl jq ffmpeg libstdc++
+
+# Bundle the whisper CLI + model from the whisper stage so `transcribe` works
+# with zero setup. whisper-cli lands on PATH; the model goes to a stable image
+# path that the runtime config (transcription.models_dir) points at.
+COPY --from=whisper /whisper/build/bin/whisper-cli /usr/local/bin/whisper-cli
+COPY --from=whisper /models/ /usr/local/share/whisper/models/
 
 # ── Adding extra dependencies the agent can use ──────────────────────────
 # The agent runs shell commands INSIDE this image, so any runtime or CLI it
diff --git a/docker/README.md b/docker/README.md
index 2202143..e6d367f 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -134,6 +134,22 @@ start (non-zero exit, "another schedule daemon is already running") when the
 bot holds it. In the reverse order (daemon up first), the bot's embedded
 scheduler just defers silently.
 
+## Voice transcription (out of the box)
+
+The image **bundles whisper.cpp's CLI and the `tiny` ggml model**, plus `ffmpeg`
+for OGG/Opus → WAV conversion — so the `transcribe` tool and Telegram voice
+auto-transcription work with zero setup. No host install, no first-run download.
+
+- The model ships at `/usr/local/share/whisper/models/ggml-tiny.bin`, and both
+  `config.restricted.json` and `config.godmode.json` point
+  `transcription.models_dir` there. (It lives outside `~/.odek` on purpose — the
+  Telegram profiles bind-mount `./.odek`, which would otherwise shadow it.)
+- Send the bot a voice note → it's transcribed locally and handed to the agent
+  as text. `auto_transcribe` is on by default in the bundled configs.
+- Want a more accurate (larger) model? Rebuild with
+  `--build-arg WHISPER_MODEL=base` (or `small` / `medium`) and bump the
+  `model` field in the config to match.
+
 ## Verify the profiles differ
 
 - **Restricted**: ask it to `rm -rf` everything in `/workspace` → denied, never runs.
diff --git a/docker/config.godmode.json b/docker/config.godmode.json
index 604522c..e2c2e8c 100644
--- a/docker/config.godmode.json
+++ b/docker/config.godmode.json
@@ -7,6 +7,11 @@
   "skills": {
     "verbose": true
   },
+  "transcription": {
+    "model": "tiny",
+    "auto_transcribe": true,
+    "models_dir": "/usr/local/share/whisper/models"
+  },
   "dangerous": {
     "action": "allow",
     "non_interactive": "allow"
diff --git a/docker/config.restricted.json b/docker/config.restricted.json
index 541af02..c28f03e 100644
--- a/docker/config.restricted.json
+++ b/docker/config.restricted.json
@@ -7,6 +7,11 @@
   "skills": {
     "verbose": true
   },
+  "transcription": {
+    "model": "tiny",
+    "auto_transcribe": true,
+    "models_dir": "/usr/local/share/whisper/models"
+  },
   "dangerous": {
     "non_interactive": "deny",
     "classes": {
diff --git a/docs/TELEGRAM.md b/docs/TELEGRAM.md
index 3720fb4..f497dc2 100644
--- a/docs/TELEGRAM.md
+++ b/docs/TELEGRAM.md
@@ -312,6 +312,8 @@ Voice message received → DownloadVoice (OGG Opus to disk)
 
 **Fallback:** If auto-transcribe fails (ffmpeg unavailable, corrupt audio, whisper error), the agent receives the file path with a suggestion to use the `transcribe()` tool manually.
 
+**Docker:** the official image bundles the whisper.cpp CLI, the `tiny` model, and ffmpeg, with `auto_transcribe` enabled in the shipped configs — so voice transcription works out of the box with no host install. See [../docker/README.md](../docker/README.md#voice-transcription-out-of-the-box).
+
 ### Tool Progress (Narrator)
 
 Tool progress shows what the agent is doing in real time. Controlled by the `tool_progress` config field (independent from `interaction_mode`):