Merged
17 changes: 9 additions & 8 deletions README.md
@@ -24,7 +24,7 @@ graph LR
- The user opens the Unmute website, served by the **frontend**.
- By clicking "connect", the user establishes a websocket connection to the **backend**, sending audio and other metadata back and forth in real time.
- The backend connects via websocket to the **speech-to-text** server, sending it the audio from the user and receiving back the transcription in real time.
-- Once the speech-to-text detects that the user has stopped speaking and it's time to generate a response, the backend connects to an **LLM** server to retrieve the response. We host our own LLM using [VLLM](https://github.com/vllm-project/vllm), but you could also use an external API like OpenAI or Mistral.
+- Once the speech-to-text detects that the user has stopped speaking and it's time to generate a response, the backend connects to an **LLM** server to retrieve the response. We serve the LLM using [OpenRouter](https://openrouter.ai/), but you can also host your own using [VLLM](https://github.com/vllm-project/vllm).
- As the response is being generated, the backend feeds it to the **text-to-speech** server to read it out loud, and forwards the generated speech to the user.
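The turn-taking loop described above can be sketched as a small piece of async plumbing. This is an illustrative sketch only, not Unmute's actual backend code: `respond`, `stream_llm`, and `speak` are hypothetical names standing in for the real websocket handlers.

```python
import asyncio
from typing import AsyncIterator, Awaitable, Callable

async def respond(
    transcript: str,
    stream_llm: Callable[[str], AsyncIterator[str]],
    speak: Callable[[str], Awaitable[None]],
) -> str:
    """Stream LLM deltas and forward each one to text-to-speech immediately,
    so speech synthesis starts before the full response exists."""
    spoken: list[str] = []
    async for delta in stream_llm(transcript):
        spoken.append(delta)
        await speak(delta)  # hand the fragment to the TTS server right away
    return "".join(spoken)

# Tiny fake LLM stream so the plumbing can run without any server.
async def fake_llm(prompt: str) -> AsyncIterator[str]:
    for word in ["Hello", " there", "!"]:
        yield word

async def main() -> None:
    async def speak(fragment: str) -> None:
        pass  # a real backend would synthesize and forward audio here
    print(await respond("Hi, who are you?", fake_llm, speak))  # Hello there!

asyncio.run(main())
```

The point of the structure is that `speak` is called per delta rather than once at the end, which is what keeps perceived latency low.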

## Setup
@@ -54,8 +54,9 @@ While we support deploying with Docker compose and without Docker, the Docker Sw
### LLM access on Hugging Face Hub

You can use any LLM you want.
-By default, Unmute uses [Mistral Small 3.2 24B](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506) as the LLM.
-([Gemma 3 12B](https://huggingface.co/google/gemma-3-12b-it) is also a good choice.)
+In production, we use GPT OSS 120B served over OpenRouter.
+In the default local setup (Docker Compose/Dockerless), Unmute uses [Gemma 3 1B](https://huggingface.co/google/gemma-3-1b-it) as the LLM.

This model is freely available, but you must accept its conditions of use to access it:

1. Create a Hugging Face account.
@@ -73,7 +74,7 @@ To make sure the NVIDIA Container Toolkit is installed correctly, run:
```
sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
```

-If you use [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B),
+If you use [google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it),
the default in `docker-compose.yml`, 16GB of GPU memory is sufficient.
If you're running into memory issues, open `docker-compose.yml` and look for `NOTE:` comments to see places that you might need to adjust.

@@ -247,16 +248,16 @@ with
```yaml
extra_hosts:
- "host.docker.internal:host-gateway"
```
-This points to your localhost server. Alternatively, for OpenAI, you can use
+This points to your localhost server. Alternatively, to use an OpenAI-compatible server such as [OpenRouter](https://openrouter.ai/), you can use
```yaml
backend:
image: unmute-backend:latest
[..]
environment:
[..]
-- KYUTAI_LLM_URL=https://api.openai.com/v1
-- KYUTAI_LLM_MODEL=gpt-4.1
-- KYUTAI_LLM_API_KEY=sk-..
+- KYUTAI_LLM_URL=https://openrouter.ai/api
+- KYUTAI_LLM_MODEL=google/gemma-3-12b-it # or whatever
+- KYUTAI_LLM_API_KEY=sk-.. # your OpenRouter key
```
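As a rough illustration of how these three variables line up with an OpenAI-compatible client: OpenRouter serves its OpenAI-compatible endpoints under the `/v1` prefix, so the client presumably derives its base URL from `KYUTAI_LLM_URL`. The `llm_client_kwargs` helper below is invented for illustration; the exact URL handling inside Unmute may differ.

```python
def llm_client_kwargs(env: dict[str, str]) -> dict[str, str]:
    """Map the KYUTAI_LLM_* variables onto kwargs for an OpenAI-compatible
    client, e.g. openai.OpenAI(**kwargs). Illustrative sketch only."""
    base_url = env["KYUTAI_LLM_URL"].rstrip("/") + "/v1"
    return {"base_url": base_url, "api_key": env["KYUTAI_LLM_API_KEY"]}

kwargs = llm_client_kwargs({
    "KYUTAI_LLM_URL": "https://openrouter.ai/api",
    "KYUTAI_LLM_API_KEY": "sk-..",
})
print(kwargs["base_url"])  # https://openrouter.ai/api/v1
```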

The section for vllm can then be removed, as it is no longer needed:
8 changes: 4 additions & 4 deletions bake_deploy_prod.sh
@@ -22,14 +22,14 @@ if [[ -n $(git status --porcelain) ]]; then
fi
fi

-set -x # Print commands

export DOMAIN=unmute.sh
-# Note that using non-Mistral models also requires changing the vLLM args in ./swarm-deploy.yml
-export KYUTAI_LLM_MODEL=google/gemma-3-12b-it
export DOCKER_HOST=ssh://root@${DOMAIN}
+export KYUTAI_LLM_MODEL=@preset/gpt-oss-120b # The name of our Openrouter preset
+export KYUTAI_LLM_API_KEY=$OPENROUTER_API_KEY_UNMUTE

echo "If you get an connection error, do: ssh root@${DOMAIN}"

+set -x # Print the deployment commands

docker buildx bake -f ./swarm-deploy.yml --allow=ssh --push
docker stack deploy --with-registry-auth --prune --compose-file ./swarm-deploy.yml llm-wrapper
14 changes: 0 additions & 14 deletions bake_deploy_staging.sh

This file was deleted.

4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -103,8 +103,8 @@ services:
command:
[
# NOTE: Change the LLM here if you want.
-# (caution: gemma-3-1b-it also exists but it's slow on vLLM: https://github.com/vllm-project/vllm/issues/19575)
-"--model=meta-llama/Llama-3.2-1B-Instruct",
+# You can also use an external LLM provider instead of self-hosting, see README.
+"--model=google/gemma-3-1b-it",
# NOTE: You can adapt this based on your GPU memory.
# A higher value takes more memory but supports longer conversations.
"--max-model-len=1536",
2 changes: 1 addition & 1 deletion frontend/src/app/UnmuteHeader.tsx
@@ -71,7 +71,7 @@ const UnmuteHeader = () => {
<div className="flex flex-col gap-3">
<p>
This is a cascaded system made by Kyutai: our speech-to-text
-transcribes what you say, an LLM (we use Gemma 3 12B)
+transcribes what you say, an LLM (we use GPT OSS 120B)
generates the text of the response, and we then use our
text-to-speech model to say it out loud.
</p>
77 changes: 10 additions & 67 deletions swarm-deploy.yml
@@ -1,6 +1,6 @@
services:
traefik:
-image: traefik:v3.3.1
+image: traefik:v3.6.9
command:
# Swarm provider configuration
- "--providers.swarm.endpoint=unix:///var/run/docker.sock"
@@ -63,7 +63,7 @@ services:
deploy:
# Having more than one replica is useful for scaling but also to avoid downtime
# during crashes or updates. Traffic will be load balanced between replicas.
-replicas: 5
+replicas: 3
update_config:
delay: 10s
labels:
@@ -88,7 +88,8 @@
# for a given service, allowing manual load balancing. The backend does this currently.
- KYUTAI_STT_URL=ws://tasks.stt:8080
- KYUTAI_TTS_URL=ws://tasks.tts:8080
-- KYUTAI_LLM_URL=http://llm:8000
+- KYUTAI_LLM_URL=https://openrouter.ai/api
+- KYUTAI_LLM_API_KEY=$KYUTAI_LLM_API_KEY
- KYUTAI_VOICE_CLONING_URL=http://voice-cloning:8080
- KYUTAI_REDIS_URL=redis://redis:6379
- KYUTAI_VOICE_DONATION_DIR=/voice-donation
@@ -113,7 +114,10 @@ services:
- "traefik.http.services.backend.loadbalancer.server.port=80"
- "traefik.http.routers.backend.priority=100" # higher priority than frontend
- "prometheus-port=80"
-replicas: 16
+# Running the backend is quite cheap but Python is not great at multi-threading
+# so we can run a few to get better performance. The downside is logs are harder to read;
+# for debugging set replicas to 1.
+replicas: 4
update_config:
delay: 10s # wait 10 seconds before updating the next replica
parallelism: 3 # update 3 replicas at a time
@@ -164,20 +168,13 @@ services:
- "traefik.http.routers.tts.tls.certresolver=letsencrypt_resolver"
- "traefik.http.services.tts.loadbalancer.server.port=8080"
- "traefik.http.routers.tts.priority=100"
-replicas: 3
+replicas: 1
update_config:
delay: 60s # it takes a very long time to boot up and we want no downtime
resources:
limits:
cpus: "8"
memory: 16G
-# This is how to reserve a GPU for the service in swarm. We can ask multiple GPUs
-# for a single container but we never needed to.
-reservations:
-generic_resources:
-- discrete_resource_spec:
-kind: gpu
-value: 1

stt:
image: rg.fr-par.scw.cloud/namespace-unruffled-tereshkova/${DOMAIN}-moshi-server:latest
@@ -212,11 +209,6 @@
limits:
cpus: "8"
memory: 16G
-reservations:
-generic_resources:
-- discrete_resource_spec:
-kind: gpu
-value: 1

voice-cloning:
image: rg.fr-par.scw.cloud/namespace-unruffled-tereshkova/${DOMAIN}-moshi-server:latest
@@ -233,63 +225,14 @@ deploy:
deploy:
labels:
- "prometheus-port=8080"
-replicas: 2
+replicas: 1
update_config:
delay: 60s #it takes a very long time to boot up and we want no downtime
resources:
limits:
cpus: "8"
memory: 16G

-llm:
-image: vllm/vllm-openai:v0.11.0
-command:
-[
-"--model=${KYUTAI_LLM_MODEL}",
-"--max-model-len=8192",
-"--dtype=bfloat16",
-"--tensor-parallel-size=1", # >1 needed to run models that don't fit on a single GPU
-]
-healthcheck:
-# The very first time it can be VERY slow, because of the download
-# and compilation. We don't care about healthcheck failures during that time.
-# But if the healthcheck succeeds once (even before the end of the start period),
-# it will be considered healthy and the service will be available.
-start_period: 10m
-test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
-volumes:
-- "huggingface-cache:/root/.cache/huggingface"
-# This is where vLLM stores its cache, we want to keep it across restarts
-# to avoid recompiling the model every time.
-- vllm-cache:/root/.cache/vllm
-environment:
-- HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN
-deploy:
-labels:
-- "prometheus-port=8000"
-# Expose the LLM service via Traefik under the /llm-server path
-- "traefik.enable=true"
-- "traefik.http.routers.llm.rule=(Host(`www.${DOMAIN}`) || Host(`${DOMAIN}`)) && PathPrefix(`/llm-server`)"
-- "traefik.http.routers.llm.middlewares=strip-llm"
-- "traefik.http.middlewares.strip-llm.replacepathregex.regex=^/llm-server/(.*)"
-- "traefik.http.middlewares.strip-llm.replacepathregex.replacement=/$$1"
-- "traefik.http.routers.llm.entrypoints=websecure"
-- "traefik.http.routers.llm.tls=true"
-- "traefik.http.routers.llm.tls.certresolver=letsencrypt_resolver"
-- "traefik.http.services.llm.loadbalancer.server.port=8000"
-- "traefik.http.routers.llm.priority=100"
-# 2 containers are used, 1 gpu per container, and the requests end up being load balanced
-# between them. There is no "smart" routing but it's enough for our use case.
-replicas: 2
-update_config:
-delay: 120s # it takes a very long time to boot up and we want no downtime
-resources:
-reservations:
-generic_resources:
-- discrete_resource_spec:
-kind: gpu
-value: 1 # put more if needed

# -------------------------------------------------------------------------
# Monitoring

4 changes: 4 additions & 0 deletions unmute/llm/llm_utils.py
@@ -149,6 +149,10 @@ async def chat_completion(

async with stream:
async for chunk in stream:
+if len(chunk.choices) == 0:
+    # OpenRouter sometimes does this, some kind of keep-alive chunk with no content. Just ignore it.
+    continue

chunk_content = chunk.choices[0].delta.content

if not chunk_content:
3 changes: 3 additions & 0 deletions unmute/llm/system_prompt.py
@@ -90,6 +90,9 @@

def get_readable_llm_name():
model = autoselect_model()
+# Remove anything before the last slash, if present. The convention is often
+# "model-creator/model-name", or for openrouter "@preset/preset-name".
+model = model.split("/")[-1]
return model.replace("-", " ").replace("_", " ")


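For reference, the renaming logic in this hunk behaves like the standalone reimplementation below (copied from the diff, not imported from the repo):

```python
def readable_llm_name(model: str) -> str:
    # Drop any "creator/" or "@preset/" prefix, then replace dashes and
    # underscores with spaces to get a human-readable name.
    model = model.split("/")[-1]
    return model.replace("-", " ").replace("_", " ")

print(readable_llm_name("@preset/gpt-oss-120b"))   # gpt oss 120b
print(readable_llm_name("google/gemma-3-12b-it"))  # gemma 3 12b it
```

This is why the frontend can display "GPT OSS 120B" even though the configured model name is the OpenRouter preset `@preset/gpt-oss-120b`.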