Skip to content

Commit 1feb254

Browse files
juntao and claude committed
MLX: GPU-side argmax in decode loop, avoid 64KB logits transfer per step
decoder.step() now returns the argmax token ID (computed on GPU via mlx_argmax_axis) instead of the full 16,384-element logits vector. This eliminates a to_vec_f32() CPU round-trip per decode step, keeping the full decoder graph (8 layers + head + argmax) as one fused Metal dispatch. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0522513 commit 1feb254

File tree

3 files changed

+34
-22
lines changed

3 files changed

+34
-22
lines changed

CLAUDE.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,25 @@ release zip so users need zero configuration:
463463
- **vocab.json** is generated once in a separate job and included in every platform zip,
464464
so users only need to copy it into their model directory.
465465

466-
### 18. MLX weight count differs from tch (2104 vs 2152)
466+
### 18. MLX eval() placement and GPU-side argmax
467+
468+
MLX lazy evaluation builds a computation graph that should be evaluated at outer loop
469+
boundaries, not per-layer. Our encoder correctly runs all 48 conformer layers as one lazy
470+
graph with a single `eval()` after. The decoder runs 8 layers per step — also fine.
471+
472+
The decode loop originally called `to_vec_f32()` on the logits (shape: 16,384) at every
473+
step to perform argmax on the CPU. This transferred 64 KB per token and broke the lazy graph.
474+
475+
**Fix:** use `Array::argmax_flat()` which calls `mlx_argmax_axis` on GPU and transfers a
476+
single i32 to CPU. The full graph (8 decoder layers + layer norm + linear head + argmax) is
477+
now evaluated as one fused Metal dispatch per step.
478+
479+
**Rule of thumb:**
480+
- `eval()` after encoder forward (1 call)
481+
- `argmax_flat()` after each decoder step (1 call per token, transfers 4 bytes not 64 KB)
482+
- Never `eval()` or `to_vec_f32()` per-layer or mid-graph
483+
484+
### 19. MLX weight count differs from tch (2104 vs 2152)
467485

468486
The MLX weight loader skips `num_batches_tracked` tensors (I64 dtype, used only during
469487
PyTorch training). This results in 2104 loaded tensors vs 2152 for the tch backend

src/mlx/decoder.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -300,14 +300,16 @@ impl TransformerDecoder {
300300

301301
/// One greedy-decoding step.
302302
///
303-
/// Returns (logits: Vec<f32> of shape vocab_size, updated self_kv_cache).
303+
/// Returns (next_token_id, updated self_kv_cache).
304+
/// Argmax is computed on GPU — only a single i32 is transferred to CPU,
305+
/// avoiding a 16,384-element logits transfer per step.
304306
pub fn step(
305307
&self,
306308
token_id: i32,
307309
position: i32,
308310
self_kv_cache: &[(Option<Array>, Option<Array>)],
309311
cross_kv: &[(Array, Array)],
310-
) -> (Vec<f32>, Vec<(Option<Array>, Option<Array>)>) {
312+
) -> (i32, Vec<(Option<Array>, Option<Array>)>) {
311313
// Token embedding lookup
312314
let idx = Array::from_slice_i32(&[token_id]);
313315
let emb = ops::take(&self.token_emb, &idx, 0); // (1, hidden)
@@ -339,8 +341,10 @@ impl TransformerDecoder {
339341
let hidden = ops::squeeze(&hidden, &[1]);
340342
let logits = ops::linear(&hidden, &self.head_w, &self.head_b); // (1, vocab)
341343
let logits = ops::squeeze(&logits, &[0]); // (vocab,)
342-
let logits_vec = logits.to_vec_f32();
343344

344-
(logits_vec, new_kv)
345+
// Argmax on GPU — transfers a single i32 instead of 16,384 floats
346+
let next_token = logits.argmax_flat() as i32;
347+
348+
(next_token, new_kv)
345349
}
346350
}

src/mlx/inference.rs

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,19 +38,19 @@ pub fn transcribe(
3838
(0..decoder.layers.len()).map(|_| (None, None)).collect();
3939

4040
// 5. Prime decoder with prompt tokens
41-
let mut last_logits: Vec<f32> = Vec::new();
41+
let mut next_token = 0i32;
4242
for (i, &token_id) in prompt.iter().enumerate() {
43-
let (logits, new_kv) = decoder.step(token_id as i32, i as i32, &self_kv_cache, &cross_kv);
43+
let (token, new_kv) = decoder.step(token_id as i32, i as i32, &self_kv_cache, &cross_kv);
4444
self_kv_cache = new_kv;
45-
last_logits = logits;
45+
next_token = token;
4646
}
4747

4848
// 6. Greedy decode until EOS or max_new_tokens
49+
// Argmax is computed on GPU inside decoder.step() — only a single i32
50+
// is transferred per step instead of the full 16,384-element logits vector.
4951
let eos_id = tokenizer.special.eos as i32;
5052
let nospeech_id = tokenizer.special.nospeech as i32;
5153
let mut generated: Vec<i64> = Vec::new();
52-
53-
let mut next_token = argmax(&last_logits) as i32;
5454
let mut position = n_prompt as i32;
5555

5656
while generated.len() < max_new_tokens {
@@ -59,24 +59,14 @@ pub fn transcribe(
5959
}
6060
generated.push(next_token as i64);
6161

62-
let (logits, new_kv) = decoder.step(next_token, position, &self_kv_cache, &cross_kv);
62+
let (token, new_kv) = decoder.step(next_token, position, &self_kv_cache, &cross_kv);
6363
self_kv_cache = new_kv;
64-
last_logits = logits;
64+
next_token = token;
6565
position += 1;
66-
next_token = argmax(&last_logits) as i32;
6766
}
6867

6968
tracing::debug!("Generated token IDs: {:?}", generated);
7069

7170
// 7. Decode tokens to text
7271
Ok(tokenizer.decode(&generated))
7372
}
74-
75-
fn argmax(logits: &[f32]) -> usize {
76-
logits
77-
.iter()
78-
.enumerate()
79-
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
80-
.map(|(i, _)| i)
81-
.unwrap_or(0)
82-
}

0 commit comments

Comments (0)