Skip to content

Commit fd7d9e6

Browse files
juntao and claude
committed
Fix MLX 1-token output, tracing to stderr, and shallow_clone perf
- Fix ConvSubsampling NHWC flatten order: transpose (T',F,C) → (T',C,F) before flattening so the linear projection receives features in the same order as PyTorch's NCHW layout. This was the root cause of the model generating only 1 token on the MLX backend.

- Direct tracing output to stderr in both CLI and server binaries so transcript text on stdout is not contaminated by log lines.

- Replace shallow_clone() CPU round-trip (to_vec_f32 + from_data_f32) with mlx_array_set() for O(1) ref-counted sharing, eliminating the ~75s encoder construction overhead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 21a2bee commit fd7d9e6

File tree

4 files changed

+11
-12
lines changed

4 files changed

+11
-12
lines changed

src/bin/server.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ async fn main() -> Result<()> {
535535
_ => "trace",
536536
};
537537
tracing_subscriber::fmt()
538+
.with_writer(std::io::stderr)
538539
.with_env_filter(
539540
tracing_subscriber::EnvFilter::try_from_default_env()
540541
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(log_level)),

src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ fn main() -> Result<()> {
6060
_ => "trace",
6161
};
6262
tracing_subscriber::fmt()
63+
.with_writer(std::io::stderr)
6364
.with_env_filter(
6465
tracing_subscriber::EnvFilter::try_from_default_env()
6566
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(log_level)),

src/mlx/array.rs

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,17 +165,12 @@ impl Drop for Array {
165165
// C API. Provide an explicit method instead of implementing Clone to avoid
166166
// accidental copies.
167167
impl Array {
168-
/// Shallow copy — wraps the same storage.
169-
/// The caller is responsible for ensuring the original outlives the copy
170-
/// (or that eval has been called, materialising the data).
171-
///
172-
/// TODO: use mlx_array_retain if/when available in mlx-c.
168+
/// Shallow copy — uses `mlx_array_set` to share the underlying storage
169+
/// with reference counting. This is O(1) and avoids the expensive CPU
170+
/// round-trip that the previous `to_vec_f32` + `from_data_f32` approach used.
173171
pub fn shallow_clone(&self) -> Self {
174-
// Re-create from data to guarantee independent ownership.
175-
// This is the safe fallback — the eval round-trip is acceptable for
176-
// weight tensors that are only cloned once at load time.
177-
let data = self.to_vec_f32();
178-
let shape = self.shape();
179-
Self::from_data_f32(&data, &shape)
172+
let mut new = Self::empty();
173+
unsafe { ffi::mlx_array_set(&mut new.ptr, self.ptr) };
174+
new
180175
}
181176
}

src/mlx/encoder.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ impl ConvSubsampling {
9797
let x = add_bias_nhwc(&x, &self.c6_b);
9898
let x = ops::relu(&x);
9999

100-
// x: (1, T', n_mels/8, 256) → (1, T', 256 * n_mels/8)
100+
// x: (1, T', n_mels/8, 256) in NHWC — transpose to match PyTorch's
101+
// NCHW flatten order: (1, T', 256, n_mels/8) → (1, T', 256*n_mels/8)
102+
let x = ops::transpose(&x, &[0, 1, 3, 2]);
101103
let t_prime = x.dim(1);
102104
let feat = x.dim(2) * x.dim(3);
103105
let x = ops::reshape(&x, &[1, t_prime, feat]);

0 commit comments

Comments (0)