feat: Keep ssm in f16 until output on SSD code path

gabe-l-hart · gabe-l-hart · commit 426a97c8dec2 · 2025-11-05T11:47:34.000-07:00
Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart &lt;ghart@us.ibm.com&gt;
diff --git a/src/models/graph-context-mamba.cpp b/src/models/graph-context-mamba.cpp
@@ -242,7 +242,6 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
         //  while avoiding to make unnecessary copies of the states)
         auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
             ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
-            ssm = ggml_cast(ctx, ssm, GGML_TYPE_F32);
 
             // Empty y that will be extended with each chunk of tokens
             ggml_tensor * y = ggml_new_tensor_4d(ctx, x->type, x->ne[0], x->ne[1], 0, x->ne[3]);
@@ -252,6 +251,7 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
                 //DEBUG
                 LLAMA_LOG_DEBUG("build_mamba2_layer(layer %d): single-token update\n", il);
                 // If single-token, use ssm_scan op
+                ssm = ggml_cast(ctx, ssm, GGML_TYPE_F32);
                 return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
             } else {
                 //DEBUG
@@ -362,6 +362,9 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
 
                     // step 8: compute next_state
                     ggml_tensor * next_state = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_permute(ctx, B_perm, 1, 0, 2, 3)), dtxdecay);
+                    if (next_state->type != ssm->type) {
+                        next_state = ggml_cast(ctx, next_state, ssm->type);
+                    }
                     cb(next_state, "next_state", il);
 
                     // TODO: Skip y and state updates if no previous state
@@ -395,6 +398,9 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
                 }
 
                 // Concat the output y and state
+                if (ssm->type != y->type) {
+                    ssm = ggml_cast(ctx, ssm, y->type);
+                }
                 ggml_tensor * out = ggml_concat(ctx,
                     ggml_view_1d(ctx, y, ggml_nelements(y), 0),
                     ggml_view_1d(ctx, ssm, ggml_nelements(ssm), 0),