Skip to content

Commit 6733bda

Browse files
committed
feat: Remove sub-ubatch batching
Unlike Qwen3Next, we don't hit big complexity scaling issues here, so removing all of the batching gives a big reduction in complexity and a big boost to performance! Branch: Mamba2SSD Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 426a97c commit 6733bda

File tree

1 file changed

+72
-113
lines changed

1 file changed

+72
-113
lines changed

src/models/graph-context-mamba.cpp

Lines changed: 72 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,6 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
243243
auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
244244
ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
245245

246-
// Empty y that will be extended with each chunk of tokens
247-
ggml_tensor * y = ggml_new_tensor_4d(ctx, x->type, x->ne[0], x->ne[1], 0, x->ne[3]);
248-
249246
if (n_seq_tokens == 1) {
250247
// if (true) {
251248
//DEBUG
@@ -259,9 +256,6 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
259256

260257
// otherwise, use the SSD formulation
261258

262-
// TODO: make this configurable
263-
const uint32_t chunk_size = 256;
264-
265259
// extract the state(s) for the sequences identified by ids
266260
if (ssm->ne[3] != ids->ne[0]) {
267261
ggml_tensor * ssm_perm = ggml_permute(ctx, ssm, 0, 2, 3, 1); // put the target dim in dim 1
@@ -287,115 +281,80 @@ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * i
287281
ggml_tensor * dtX = ggml_mul(ctx, x, ggml_reshape_4d(ctx, dt_softplus, 1, dt_softplus->ne[0], dt_softplus->ne[1], dt_softplus->ne[2])); // {head_dim, n_head, n_seq_tokens, n_seqs}
288282
cb(dtX, "dtX", il);
289283

290-
// loop over all chunks
284+
285+
// step 3: compute CB
291286
uint32_t repeats = n_head / n_group;
292-
for (auto chunk_i = 0; chunk_i < n_seq_tokens; chunk_i += chunk_size) {
293-
294-
// chunk views
295-
const auto chunk_size_i = std::min(chunk_size, uint32_t(n_seq_tokens - chunk_i));
296-
// slice dtA on dim 1
297-
ggml_tensor * dtA_chunk = ggml_view_3d(ctx, dtA,
298-
dtA->ne[0], chunk_size_i, dtA->ne[2],
299-
dtA->nb[1], dtA->nb[2],
300-
chunk_i * dtA->nb[1]);
301-
cb(dtA_chunk, "dtA_chunk", il);
302-
// slice dtX on dim 2
303-
ggml_tensor * dtX_chunk = ggml_view_4d(ctx, dtX,
304-
dtX->ne[0], dtX->ne[1], chunk_size_i, dtX->ne[3],
305-
dtX->nb[1], dtX->nb[2], dtX->nb[3],
306-
chunk_i * dtX->nb[2]);
307-
cb(dtX_chunk, "dtX_chunk", il);
308-
// slice B on dim 2
309-
ggml_tensor * B_chunk = ggml_view_4d(ctx, B,
310-
B->ne[0], B->ne[1], chunk_size_i, B->ne[3],
311-
B->nb[1], B->nb[2], B->nb[3],
312-
chunk_i * B->nb[2]);
313-
cb(B_chunk, "B_chunk", il);
314-
// slice C on dim 2
315-
ggml_tensor * C_chunk = ggml_view_4d(ctx, C,
316-
C->ne[0], C->ne[1], chunk_size_i, C->ne[3],
317-
C->nb[1], C->nb[2], C->nb[3],
318-
chunk_i * C->nb[2]);
319-
cb(C_chunk, "C_chunk", il);
320-
321-
// step 3: compute CB
322-
ggml_tensor * C_perm = ggml_permute(ctx, C_chunk, 0, 2, 1, 3); // {d_state, n_seq_tokens, n_group, n_seqs}
323-
ggml_tensor * B_perm = ggml_permute(ctx, B_chunk, 0, 2, 1, 3); // {d_state, n_seq_tokens, n_group, n_seqs}
324-
ggml_tensor * CB = ggml_mul_mat(ctx, B_perm, C_perm); // {n_seq_tokens, n_seq_tokens, n_group, n_seqs}
325-
CB = ggml_repeat_4d(ctx, CB, CB->ne[0], CB->ne[1], CB->ne[2] * repeats, CB->ne[3]); // {n_seq_tokens, n_seq_tokens, n_head (repeats * n_group), n_seqs}
326-
cb(CB, "CB", il);
327-
328-
// step 4: compute decay
329-
ggml_tensor * dtA_tmp0 = ggml_repeat_4d(ctx, dtA_chunk,
330-
dtA_chunk->ne[0], dtA_chunk->ne[1], dtA_chunk->ne[2], dtA_chunk->ne[3] * chunk_size_i); // {n_head, chunk_size_i_0, n_seqs, chunk_size_i_1}
331-
ggml_tensor * dtA_tmp1 = ggml_tri_dims(ctx, dtA_tmp0, nan(""), GGML_TRI_TYPE_LOWER, 3, 1); // {n_head, chunk_size_i_0, n_seqs, chunk_size_i_1}
332-
ggml_tensor * segsum = ggml_cumsum(ctx, dtA_tmp1, 1); // {n_head, chunk_size_i_0, n_seqs, chunk_size_i_1}
333-
cb(segsum, "segsum", il);
334-
ggml_tensor * decay = ggml_exp(ctx, segsum); // {n_head, chunk_size_i_0, n_seqs, chunk_size_i_1}
335-
decay = ggml_permute(ctx, decay, 2, 1, 3, 0); // {chunk_size_i_1, chunk_size_i_0, n_head, n_seqs}
336-
cb(decay, "decay", il);
337-
338-
// step 5: compute surrogate_attention_matrix
339-
ggml_tensor * CBdecay = ggml_mul(ctx, CB, ggml_cont(ctx, decay));
340-
ggml_tensor * surrogate_attention_matrix = ggml_tri_keep(ctx, CBdecay, GGML_TRI_TYPE_LOWER_DIAG);
341-
cb(surrogate_attention_matrix, "surrogate_attention_matrix", il);
342-
343-
// step 6: compute y
344-
ggml_tensor * dtX_chunk_perm = ggml_cont(ctx, ggml_permute(ctx, dtX_chunk, 1, 2, 0, 3));
345-
ggml_tensor * y_chunk = ggml_mul_mat(ctx, dtX_chunk_perm, surrogate_attention_matrix);
346-
y_chunk = ggml_cont(ctx, ggml_permute(ctx, y_chunk, 0, 2, 1, 3));
347-
cb(y_chunk, "y_chunk", il);
348-
349-
// step 7: compute dtxdecay
350-
ggml_tensor * decay_last = ggml_view_4d(ctx, decay,
351-
decay->ne[0], 1, decay->ne[2], decay->ne[3],
352-
decay->nb[1], decay->nb[2], decay->nb[3],
353-
(decay->ne[1] - 1) * decay->nb[1]);
354-
decay_last = ggml_cont(ctx, ggml_permute(ctx, decay_last, 2, 0, 1, 3));
355-
cb(decay_last, "decay_last", il);
356-
B_perm = ggml_cont(ctx, B_perm);
357-
B_perm = ggml_repeat_4d(ctx, B_perm,
358-
B_perm->ne[0], B_perm->ne[1], B_perm->ne[2] * repeats, B_perm->ne[3]);
359-
ggml_tensor * dtxdecay = ggml_mul(ctx, dtX_chunk, decay_last);
360-
dtxdecay = ggml_cont(ctx, ggml_permute(ctx, dtxdecay, 1, 2, 0, 3));
361-
cb(dtxdecay, "dtxdecay", il);
362-
363-
// step 8: compute next_state
364-
ggml_tensor * next_state = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_permute(ctx, B_perm, 1, 0, 2, 3)), dtxdecay);
365-
if (next_state->type != ssm->type) {
366-
next_state = ggml_cast(ctx, next_state, ssm->type);
367-
}
368-
cb(next_state, "next_state", il);
369-
370-
// TODO: Skip y and state updates if no previous state
371-
372-
// step 9: update from previous state
373-
ggml_tensor * exp_dtA_cumsum = ggml_exp(ctx, ggml_cumsum(ctx, dtA_chunk, 1)); // {n_head, chunk_size_i, n_seqs}
374-
cb(exp_dtA_cumsum, "exp_dtA_cumsum", il);
375-
ggml_tensor * exp_dtA_cumsum_last = ggml_view_4d(ctx, exp_dtA_cumsum,
376-
exp_dtA_cumsum->ne[0], 1, exp_dtA_cumsum->ne[2], exp_dtA_cumsum->ne[3],
377-
exp_dtA_cumsum->nb[1], exp_dtA_cumsum->nb[2], exp_dtA_cumsum->nb[3],
378-
(exp_dtA_cumsum->ne[1] - 1) * exp_dtA_cumsum->nb[1]); // {n_head, 1, n_seqs}
379-
cb(exp_dtA_cumsum_last, "exp_dtA_cumsum_last", il);
380-
ggml_tensor * exp_dtA_cumsum_perm = ggml_permute(ctx, exp_dtA_cumsum_last, 2, 1, 3, 0); // {1, 1, n_head, n_seqs}
381-
next_state = ggml_add(ctx, next_state, ggml_mul(ctx, ssm, ggml_cont(ctx, exp_dtA_cumsum_perm)));
382-
cb(next_state, "next_state_updated", il);
383-
384-
// step 10: update from previous y
385-
ggml_tensor * y_prev = ggml_mul_mat(ctx, ggml_permute(ctx, C_chunk, 0, 2, 1, 3), ssm);
386-
cb(y_prev, "y_prev", il);
387-
y_prev = ggml_mul(ctx,
388-
ggml_cont(ctx, ggml_permute(ctx, y_prev, 2, 0, 1, 3)),
389-
ggml_cont(ctx, ggml_permute(ctx, exp_dtA_cumsum, 1, 2, 3, 0)));
390-
cb(y_prev, "y_prev_mul", il);
391-
y_chunk = ggml_add(ctx, y_chunk, y_prev);
392-
cb(y_chunk, "y_chunk_updated", il);
393-
394-
// step 11: recurse
395-
y = ggml_concat(ctx, y, y_chunk, 2);
396-
cb(y, "y", il);
397-
ssm = next_state;
287+
ggml_tensor * C_perm = ggml_permute(ctx, C, 0, 2, 1, 3); // {d_state, n_seq_tokens, n_group, n_seqs}
288+
ggml_tensor * B_perm = ggml_permute(ctx, B, 0, 2, 1, 3); // {d_state, n_seq_tokens, n_group, n_seqs}
289+
ggml_tensor * CB = ggml_mul_mat(ctx, B_perm, C_perm); // {n_seq_tokens, n_seq_tokens, n_group, n_seqs}
290+
CB = ggml_repeat_4d(ctx, CB, CB->ne[0], CB->ne[1], CB->ne[2] * repeats, CB->ne[3]); // {n_seq_tokens, n_seq_tokens, n_head (repeats * n_group), n_seqs}
291+
cb(CB, "CB", il);
292+
293+
// step 4: compute decay
294+
ggml_tensor * dtA_tmp0 = ggml_repeat_4d(ctx, dtA,
295+
dtA->ne[0], dtA->ne[1], dtA->ne[2], dtA->ne[3] * n_seq_tokens); // {n_head, n_seq_tokens_0, n_seqs, n_seq_tokens_1}
296+
ggml_tensor * dtA_tmp1 = ggml_tri_dims(ctx, dtA_tmp0, nan(""), GGML_TRI_TYPE_LOWER, 3, 1); // {n_head, n_seq_tokens_0, n_seqs, n_seq_tokens_1}
297+
ggml_tensor * segsum = ggml_cumsum(ctx, dtA_tmp1, 1); // {n_head, n_seq_tokens_0, n_seqs, n_seq_tokens_1}
298+
cb(segsum, "segsum", il);
299+
ggml_tensor * decay = ggml_exp(ctx, segsum); // {n_head, n_seq_tokens_0, n_seqs, n_seq_tokens_1}
300+
decay = ggml_permute(ctx, decay, 2, 1, 3, 0); // {n_seq_tokens_1, n_seq_tokens_0, n_head, n_seqs}
301+
cb(decay, "decay", il);
302+
303+
// step 5: compute surrogate_attention_matrix
304+
ggml_tensor * CBdecay = ggml_mul(ctx, CB, ggml_cont(ctx, decay));
305+
ggml_tensor * surrogate_attention_matrix = ggml_tri_keep(ctx, CBdecay, GGML_TRI_TYPE_LOWER_DIAG);
306+
cb(surrogate_attention_matrix, "surrogate_attention_matrix", il);
307+
308+
// step 6: compute y
309+
ggml_tensor * dtX_chunk_perm = ggml_cont(ctx, ggml_permute(ctx, dtX, 1, 2, 0, 3));
310+
ggml_tensor * y = ggml_mul_mat(ctx, dtX_chunk_perm, surrogate_attention_matrix);
311+
y = ggml_cont(ctx, ggml_permute(ctx, y, 0, 2, 1, 3));
312+
cb(y, "y", il);
313+
314+
// step 7: compute dtxdecay
315+
ggml_tensor * decay_last = ggml_view_4d(ctx, decay,
316+
decay->ne[0], 1, decay->ne[2], decay->ne[3],
317+
decay->nb[1], decay->nb[2], decay->nb[3],
318+
(decay->ne[1] - 1) * decay->nb[1]);
319+
decay_last = ggml_cont(ctx, ggml_permute(ctx, decay_last, 2, 0, 1, 3));
320+
cb(decay_last, "decay_last", il);
321+
B_perm = ggml_cont(ctx, B_perm);
322+
B_perm = ggml_repeat_4d(ctx, B_perm,
323+
B_perm->ne[0], B_perm->ne[1], B_perm->ne[2] * repeats, B_perm->ne[3]);
324+
ggml_tensor * dtxdecay = ggml_mul(ctx, dtX, decay_last);
325+
dtxdecay = ggml_cont(ctx, ggml_permute(ctx, dtxdecay, 1, 2, 0, 3));
326+
cb(dtxdecay, "dtxdecay", il);
327+
328+
// step 8: compute next_state
329+
ggml_tensor * next_state = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_permute(ctx, B_perm, 1, 0, 2, 3)), dtxdecay);
330+
if (next_state->type != ssm->type) {
331+
next_state = ggml_cast(ctx, next_state, ssm->type);
398332
}
333+
cb(next_state, "next_state", il);
334+
335+
// TODO: Skip y and state updates if no previous state
336+
337+
// step 9: update from previous state
338+
ggml_tensor * exp_dtA_cumsum = ggml_exp(ctx, ggml_cumsum(ctx, dtA, 1)); // {n_head, chunk_size_i, n_seqs}
339+
cb(exp_dtA_cumsum, "exp_dtA_cumsum", il);
340+
ggml_tensor * exp_dtA_cumsum_last = ggml_view_4d(ctx, exp_dtA_cumsum,
341+
exp_dtA_cumsum->ne[0], 1, exp_dtA_cumsum->ne[2], exp_dtA_cumsum->ne[3],
342+
exp_dtA_cumsum->nb[1], exp_dtA_cumsum->nb[2], exp_dtA_cumsum->nb[3],
343+
(exp_dtA_cumsum->ne[1] - 1) * exp_dtA_cumsum->nb[1]); // {n_head, 1, n_seqs}
344+
cb(exp_dtA_cumsum_last, "exp_dtA_cumsum_last", il);
345+
ggml_tensor * exp_dtA_cumsum_perm = ggml_permute(ctx, exp_dtA_cumsum_last, 2, 1, 3, 0); // {1, 1, n_head, n_seqs}
346+
next_state = ggml_add(ctx, next_state, ggml_mul(ctx, ssm, ggml_cont(ctx, exp_dtA_cumsum_perm)));
347+
cb(next_state, "next_state_updated", il);
348+
349+
// step 10: update from previous y
350+
ggml_tensor * y_prev = ggml_mul_mat(ctx, ggml_permute(ctx, C, 0, 2, 1, 3), ssm);
351+
cb(y_prev, "y_prev", il);
352+
y_prev = ggml_mul(ctx,
353+
ggml_cont(ctx, ggml_permute(ctx, y_prev, 2, 0, 1, 3)),
354+
ggml_cont(ctx, ggml_permute(ctx, exp_dtA_cumsum, 1, 2, 3, 0)));
355+
cb(y_prev, "y_prev_mul", il);
356+
y = ggml_add(ctx, y, y_prev);
357+
cb(y, "y_updated", il);
399358

400359
// Concat the output y and state
401360
if (ssm->type != y->type) {

0 commit comments

Comments
 (0)