@@ -8027,13 +8027,8 @@ struct llm_build_plamo2 : public llm_graph_context {
             // ggml_graph_add_node(gf, model.layers[il].attn_norm);
             // cb(model.layers[il].attn_norm, "attn_norm", il);
 
-            ggml_graph_add_node(gf, model.layers[il].attn_norm);
-            cb(model.layers[il].attn_norm, "attn_norm_weight", il);
-
             // pre_mixer_norm
-            cb(inpL, "attn_pre_norm_input", il);
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "attn_pre_norm", il);
 
             // check if this layer is Mamba or Attention
             bool is_mamba_layer = hparams.is_recurrent(il);
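
Note: `build_norm(..., LLM_NORM_RMS, ...)` applies an RMS normalization to the pre-mixer input. A minimal scalar sketch of that operation, for reference only (the function name and `eps` value here are illustrative, not taken from this patch):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// y[i] = x[i] / sqrt(mean(x^2) + eps) * w[i], applied per row.
std::vector<float> rms_norm(const std::vector<float> & x,
                            const std::vector<float> & w,
                            float eps = 1e-6f) {
    float ss = 0.0f;
    for (float v : x) ss += v * v;                       // sum of squares
    const float scale = 1.0f / std::sqrt(ss / x.size() + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * scale * w[i];                      // normalize, then scale by the norm weight
    }
    return y;
}
```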
@@ -8073,6 +8068,10 @@ struct llm_build_plamo2 : public llm_graph_context {
             cur = ggml_add(ctx0, cur, residual);
 
             inpL = cur;
+
+            if (il >= 2) {
+                break;
+            }
         }
 
         cur = inpL;
@@ -8238,17 +8237,28 @@ struct llm_build_plamo2 : public llm_graph_context {
         ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
         cb(zx, "mamba_in_proj", il);
 
+        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
+        zx = ggml_reshape_4d(ctx0, zx, 2 * hparams.ssm_head_dim, hparams.ssm_num_heads, n_seq_tokens, n_seqs);
+        cb(zx, "mamba_in_proj_out", il);
+
         // split into z and x
         // => {d_inner, n_seq_tokens, n_seqs}
-        ggml_tensor * x = ggml_view_3d(ctx0, zx, d_inner, zx->ne[1], zx->ne[2], zx->nb[1], zx->nb[2], 0);
-        ggml_tensor * z = ggml_view_3d(ctx0, zx, d_inner, zx->ne[1], zx->ne[2], zx->nb[1], zx->nb[2], d_inner*ggml_element_size(zx));
+        ggml_tensor * x = ggml_view_4d(ctx0, zx, hparams.ssm_head_dim, zx->ne[1], zx->ne[2], zx->ne[3], zx->nb[1], zx->nb[2], zx->nb[3], hparams.ssm_head_dim*ggml_element_size(zx));
+        x = ggml_cont(ctx0, x);
+        x = ggml_reshape_4d(ctx0, x, hparams.ssm_head_dim * hparams.ssm_num_heads, 1, n_seq_tokens, n_seqs);
+        x = ggml_permute(ctx0, x, 0, 2, 1, 3);
         cb(x, "mamba_x_split", il);
+        ggml_tensor * z = ggml_view_4d(ctx0, zx, hparams.ssm_head_dim, zx->ne[1], zx->ne[2], zx->ne[3], zx->nb[1], zx->nb[2], zx->nb[3], 0);
+        z = ggml_cont(ctx0, z);
+        z = ggml_reshape_4d(ctx0, z, hparams.ssm_head_dim * hparams.ssm_num_heads, 1, n_seq_tokens, n_seqs);
+        z = ggml_permute(ctx0, z, 0, 2, 1, 3);
         cb(z, "mamba_z_split", il);
 
         // conv1d
         {
             // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
             ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+            cb(conv_x, "mamba_conv1d_input", il);
 
             // copy last (d_conv - 1) columns back into the state cache
             ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
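
Note: the two `ggml_view_4d` calls added above read the in-projection output head by head, with `z` at byte offset 0 and `x` at offset `ssm_head_dim * element_size` within each `2 * ssm_head_dim` row. A plain-array sketch of that split, assuming the projection interleaves `[z_h | x_h]` per head as the offsets suggest (names and layout here are illustrative, not from the patch):

```cpp
#include <cstddef>
#include <vector>

// Split one token's in-projection output (2 * head_dim values per head) into z and x.
void split_zx_per_head(const std::vector<float> & zx,   // size = 2 * head_dim * num_heads
                       size_t head_dim, size_t num_heads,
                       std::vector<float> & z, std::vector<float> & x) {
    z.assign(head_dim * num_heads, 0.0f);
    x.assign(head_dim * num_heads, 0.0f);
    for (size_t h = 0; h < num_heads; ++h) {
        const float * row = zx.data() + h * 2 * head_dim;
        for (size_t i = 0; i < head_dim; ++i) {
            z[h * head_dim + i] = row[i];                // view at offset 0        -> z
            x[h * head_dim + i] = row[head_dim + i];     // view at offset head_dim -> x
        }
    }
}
```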
@@ -8264,9 +8274,6 @@ struct llm_build_plamo2 : public llm_graph_context {
             x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
             cb(x, "mamba_conv1d", il);
 
-            // bias
-            // x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); // PLaMo-2 does not use bias here
-
             x = ggml_silu(ctx0, x);
             cb(x, "mamba_conv1d_silu", il);
         }
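
Note: as I understand it, `ggml_ssm_conv` is used here for a causal depthwise convolution: the cached last `d_conv - 1` columns are prepended to the new tokens and each channel is convolved with its own `d_conv`-tap kernel. A scalar sketch of one channel under that assumption (indexing convention is mine, not taken from ggml):

```cpp
#include <cstddef>
#include <vector>

// One channel: convolve [state | x_new] with a d_conv-tap kernel, one output per new token.
std::vector<float> causal_conv_channel(const std::vector<float> & state,   // last d_conv - 1 inputs
                                       const std::vector<float> & x_new,   // n_tokens new inputs
                                       const std::vector<float> & kernel)  // d_conv taps
{
    std::vector<float> seq = state;
    seq.insert(seq.end(), x_new.begin(), x_new.end());   // like ggml_concat(conv, x) above
    std::vector<float> out(x_new.size(), 0.0f);
    for (size_t t = 0; t < x_new.size(); ++t) {
        for (size_t j = 0; j < kernel.size(); ++j) {
            out[t] += seq[t + j] * kernel[j];            // window ending at the current token
        }
    }
    return out;
}
```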
@@ -8279,9 +8286,9 @@ struct llm_build_plamo2 : public llm_graph_context {
 
         // split into dt, B, C
         const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
-        ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim,  n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
-        ggml_tensor * C  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*dt_dim);
-        ggml_tensor * B  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(dt_dim + d_state));
+        ggml_tensor * B  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
+        ggml_tensor * C  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
+        ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim,  n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
         cb(B, "mamba_B_raw", il);
         cb(C, "mamba_C_raw", il);
         cb(dt, "mamba_dt_raw", il);
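
Note: after this hunk the fused `x_bcdt` projection is read as `[B (d_state) | C (d_state) | dt (dt_dim)]` along the first dimension, so the element offsets are `0`, `d_state`, and `2 * d_state`. A plain-pointer sketch of the same split (struct and function names are placeholders):

```cpp
#include <cstddef>

struct bcdt_views {
    const float * B;
    const float * C;
    const float * dt;
};

// Split one row of the fused projection laid out as [B | C | dt].
bcdt_views split_bcdt(const float * x_bcdt, size_t d_state) {
    bcdt_views v;
    v.B  = x_bcdt;                 // offset 0
    v.C  = x_bcdt + d_state;       // offset d_state
    v.dt = x_bcdt + 2 * d_state;   // offset 2 * d_state
    return v;
}
```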
@@ -8296,15 +8303,17 @@ struct llm_build_plamo2 : public llm_graph_context {
 
         // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
         dt = build_lora_mm(model.layers[il].ssm_dt, dt);
-        dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
         cb(dt, "mamba_dt_proj", il);
 
         // This is corresponding to the broadcast_to operation in ssd_update_state() of the originall code
-        ggml_tensor * dt_expanded = ggml_new_tensor_2d(ctx0, dt->type, d_inner, n_seq_tokens);
+        ggml_tensor * dt_expanded = ggml_new_tensor_2d(ctx0, dt->type, dt_dim * hparams.ssm_num_heads, dt->ne[1]);
         dt = ggml_repeat(ctx0, dt, dt_expanded);
+        cb(dt, "mamba_dt_expanded", il);
+
         ggml_tensor * A_expanded = ggml_new_tensor_2d(ctx0, model.layers[il].ssm_a->type, d_state, d_inner);
         A_expanded = ggml_repeat(ctx0, model.layers[il].ssm_a, A_expanded);
-        cb(dt, "mamba_dt_expanded", il);
+        A_expanded = ggml_exp(ctx0, A_expanded);
+        A_expanded = ggml_scale(ctx0, A_expanded, -1.0f);
         cb(A_expanded, "mamba_A_expanded", il);
 
         // SSM scan operation
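
Note: the added `ggml_exp` followed by `ggml_scale(..., -1.0f)` treats the stored `ssm_a` tensor as `log A` and feeds `A = -exp(log A)` to the scan, keeping the state-decay coefficient negative, as in the usual Mamba parameterization. A scalar sketch of the broadcast-and-transform, with the repeat factor left as a placeholder since `ssm_a`'s stored shape is not shown in this hunk:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Broadcast log-A values and return A = -exp(log A) for each broadcast copy.
std::vector<float> expand_neg_exp_A(const std::vector<float> & a_log, size_t repeat) {
    std::vector<float> A;
    A.reserve(a_log.size() * repeat);
    for (size_t r = 0; r < repeat; ++r) {
        for (float v : a_log) {
            A.push_back(-std::exp(v));   // matches ggml_exp followed by ggml_scale(..., -1.0f)
        }
    }
    return A;
}
```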