@@ -8728,7 +8728,7 @@ static void ggml_compute_forward_ssm_scan_f32(
             // n_head
             for (int h = ih0; h < ih1; ++h) {
                 // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                const float dt_soft_plus = ggml_softplus(dt[h]);
+                const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
                 const float dA = expf(dt_soft_plus * A[h]);
                 const int g = h / (nh / ng); // repeat_interleave

@@ -8825,7 +8825,7 @@ static void ggml_compute_forward_ssm_scan_f32(
             // n_head
             for (int h = ih0; h < ih1; ++h) {
                 // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                const float dt_soft_plus = ggml_softplus(dt[h]);
+                const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
                 const int g = h / (nh / ng); // repeat_interleave

                 // dim
@@ -9712,22 +9712,6 @@ void ggml_compute_forward_gla(
     }
 }

-static double debug_sum(float * data, size_t size) {
-    double sum = 0.0;
-    for (unsigned int i = 0; i < size; i++) {
-        sum += data[i];
-    }
-    return sum;
-}
-
-static void print_debug_info(float * data, size_t size, const char * name, int64_t token) {
-#ifdef MR_CHUNKY_TALKS
-    GGML_LOG_INFO("\nggml-debug: %s (%ld) first 5 values: [%.6f, %.6f, %.6f, %.6f, %.6f, ...]\n",
-        name, token, data[0], data[1], data[2], data[3], data[4]);
-    GGML_LOG_INFO("total elements: %ld, sum = %.10f\n", size, debug_sum(data, size));
-#endif MR_CHUNKY_TALKS
-}
-
 // Helper function to compute cumulative sum
 static void delta_cumsum_f32(const float * x, float * dst, const int64_t n) {
     float cumsum = 0.0f;
@@ -9837,34 +9821,9 @@ static void delta_apply_triangular_updates_chunk_f32(float * attn,
                         attn_ptr[i * chunk_size + j] = row[j] + sum_val;
                     }

-                    if (i % 10 == 0) {
-                        if (seq == 1 && head == 0 && chunk == 0) {
-                            print_debug_info(row, i, "row[1, 0, 0]", i);
-                            print_debug_info(sub, i * i, "sub[1, 0, 0]", i);
-                        }
-                        if (seq == 0 && head == 1 && chunk == 0) {
-                            print_debug_info(row, i, "row[0, 1, 0]", i);
-                            print_debug_info(sub, i * i, "sub[0, 1, 0]", i);
-                        }
-                        if (seq == 0 && head == 0 && chunk == 1) {
-                            print_debug_info(row, i, "row[0, 0, 1]", i);
-                            print_debug_info(sub, i * i, "sub[0, 0, 1]", i);
-                        }
-                    }
-
                     free(row);
                     free(sub);
                 }
-
-                if (seq == 1 && head == 0 && chunk == 0) {
-                    print_debug_info(attn_ptr, chunk_size * chunk_size, "attn[1, 0, 0]", 0);
-                }
-                if (seq == 0 && head == 1 && chunk == 0) {
-                    print_debug_info(attn_ptr, chunk_size * chunk_size, "attn[0, 1, 0]", 0);
-                }
-                if (seq == 0 && head == 0 && chunk == 1) {
-                    print_debug_info(attn_ptr, chunk_size * chunk_size, "attn[0, 0, 1]", 0);
-                }
             }
         }
     }
@@ -10191,8 +10150,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }
     }
-    print_debug_info(new_state, S_v * S_v * H_v * n_seqs, "init_state", -1);
-

     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(src1));
@@ -10229,13 +10186,10 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
     // for i in range(1, chunk_size): attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
     // attn = attn + torch.eye(chunk_size)
     delta_apply_triangular_updates_chunk_f32(attn, chunk_size, n_seqs, H_v, num_chunks);
-    print_debug_info(attn, chunk_size * chunk_size * H_v * num_chunks * n_seqs, "attn_chunk", -1);
     delta_add_identity_matrix_chunk_f32(attn, chunk_size, n_seqs, H_v, num_chunks);
-    print_debug_info(attn, chunk_size * chunk_size * H_v * num_chunks * n_seqs, "attn_eye", -1);

     // Compute value = attn @ v_beta
     delta_compute_value_f32(attn, (const float *) src6->data, value, chunk_size, S_v, H_v, n_seqs, num_chunks);
-    print_debug_info(value, chunk_size * S_v * H_v * num_chunks * n_seqs, "value", -1);

     for (int64_t seq = 0; seq < n_seqs; seq++) {
         for (int i = 0; i < num_chunks; i++) {
@@ -10248,7 +10202,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }
     }
-    print_debug_info(k_cumdecay, chunk_size * S_v * H_v * num_chunks * n_seqs, "k_cumdecay", -1);

     // Process each chunk with all sequences and heads together
     for (int64_t chunk = 0; chunk < num_chunks; chunk++) {
@@ -10304,9 +10257,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }

-        print_debug_info(pc_q_chunk_data, chunk_size * S_v * H_v * n_seqs, "q_i_chunk", chunk);
-        print_debug_info(pc_k_chunk_data, chunk_size * S_v * H_v * n_seqs, "k_i_chunk", chunk);
-
         // Step 4: Compute NEW attention matrix for this chunk: attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
         // Note: decay_mask[:, :, i] means we need to use the decay_mask for this specific chunk
         // The mask applied is the simple causal attention mask: torch.triu(torch.ones(chunk_size, chunk_size), diagonal=1)
@@ -10328,7 +10278,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
                 delta_matmul_f32(q_ptr, k_trans, attn_ptr, chunk_size, chunk_size, S_v);
             }
         }
-        print_debug_info(attn, chunk_size * chunk_size * num_chunks * H_v * n_seqs, "attn_q_k_trans", chunk);

         for (int64_t seq = 0; seq < n_seqs; seq++) {
             for (int64_t head = 0; head < H_v; head++) {
@@ -10348,20 +10297,15 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }

-        print_debug_info(attn, chunk_size * chunk_size * num_chunks * H_v * n_seqs, "attn_step4_new_chunk", chunk);
-
         // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
         // k_cumdecay has shape [chunk_size, v_head_dim], state has shape [v_head_dim, v_head_dim]
         delta_matmul_state_chunk_f32(k_cumdecay, new_state, pc_v_prime, chunk_size, S_v, S_v, n_seqs, H_v, chunk, num_chunks);
-        print_debug_info(pc_v_prime, chunk_size * S_v * H_v * n_seqs, "v_prime_chunk", chunk);

         // v_new = v_i - v_prime
         delta_tensor_subtract_chunk_f32(value, pc_v_prime, pc_v_new, chunk_size * S_v, n_seqs, H_v, num_chunks, chunk);
-        print_debug_info(pc_v_new, chunk_size * S_v * H_v * n_seqs, "v_new_chunk", chunk);

         // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
         delta_matmul_state_chunk_f32(pc_q_g_exp, new_state, pc_attn_inter, chunk_size, S_v, S_v, n_seqs, H_v, -1, -1);
-        print_debug_info(pc_attn_inter, chunk_size * S_v * H_v * n_seqs, "attn_inter_chunk", chunk);

         // core_attn_out[:, :, i] = attn_inter + attn @ v_new
         // Use regular matrix multiplication for attn @ v_new
@@ -10375,9 +10319,7 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
                 delta_matmul_f32(attn_ptr, v_new_ptr, attn_v_new_ptr, chunk_size, S_v, chunk_size);
             }
         }
-        print_debug_info(pc_attn_v_new, chunk_size * S_v * H_v * n_seqs, "attn_v_new_chunk", chunk);
         delta_tensor_add_chunk_f32(pc_attn_inter, pc_attn_v_new, pc_core_attn_out, chunk_size * S_v, n_seqs, H_v);
-        print_debug_info(pc_core_attn_out, chunk_size * S_v * H_v * n_seqs, "core_attn_out_chunk", chunk);

         // Prepare g_last and g_diff_exp for state update
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10394,9 +10336,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }

-        print_debug_info(pc_g_last, H_v * n_seqs, "g_last_chunk", chunk);
-        print_debug_info(pc_g_diff_exp, chunk_size * H_v * n_seqs, "g_diff_exp", chunk);
-
         float * k_g_diffexp = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
         for (int64_t seq = 0; seq < n_seqs; seq++) {
             for (int64_t head = 0; head < H_v; head++) {
@@ -10408,7 +10347,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
                 }
             }
         }
-        print_debug_info(k_g_diffexp, chunk_size * S_v * H_v * n_seqs, "k_g_diffexp", chunk);
         float * k_g_diffexp_T = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
         for (int64_t seq = 0; seq < n_seqs; seq++) {
             for (int64_t head = 0; head < H_v; head++) {
@@ -10421,25 +10359,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }

-        // for (int64_t seq = 0; seq < n_seqs; seq++) {
-        //     for (int64_t head = 0; head < H_v; head++) {
-        //         GGML_LOG_INFO("Sequence %ld, head %ld: \n[ ", seq, head);
-        //         for (int i = 0; i < chunk_size; i++) {
-        //             GGML_LOG_INFO("[ ");
-        //             for (int j = 0; j < S_v; j++) {
-        //                 GGML_LOG_INFO("%.6f", k_g_diffexp[(chunk_size * S_v * H_v) * seq + (chunk_size * S_v) * head + i * S_v + j]);
-        //                 if (j < chunk_size - 1) {
-        //                     GGML_LOG_INFO(", ");
-        //                 }
-        //             }
-        //             GGML_LOG_INFO("], \n");
-        //         }
-        //         GGML_LOG_INFO("]\n");
-        //     }
-        // }
-
-        print_debug_info(k_g_diffexp_T, chunk_size * S_v * H_v * n_seqs, "k_g_diffexp_T", chunk);
-
         float * kgd_mul_vnew = (float *) malloc(S_v * S_v * H_v * n_seqs * sizeof(float));

         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10450,24 +10369,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
                                  S_v, S_v, chunk_size);
             }
         }
-        print_debug_info(kgd_mul_vnew, S_v * S_v * H_v * n_seqs, "kgd_mul_vnew", chunk);
-
-        // for (int64_t seq = 0; seq < n_seqs; seq++) {
-        //     for (int64_t head = 0; head < H_v; head++) {
-        //         GGML_LOG_INFO("Sequence %ld, head %ld: \n[ ", seq, head);
-        //         for (int i = 0; i < S_v; i++) {
-        //             GGML_LOG_INFO("[ ");
-        //             for (int j = 0; j < S_v; j++) {
-        //                 GGML_LOG_INFO("%.6f", kgd_mul_vnew[(S_v * S_v * H_v) * seq + (S_v * S_v) * head + i * S_v + j]);
-        //                 if (j < S_v - 1) {
-        //                     GGML_LOG_INFO(", ");
-        //                 }
-        //             }
-        //             GGML_LOG_INFO("], \n");
-        //         }
-        //         GGML_LOG_INFO("]\n");
-        //     }
-        // }

         for (int64_t seq = 0; seq < n_seqs; seq++) {
             for (int64_t head = 0; head < H_v; head++) {
@@ -10480,7 +10381,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
                 }
             }
         }
-        print_debug_info(new_state, S_v * S_v * H_v * n_seqs, "state_end_chunk", chunk);

         // Free temporary memory
         free(pc_q_chunk_data);
@@ -10511,21 +10411,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
                 }
             }
         }
-        print_debug_info(output, S_v * H_v * n_tokens * n_seqs, "output", chunk);
-        // GGML_LOG_INFO("\nFull output tensor: \n\n");
-        // for (int64_t seq = 0; seq < n_seqs; seq++) {
-        //     for (int64_t head = 0; head < H_v; head++) {
-        //         GGML_LOG_INFO("\n[ ");
-        //         for (int64_t i = 0; i < n_tokens; i++) {
-        //             for (int64_t d = 0; d < S_v; d++) {
-        //                 GGML_LOG_INFO("%.4f ", output[seq * (n_tokens * S_v * H_v) + head * (n_tokens * S_v) + (chunk * chunk_size + i) * S_v + d]);
-        //             }
-        //         }
-        //         GGML_LOG_INFO(" ]");
-        //     }
-        // }
-        print_debug_info(new_state, S_v * S_v * H_v * n_seqs, "new_state", chunk);
-
         free(pc_core_attn_out);
         free(pc_attn_inter);
         free(pc_v_new);
@@ -10622,7 +10507,6 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state_copy", token);

         // 1. last_recurrent_state = last_recurrent_state * g_t (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10636,7 +10520,6 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state_times_g_t", token);

         // 2. kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10651,7 +10534,6 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        print_debug_info(kv_mem, n_seqs * H_v * S_v, "kv_mem", token);

         // 3. delta = (v_t - kv_mem) * beta_t (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10663,7 +10545,6 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        print_debug_info(delta, n_seqs * H_v * S_v, "delta", token);

         // 4. last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10677,7 +10558,6 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state", token);

         // 5. core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10691,7 +10571,6 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        print_debug_info(attn_out_t, n_seqs * S_v * H_v, "attn_out_t", token);

         // Store the output for this token (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
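
Note: besides dropping the debug helpers, the first two hunks switch the SSM scan from ggml_softplus to ggml_compute_softplus_f32, per the Mamba softplus kernel referenced in the comments. As a rough illustration only (not the actual ggml helper), a numerically safe softplus of this kind could look like the sketch below; softplus_f32_sketch is a hypothetical name and the threshold value is an assumption.

#include <math.h>

// Hypothetical sketch: softplus(x) = log(1 + exp(x)), with a pass-through for
// large x so that expf() does not overflow (log1p(exp(x)) ~= x there).
// The real helper in ggml may differ in name, threshold, and implementation.
static inline float softplus_f32_sketch(float x) {
    return (x > 20.0f) ? x : log1pf(expf(x));
}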