@@ -10865,7 +10865,7 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
     float * dst_data = (float *) dst->data;
     // Following GLA pattern: output is first part, state is second part
     float * output = dst_data; // [S_v * H_v, n_tokens, 1, n_seqs] - only real sequence length, not padded
-    float * new_state = dst_data + (S_v * H_v * n_tokens * n_seqs); // [S_v * H_v, S_v * n_seqs, 1, 1]
+    float * new_state = dst_data + (S_v * H_v * n_tokens * n_seqs); // [S_v, S_v * H_v, 1, n_seqs]

     const int ith = params->ith;
     // const int nth = params->nth; // nth is unused
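Note on the corrected shape comment: viewed as [S_v, S_v * H_v, 1, n_seqs], the state part of dst is, per sequence, H_v row-major S_v x S_v matrices stored back to back. A minimal sketch of an indexing helper consistent with that layout; new_state_at is hypothetical and not part of the patch:

// Hypothetical helper: matches the flat indexing the init loop below uses.
static inline float * new_state_at(float * new_state, int64_t S_v, int64_t H_v,
                                   int64_t seq, int64_t head, int64_t i, int64_t j) {
    return new_state + seq * (S_v * S_v * H_v) + head * (S_v * S_v) + i * S_v + j;
}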
@@ -10884,6 +10884,20 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml

     float * state_data = (float *) src4->data;

+    // Init new state with the initial state (will probably be zeroes)
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            for (int64_t i = 0; i < S_v; i++) {
+                for (int64_t j = 0; j < S_v; j++) {
+                    new_state[seq * (S_v * S_v * H_v) + head * (S_v * S_v) + i * S_v + j] =
+                        state_data[seq * src4->nb[3] / sizeof(float) + (head * S_v + i) * src4->nb[1] / sizeof(float) + j * src4->nb[0] / sizeof(float)];
+                }
+            }
+        }
+    }
+    print_debug_info(new_state, S_v * S_v * H_v * n_seqs, "init_state", -1);
+
+
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(src1));
     GGML_ASSERT(ggml_is_contiguous(src2));
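The init loop above addresses src4 through ggml's per-dimension byte strides (tensor->nb), dividing by sizeof(float) to get element offsets. A sketch of the same addressing done directly in bytes, assuming src4 holds F32 data; src4_state_get is illustrative only:

// Illustrative only: nb[d] is the byte stride of dimension d, and dim 1 is
// indexed by the fused row (head * S_v + i), mirroring the copy above.
static inline float src4_state_get(const struct ggml_tensor * src4, int64_t S_v,
                                   int64_t seq, int64_t head, int64_t i, int64_t j) {
    const char * base = (const char *) src4->data;
    return *(const float *) (base + seq * src4->nb[3] + (head * S_v + i) * src4->nb[1] + j * src4->nb[0]);
}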
@@ -10896,12 +10910,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml

     // int64_t total_params = n_seqs * H_v * num_chunks;
     // int64_t per_thread = (total_params % nth == 0) ? total_params / nth : (total_params / nth) + 1;
-
-    // Create helper lambda for state tensor access
-    const auto state_ptr = [state_data, src4](int64_t seq, int64_t head, int64_t i, int64_t j) {
-        return state_data + (j * src4->nb[0] / sizeof(float)) + (i * src4->nb[1] / sizeof(float)) +
-               (head * src4->nb[2] / sizeof(float)) + (seq * src4->nb[3] / sizeof(float));
-    };

     float * attn = (float *) malloc(chunk_size * chunk_size * H_v * n_seqs * sizeof(float));
     float * value = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
@@ -11048,15 +11056,15 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml

         // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
         // k_cumdecay has shape [chunk_size, v_head_dim], state has shape [v_head_dim, v_head_dim]
-        delta_matmul_state_chunk_f32(k_cumdecay, state_data, v_prime, chunk_size, S_v, S_v, n_seqs, H_v);
+        delta_matmul_state_chunk_f32(k_cumdecay, new_state, v_prime, chunk_size, S_v, S_v, n_seqs, H_v);
         print_debug_info(v_prime, chunk_size * S_v * H_v * n_seqs, "v_prime_chunk", chunk);

         // v_new = v_i - v_prime
         delta_tensor_subtract_chunk_f32(value, v_prime, v_new, chunk_size * S_v, n_seqs, H_v);
         print_debug_info(v_new, chunk_size * S_v * H_v * n_seqs, "v_new_chunk", chunk);

         // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
-        delta_matmul_state_chunk_f32(q_g_exp, state_data, attn_inter, chunk_size, S_v, S_v, n_seqs, H_v);
+        delta_matmul_state_chunk_f32(q_g_exp, new_state, attn_inter, chunk_size, S_v, S_v, n_seqs, H_v);
         print_debug_info(attn_inter, chunk_size * S_v * H_v * n_seqs, "attn_inter_chunk", chunk);

         // core_attn_out[:, :, i] = attn_inter + attn @ v_new
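Both call sites now read the running state from new_state rather than the read-only src4 buffer, so each chunk sees the state written by the previous one. From the shape comments above, delta_matmul_state_chunk_f32 is taken to be a batched per-(seq, head) matmul of a [chunk_size, S_v] chunk against an [S_v, S_v] state; a minimal sketch of that assumed contract, not the actual implementation:

// Sketch under the assumption stated above: one plain row-major matmul per
// (seq, head) batch, with all three buffers laid out batch-contiguously.
static void delta_matmul_state_sketch(const float * a, const float * s, float * out,
                                      int64_t rows, int64_t inner, int64_t cols,
                                      int64_t n_seqs, int64_t H_v) {
    for (int64_t b = 0; b < n_seqs * H_v; b++) {
        const float * ab = a   + b * rows * inner;
        const float * sb = s   + b * inner * cols;
        float       * ob = out + b * rows * cols;
        for (int64_t r = 0; r < rows; r++) {
            for (int64_t c = 0; c < cols; c++) {
                float acc = 0.0f;
                for (int64_t k = 0; k < inner; k++) {
                    acc += ab[r * inner + k] * sb[k * cols + c];
                }
                ob[r * cols + c] = acc;
            }
        }
    }
}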
@@ -11203,19 +11211,16 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
             }
         }
         print_debug_info(output, S_v * H_v * n_tokens * n_seqs, "output", chunk);
-
-        // Update state tensor (all sequences and heads)
+        GGML_LOG_INFO("\nFull output tensor:\n\n");
         for (int64_t seq = 0; seq < n_seqs; seq++) {
             for (int64_t head = 0; head < H_v; head++) {
-                float * temp_state_ptr = temp_state + seq * (S_v * S_v * H_v) + head * (S_v * S_v);
-
-                for (int64_t i = 0; i < S_v; i++) {
-                    for (int64_t j = 0; j < S_v; j++) {
-                        int64_t state_idx = seq * S_v * S_v * H_v + head * S_v * S_v + i * S_v + j;
-                        new_state[state_idx] = temp_state_ptr[i * S_v + j];
-                        *(state_ptr(seq, head, i, j)) = temp_state_ptr[i * S_v + j];
+                GGML_LOG_INFO("\n[ ");
+                for (int64_t i = 0; i < n_tokens; i++) {
+                    for (int64_t d = 0; d < S_v; d++) {
+                        GGML_LOG_INFO("%.4f ", output[seq * (n_tokens * S_v * H_v) + head * (n_tokens * S_v) + (chunk * chunk_size + i) * S_v + d]);
                     }
                 }
+                GGML_LOG_INFO("]");
             }
         }
         print_debug_info(new_state, S_v * S_v * H_v * n_seqs, "new_state", chunk);
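With new_state serving as both the running recurrent state and the second part of dst (see the first hunk), the old explicit write-back through state_ptr is no longer needed: after the chunk loop, the final state is expected to already sit in dst right after the output part. A sketch of how a caller could locate it; the variable names are illustrative:

// Illustrative: dst packs the output first, then the final recurrent state.
float * out_part   = (float *) dst->data;
float * state_part = out_part + S_v * H_v * n_tokens * n_seqs; // [S_v, S_v * H_v, 1, n_seqs]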