
Commit 5e2e254

TEST
1 parent 3df2244 commit 5e2e254

File tree

2 files changed, +308 −0 lines changed


tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -202,6 +202,7 @@ if (NOT LLAMA_SANITIZE_ADDRESS)
 endif()
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
+llama_build_and_test(test-ssm-scan-debug.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")
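
With this registration, llama_build_and_test compiles the new test and adds it to CTest, so after a standard CMake build it should be runnable directly or via `ctest -R ssm-scan-debug` (the exact test name is assumed here from the source file name).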

tests/test-ssm-scan-debug.cpp

Lines changed: 307 additions & 0 deletions
@@ -0,0 +1,307 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <cmath>
#include <chrono>
#include <algorithm> // std::min, std::max

#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif

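// Overview: this test builds the same GGML_OP_SSM_SCAN graph twice, runs it
// once on the CPU backend and once on the Vulkan backend (when compiled in),
// compares the two outputs element-wise, and reports timing for both runs.
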
// Test function with configurable parameters.
// Note: d_inner is head_dim in CUDA terminology for Mamba-2.
static void test_ssm_scan_config(int64_t d_state, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, const char * test_name) {
    std::chrono::microseconds cpu_duration(0), gpu_duration(0);

    printf("\n=== Test: %s ===\n", test_name);
    printf("d_state=%ld, head_dim=%ld, n_head=%ld, n_seq_tokens=%ld, n_seqs=%ld\n",
           (long)d_state, (long)head_dim, (long)n_head, (long)n_seq_tokens, (long)n_seqs);

    const int64_t n_group = 1;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 256*1024*1024, // increased for the larger tests
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);

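    // Note: ggml orders dimensions innermost-first, so s is
    // {d_state, head_dim, n_head, n_seqs}, x is {head_dim, n_head,
    // n_seq_tokens, n_seqs}, and B/C hold one {d_state} vector per group,
    // token and sequence.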
    // Create the input tensors (shapes follow the Mamba-2 configuration).
    struct ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
    struct ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
    struct ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
    struct ggml_tensor * A   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_head); // Mamba-2: {1, n_head}
    struct ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
    struct ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
    struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);

    // Initialize with simple values for debugging.

    // s: initial state (all zeros)
    for (int64_t i = 0; i < ggml_nelements(s); i++) {
        ((float *) s->data)[i] = 0.0f;
    }

    // x: input (simple pattern: 0.1, 0.2, 0.3, ...)
    for (int64_t i = 0; i < ggml_nelements(x); i++) {
        ((float *) x->data)[i] = 0.1f * (i + 1);
    }

    // dt: timestep (all 1.0 for simplicity)
    for (int64_t i = 0; i < ggml_nelements(dt); i++) {
        ((float *) dt->data)[i] = 1.0f;
    }

    // A: decay (all -1.0 for a simple exp)
    for (int64_t i = 0; i < ggml_nelements(A); i++) {
        ((float *) A->data)[i] = -1.0f;
    }

    // B: input matrix (all 1.0)
    for (int64_t i = 0; i < ggml_nelements(B); i++) {
        ((float *) B->data)[i] = 1.0f;
    }

    // C: output matrix (all 1.0)
    for (int64_t i = 0; i < ggml_nelements(C); i++) {
        ((float *) C->data)[i] = 1.0f;
    }

    // ids: sequence ids (0, 1, 2, ...)
    for (int32_t i = 0; i < n_seqs; i++) {
        ((int32_t *) ids->data)[i] = i;
    }

printf("=== Input dimensions ===\n");
78+
printf("d_state=%ld, head_dim=%ld, n_head=%ld, n_seq_tokens=%ld, n_seqs=%ld\n",
79+
(long)d_state, (long)head_dim, (long)n_head, (long)n_seq_tokens, (long)n_seqs);
80+
81+
printf("\n=== Input x (first 16 values) ===\n");
82+
for (int i = 0; i < std::min(16, (int)ggml_nelements(x)); i++) {
83+
printf("x[%d] = %.3f\n", i, ((float*)x->data)[i]);
84+
}
85+
86+
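    // For orientation: GGML_OP_SSM_SCAN implements the selective state-space
    // recurrence of Mamba, roughly s' = exp(dt*A) * s + dt*B*x per state
    // element, with y = C . s' reduced over d_state. The exact semantics
    // (e.g. whether a softplus is applied to dt inside the op) are defined by
    // the backend kernels, not by this test.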
    // Run the CPU version with timing.
    printf("\n=== Running CPU version ===\n");
    struct ggml_tensor * out_cpu = ggml_ssm_scan(ctx, s, x, dt, A, B, C, ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out_cpu);

    ggml_backend_t cpu_backend = ggml_backend_cpu_init();

    auto cpu_start = std::chrono::high_resolution_clock::now();
    ggml_backend_graph_compute(cpu_backend, gf);
    auto cpu_end = std::chrono::high_resolution_clock::now();

    ggml_backend_free(cpu_backend);

    cpu_duration = std::chrono::duration_cast<std::chrono::microseconds>(cpu_end - cpu_start);
    printf("CPU computation time: %.2f ms\n", cpu_duration.count() / 1000.0);

    printf("CPU output (first 32 values):\n");
    for (int i = 0; i < std::min(32, (int) ggml_nelements(out_cpu)); i++) {
        printf("  [%d] = %.6f\n", i, ((float *) out_cpu->data)[i]);
    }

    // Run the GPU version.
    printf("\n=== Running GPU version ===\n");

#ifdef GGML_USE_VULKAN
    ggml_backend_t backend = ggml_backend_vk_init(0);
    if (!backend) {
        printf("Vulkan backend not available\n");
        ggml_free(ctx);
        return;
    }

    // Create a new context for the GPU tensors with no_alloc=true, so tensor
    // data lives in a backend buffer instead of host memory.
    struct ggml_init_params params_gpu = {
        /*.mem_size   =*/ 128*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx_gpu = ggml_init(params_gpu);

    // Create tensor descriptors (no data allocated yet).
    struct ggml_tensor * s_gpu   = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
    struct ggml_tensor * x_gpu   = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
    struct ggml_tensor * dt_gpu  = ggml_new_tensor_3d(ctx_gpu, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
    struct ggml_tensor * A_gpu   = ggml_new_tensor_2d(ctx_gpu, GGML_TYPE_F32, 1, n_head);
    struct ggml_tensor * B_gpu   = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
    struct ggml_tensor * C_gpu   = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
    struct ggml_tensor * ids_gpu = ggml_new_tensor_1d(ctx_gpu, GGML_TYPE_I32, n_seqs);

    // Allocate a backend buffer for all tensors created so far.
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_gpu, backend);
    if (!buffer) {
        printf("Failed to allocate backend buffer\n");
        ggml_backend_free(backend);
        ggml_free(ctx_gpu);
        ggml_free(ctx);
        return;
    }

    // Copy the host data to the GPU via the backend tensor_set interface.
    ggml_backend_tensor_set(s_gpu,   s->data,   0, ggml_nbytes(s));
    ggml_backend_tensor_set(x_gpu,   x->data,   0, ggml_nbytes(x));
    ggml_backend_tensor_set(dt_gpu,  dt->data,  0, ggml_nbytes(dt));
    ggml_backend_tensor_set(A_gpu,   A->data,   0, ggml_nbytes(A));
    ggml_backend_tensor_set(B_gpu,   B->data,   0, ggml_nbytes(B));
    ggml_backend_tensor_set(C_gpu,   C->data,   0, ggml_nbytes(C));
    ggml_backend_tensor_set(ids_gpu, ids->data, 0, ggml_nbytes(ids));

    struct ggml_tensor * out_gpu = ggml_ssm_scan(ctx_gpu, s_gpu, x_gpu, dt_gpu, A_gpu, B_gpu, C_gpu, ids_gpu);

    struct ggml_cgraph * gf_gpu = ggml_new_graph(ctx_gpu);
    ggml_build_forward_expand(gf_gpu, out_gpu);

    // Allocate a buffer for the output tensor and any intermediate tensors.
    ggml_backend_buffer_t output_buffer = ggml_backend_alloc_ctx_tensors(ctx_gpu, backend);
    if (!output_buffer) {
        printf("Failed to allocate output buffer\n");
        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
        ggml_free(ctx_gpu);
        ggml_free(ctx);
        return;
    }
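    // (This second ggml_backend_alloc_ctx_tensors call is expected to pick up
    // only the tensors created since the first allocation, i.e. the scan
    // output above; the input tensors already have their buffer.)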

    auto gpu_start = std::chrono::high_resolution_clock::now();
    ggml_backend_graph_compute(backend, gf_gpu);
    auto gpu_end = std::chrono::high_resolution_clock::now();

    gpu_duration = std::chrono::duration_cast<std::chrono::microseconds>(gpu_end - gpu_start);
    printf("GPU computation time: %.2f ms\n", gpu_duration.count() / 1000.0);

    // Copy the results back to the host.
    std::vector<float> gpu_result(ggml_nelements(out_gpu));
    ggml_backend_tensor_get(out_gpu, gpu_result.data(), 0, ggml_nbytes(out_gpu));

    printf("GPU output (first 32 values):\n");
    for (int i = 0; i < std::min(32, (int) ggml_nelements(out_gpu)); i++) {
        printf("  [%d] = %.6f\n", i, gpu_result[i]);
    }

    // Compare with a detailed analysis.
    printf("\n=== Detailed Comparison (CPU vs GPU) ===\n");
    int errors = 0;
    float max_error = 0.0f;
    float avg_error = 0.0f;
    int total_elements = (int) ggml_nelements(out_cpu);

    for (int i = 0; i < total_elements; i++) {
        float cpu_val = ((float *) out_cpu->data)[i];
        float gpu_val = gpu_result[i];
        float diff = std::abs(cpu_val - gpu_val);
        avg_error += diff;
        max_error = std::max(max_error, diff);

        if (diff > 1e-5) {
            if (errors < 10) { // show the first 10 errors
                // Work out which sequence, timestep, head and dimension this is.
                int elements_per_timestep = (int) (n_head * head_dim);
                int elements_per_seq = (int) (n_seq_tokens * elements_per_timestep);
                int seq_idx = i / elements_per_seq;
                int remainder = i % elements_per_seq;
                int timestep = remainder / elements_per_timestep;
                int head_dim_idx = remainder % elements_per_timestep;
                int head = head_dim_idx / (int) head_dim;
                int dim = head_dim_idx % (int) head_dim;

                printf("  [%d] CPU=%.6f GPU=%.6f DIFF=%.6f (seq=%d, t=%d, h=%d, d=%d)\n",
                       i, cpu_val, gpu_val, diff, seq_idx, timestep, head, dim);
            }
            errors++;
        }
    }

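    // Note: 1e-5 is an absolute tolerance. For configurations that produce
    // large-magnitude outputs (long scans accumulate sums over d_state),
    // a relative error check might be the fairer comparison.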
    avg_error /= total_elements;

    printf("Summary:\n");
    printf("  Total elements: %d\n", total_elements);
    printf("  Errors (>1e-5): %d (%.2f%%)\n", errors, 100.0f * errors / total_elements);
    printf("  Max error: %.9f\n", max_error);
    printf("  Average error: %.9f\n", avg_error);

    if (errors == 0) {
        printf("  ✓ PASS: All values match within tolerance!\n");
    } else if (avg_error < 1e-4) {
        printf("  ⚠ MARGINAL: Small numerical differences\n");
    } else {
        printf("  ✗ FAIL: Significant numerical errors\n");
    }

    // Performance comparison.
    printf("\n=== Performance Comparison ===\n");
    if (cpu_duration.count() > 0 && gpu_duration.count() > 0) {
        double speedup = (double) cpu_duration.count() / gpu_duration.count();
        printf("  CPU time: %.2f ms\n", cpu_duration.count() / 1000.0);
        printf("  GPU time: %.2f ms\n", gpu_duration.count() / 1000.0);
        printf("  Speedup: %.2fx %s\n", speedup, speedup > 1.0 ? "(GPU faster)" : "(CPU faster)");

        // Throughput: (state-element updates per millisecond) / 1e6 gives
        // billions of elements per second.
        double total_ops = (double) (d_state * head_dim * n_head * n_seq_tokens * n_seqs);
        double cpu_gops = total_ops / (cpu_duration.count() / 1000.0) / 1e6;
        double gpu_gops = total_ops / (gpu_duration.count() / 1000.0) / 1e6;
        printf("  CPU throughput: %.2f billion elements/sec\n", cpu_gops);
        printf("  GPU throughput: %.2f billion elements/sec\n", gpu_gops);
    }

    ggml_backend_buffer_free(output_buffer);
    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
    ggml_free(ctx_gpu);
#else
    printf("Vulkan not enabled in build\n");
#endif

    ggml_free(ctx);
}

// Small test case for debugging the SSM scan.
static void test_ssm_scan_small() {
    test_ssm_scan_config(16, 2, 2, 2, 1, "Small test (original)");
}

static void test_ssm_scan_performance() {
    printf("=== SSM SCAN PERFORMANCE TESTING ===\n");
    printf("Testing supported configurations based on the CUDA implementation\n");

    // Group 1: Mamba-2 configurations (d_state=128)
    printf("\n--- Group 1: Mamba-2 (d_state=128) ---\n");
    test_ssm_scan_config(128, 16, 8, 16, 1, "Mamba-2 small: d_state=128, head_dim=16");
    test_ssm_scan_config(128, 32, 16, 32, 1, "Mamba-2 medium: d_state=128, head_dim=32");
    test_ssm_scan_config(128, 64, 32, 64, 1, "Mamba-2 large: d_state=128, head_dim=64");

    // Group 2: Very large test (user-requested)
    printf("\n--- Group 2: Mamba-2 Stress Test ---\n");
    test_ssm_scan_config(128, 64, 128, 512, 1, "MASSIVE: d_state=128, head_dim=64, n_head=128, n_seq_tokens=512");

    // Group 3: Mamba-2 with d_state=256
    printf("\n--- Group 3: Mamba-2 (d_state=256) ---\n");
    test_ssm_scan_config(256, 16, 8, 16, 1, "Mamba-2 d_state=256 small");
    test_ssm_scan_config(256, 32, 16, 32, 1, "Mamba-2 d_state=256 medium");

    // Group 4: Multiple sequences
    printf("\n--- Group 4: Multiple Sequences ---\n");
    test_ssm_scan_config(128, 32, 16, 64, 2, "Mamba-2 with 2 sequences");
    test_ssm_scan_config(128, 32, 16, 64, 4, "Mamba-2 with 4 sequences");

    printf("\n=== ALL PERFORMANCE TESTS COMPLETED ===\n");
    printf("Note: only Mamba-2 configurations (d_state=128/256) are tested, as these are the primary use case\n");
    printf("Mamba-1 (d_state=16) requires a different tensor setup and is less commonly used\n");
}

int main() {
    // Run the massive case specifically requested by the user first.
    printf("=== FOCUSED LARGE MATRIX TEST ===\n");
    test_ssm_scan_config(128, 64, 128, 512, 1, "MASSIVE: d_state=128, head_dim=64, n_head=128, n_seq_tokens=512");

    // Then run the full performance suite.
    printf("\n");
    test_ssm_scan_performance();

    return 0;
}
