|
| 1 | +#include "ggml.h" |
| 2 | +#include "ggml-backend.h" |
| 3 | +#include "ggml-cpu.h" |
| 4 | +#include <cstdio> |
| 5 | +#include <cstdlib> |
| 6 | +#include <cstring> |
| 7 | +#include <vector> |
| 8 | +#include <cmath> |
| 9 | +#include <chrono> |
| | +#include <algorithm> // std::min, std::max used below |
| 10 | + |
| 11 | +#ifdef GGML_USE_VULKAN |
| 12 | +#include "ggml-vulkan.h" |
| 13 | +#endif |
| 14 | + |
| 15 | +// Test function with configurable parameters |
| 16 | +// Note: d_inner is head_dim in CUDA terminology for Mamba-2 |
| 17 | +static void test_ssm_scan_config(int64_t d_state, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, const char* test_name) { |
| 18 | + std::chrono::microseconds cpu_duration(0), gpu_duration(0); |
| 19 | + printf("\n=== Test: %s ===\n", test_name); |
| 20 | + printf("d_state=%ld, head_dim=%ld, n_head=%ld, n_seq_tokens=%ld, n_seqs=%ld\n", |
| 21 | + (long)d_state, (long)head_dim, (long)n_head, (long)n_seq_tokens, (long)n_seqs); |
| 22 | + const int64_t n_group = 1; |
| 23 | + |
| 24 | + struct ggml_init_params params = { |
| 25 | + /*.mem_size =*/ 256*1024*1024, // Increased for larger tests |
| 26 | + /*.mem_buffer =*/ NULL, |
| 27 | + /*.no_alloc =*/ false, |
| 28 | + }; |
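| | + // no_alloc = false: tensor data is allocated directly inside this context's host memory pool |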
| 29 | + |
| 30 | + struct ggml_context * ctx = ggml_init(params); |
| 31 | + |
| 32 | + // Create input tensors (updated for Mamba-2 configuration) |
| 33 | + struct ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); |
| 34 | + struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); |
| 35 | + struct ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); |
| 36 | + struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_head); // Mamba-2: {1, n_head} |
| 37 | + struct ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 38 | + struct ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 39 | + struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs); |
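| | + // Shape conventions for ggml_ssm_scan in the Mamba-2 case: |
| | + //   s {d_state, head_dim, n_head, n_seqs}, x {head_dim, n_head, n_seq_tokens, n_seqs}, |
| | + //   dt {n_head, n_seq_tokens, n_seqs}, A {1, n_head}, B/C {d_state, n_group, n_seq_tokens, n_seqs} |
| | + // ids selects which slot of s each sequence reads and updates; here it is simply the identity mapping |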
| 40 | + |
| 41 | + // Initialize with simple values for debugging |
| 42 | + // s: initial state (all zeros) |
| 43 | + for (int i = 0; i < ggml_nelements(s); i++) { |
| 44 | + ((float*)s->data)[i] = 0.0f; |
| 45 | + } |
| 46 | + |
| 47 | + // x: input (simple pattern: 0.1, 0.2, 0.3, ...) |
| 48 | + for (int i = 0; i < ggml_nelements(x); i++) { |
| 49 | + ((float*)x->data)[i] = 0.1f * (i + 1); |
| 50 | + } |
| 51 | + |
| 52 | + // dt: timestep (all 1.0 for simplicity) |
| 53 | + for (int i = 0; i < ggml_nelements(dt); i++) { |
| 54 | + ((float*)dt->data)[i] = 1.0f; |
| 55 | + } |
| 56 | + |
| 57 | + // A: decay (all -1.0 for simple exp) |
| 58 | + for (int i = 0; i < ggml_nelements(A); i++) { |
| 59 | + ((float*)A->data)[i] = -1.0f; |
| 60 | + } |
| 61 | + |
| 62 | + // B: input matrix (all 1.0) |
| 63 | + for (int i = 0; i < ggml_nelements(B); i++) { |
| 64 | + ((float*)B->data)[i] = 1.0f; |
| 65 | + } |
| 66 | + |
| 67 | + // C: output matrix (all 1.0) |
| 68 | + for (int i = 0; i < ggml_nelements(C); i++) { |
| 69 | + ((float*)C->data)[i] = 1.0f; |
| 70 | + } |
| 71 | + |
| 72 | + // ids: sequence ids (0, 1, 2, ...) |
| 73 | + for (int i = 0; i < n_seqs; i++) { |
| 74 | + ((int32_t*)ids->data)[i] = i; |
| 75 | + } |
| 76 | + |
| 77 | + printf("=== Input dimensions ===\n"); |
| 78 | + printf("d_state=%ld, head_dim=%ld, n_head=%ld, n_seq_tokens=%ld, n_seqs=%ld\n", |
| 79 | + (long)d_state, (long)head_dim, (long)n_head, (long)n_seq_tokens, (long)n_seqs); |
| 80 | + |
| 81 | + printf("\n=== Input x (first 16 values) ===\n"); |
| 82 | + for (int i = 0; i < std::min(16, (int)ggml_nelements(x)); i++) { |
| 83 | + printf("x[%d] = %.3f\n", i, ((float*)x->data)[i]); |
| 84 | + } |
| 85 | + |
| 86 | + // Run CPU version with timing |
| 87 | + printf("\n=== Running CPU version ===\n"); |
| 88 | + struct ggml_tensor * out_cpu = ggml_ssm_scan(ctx, s, x, dt, A, B, C, ids); |
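| | + // Note: the ssm_scan result is expected to be a flat F32 tensor holding the per-token outputs y |
| | + // (same element count as x) followed by the updated states, so the comparison below also covers |
| | + // the final state; the seq/t/h/d breakdown further down only applies to the y portion |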
| 89 | + |
| 90 | + struct ggml_cgraph * gf = ggml_new_graph(ctx); |
| 91 | + ggml_build_forward_expand(gf, out_cpu); |
| 92 | + |
| 93 | + ggml_backend_t cpu_backend = ggml_backend_cpu_init(); |
| 94 | + |
| 95 | + auto cpu_start = std::chrono::high_resolution_clock::now(); |
| 96 | + ggml_backend_graph_compute(cpu_backend, gf); |
| 97 | + auto cpu_end = std::chrono::high_resolution_clock::now(); |
| 98 | + |
| 99 | + ggml_backend_free(cpu_backend); |
| 100 | + |
| 101 | + cpu_duration = std::chrono::duration_cast<std::chrono::microseconds>(cpu_end - cpu_start); |
| 102 | + printf("CPU computation time: %.2f ms\n", cpu_duration.count() / 1000.0); |
| 103 | + |
| 104 | + printf("CPU output (first 32 values):\n"); |
| 105 | + for (int i = 0; i < std::min(32, (int)ggml_nelements(out_cpu)); i++) { |
| 106 | + printf(" [%d] = %.6f\n", i, ((float*)out_cpu->data)[i]); |
| 107 | + } |
| 108 | + |
| 109 | + // Run GPU version |
| 110 | + printf("\n=== Running GPU version ===\n"); |
| 111 | + |
| 112 | +#ifdef GGML_USE_VULKAN |
| 113 | + ggml_backend_t backend = ggml_backend_vk_init(0); |
| 114 | + if (!backend) { |
| 115 | + printf("Vulkan backend not available\n"); |
| 116 | + ggml_free(ctx); |
| 117 | + return; |
| 118 | + } |
| 119 | + |
| 120 | + // Create new context for GPU tensors with no_alloc=true |
| 121 | + struct ggml_init_params params_gpu = { |
| 122 | + /*.mem_size =*/ 128*1024*1024, |
| 123 | + /*.mem_buffer =*/ NULL, |
| 124 | + /*.no_alloc =*/ true, |
| 125 | + }; |
| 126 | + struct ggml_context * ctx_gpu = ggml_init(params_gpu); |
| 127 | + |
| 128 | + // Create tensor descriptors (no data allocated yet) |
| 129 | + struct ggml_tensor * s_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); |
| 130 | + struct ggml_tensor * x_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); |
| 131 | + struct ggml_tensor * dt_gpu = ggml_new_tensor_3d(ctx_gpu, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); |
| 132 | + struct ggml_tensor * A_gpu = ggml_new_tensor_2d(ctx_gpu, GGML_TYPE_F32, 1, n_head); |
| 133 | + struct ggml_tensor * B_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 134 | + struct ggml_tensor * C_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 135 | + struct ggml_tensor * ids_gpu = ggml_new_tensor_1d(ctx_gpu, GGML_TYPE_I32, n_seqs); |
| 136 | + |
| 137 | + // Allocate backend buffer for all tensors |
| 138 | + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_gpu, backend); |
| 139 | + if (!buffer) { |
| 140 | + printf("Failed to allocate backend buffer\n"); |
| 141 | + ggml_backend_free(backend); |
| 142 | + ggml_free(ctx_gpu); |
| 143 | + ggml_free(ctx); |
| 144 | + return; |
| 145 | + } |
| 146 | + |
| 147 | + // Copy data to GPU using backend tensor_set |
| 148 | + ggml_backend_tensor_set(s_gpu, s->data, 0, ggml_nbytes(s)); |
| 149 | + ggml_backend_tensor_set(x_gpu, x->data, 0, ggml_nbytes(x)); |
| 150 | + ggml_backend_tensor_set(dt_gpu, dt->data, 0, ggml_nbytes(dt)); |
| 151 | + ggml_backend_tensor_set(A_gpu, A->data, 0, ggml_nbytes(A)); |
| 152 | + ggml_backend_tensor_set(B_gpu, B->data, 0, ggml_nbytes(B)); |
| 153 | + ggml_backend_tensor_set(C_gpu, C->data, 0, ggml_nbytes(C)); |
| 154 | + ggml_backend_tensor_set(ids_gpu, ids->data, 0, ggml_nbytes(ids)); |
| 155 | + |
| 156 | + struct ggml_tensor * out_gpu = ggml_ssm_scan(ctx_gpu, s_gpu, x_gpu, dt_gpu, A_gpu, B_gpu, C_gpu, ids_gpu); |
| 157 | + |
| 158 | + struct ggml_cgraph * gf_gpu = ggml_new_graph(ctx_gpu); |
| 159 | + ggml_build_forward_expand(gf_gpu, out_gpu); |
| 160 | + |
| 161 | + // Allocate a second buffer for tensors created after the first allocation (the ssm_scan output); |
| | + // ggml_backend_alloc_ctx_tensors only allocates tensors that do not yet have a buffer |
| 162 | + ggml_backend_buffer_t output_buffer = ggml_backend_alloc_ctx_tensors(ctx_gpu, backend); |
| 163 | + if (!output_buffer) { |
| 164 | + printf("Failed to allocate output buffer\n"); |
| 165 | + ggml_backend_buffer_free(buffer); |
| 166 | + ggml_backend_free(backend); |
| 167 | + ggml_free(ctx_gpu); |
| 168 | + ggml_free(ctx); |
| 169 | + return; |
| 170 | + } |
| 171 | + |
| 172 | + auto gpu_start = std::chrono::high_resolution_clock::now(); |
| 173 | + ggml_backend_graph_compute(backend, gf_gpu); |
| 174 | + auto gpu_end = std::chrono::high_resolution_clock::now(); |
| 175 | + |
| 176 | + gpu_duration = std::chrono::duration_cast<std::chrono::microseconds>(gpu_end - gpu_start); |
| 177 | + printf("GPU computation time: %.2f ms\n", gpu_duration.count() / 1000.0); |
| 178 | + |
| 179 | + // Copy results back |
| 180 | + std::vector<float> gpu_result(ggml_nelements(out_gpu)); |
| 181 | + ggml_backend_tensor_get(out_gpu, gpu_result.data(), 0, ggml_nbytes(out_gpu)); |
| 182 | + |
| 183 | + printf("GPU output (first 32 values):\n"); |
| 184 | + for (int i = 0; i < std::min(32, (int)ggml_nelements(out_gpu)); i++) { |
| 185 | + printf(" [%d] = %.6f\n", i, gpu_result[i]); |
| 186 | + } |
| 187 | + |
| 188 | + // Compare with detailed analysis |
| 189 | + printf("\n=== Detailed Comparison (CPU vs GPU) ===\n"); |
| 190 | + int errors = 0; |
| 191 | + float max_error = 0.0f; |
| 192 | + float avg_error = 0.0f; |
| 193 | + int total_elements = ggml_nelements(out_cpu); |
| 194 | + |
| 195 | + for (int i = 0; i < total_elements; i++) { |
| 196 | + float cpu_val = ((float*)out_cpu->data)[i]; |
| 197 | + float gpu_val = gpu_result[i]; |
| 198 | + float diff = std::abs(cpu_val - gpu_val); |
| 199 | + avg_error += diff; |
| 200 | + max_error = std::max(max_error, diff); |
| 201 | + |
| 202 | + if (diff > 1e-5) { |
| 203 | + if (errors < 10) { // Show first 10 errors |
| 204 | + // Calculate which timestep, head, and dimension this is |
| 205 | + int elements_per_timestep = n_head * head_dim; |
| 206 | + int elements_per_seq = n_seq_tokens * elements_per_timestep; |
| 207 | + int seq_idx = i / elements_per_seq; |
| 208 | + int remainder = i % elements_per_seq; |
| 209 | + int timestep = remainder / elements_per_timestep; |
| 210 | + int head_dim_idx = remainder % elements_per_timestep; |
| 211 | + int head = head_dim_idx / head_dim; |
| 212 | + int dim = head_dim_idx % head_dim; |
| 213 | + |
| 214 | + printf(" [%d] CPU=%.6f GPU=%.6f DIFF=%.6f (seq=%d, t=%d, h=%d, d=%d)\n", |
| 215 | + i, cpu_val, gpu_val, diff, seq_idx, timestep, head, dim); |
| 216 | + } |
| 217 | + errors++; |
| 218 | + } |
| 219 | + } |
| 220 | + |
| 221 | + avg_error /= total_elements; |
| 222 | + |
| 223 | + printf("Summary:\n"); |
| 224 | + printf(" Total elements: %d\n", total_elements); |
| 225 | + printf(" Errors (>1e-5): %d (%.2f%%)\n", errors, 100.0f * errors / total_elements); |
| 226 | + printf(" Max error: %.9f\n", max_error); |
| 227 | + printf(" Average error: %.9f\n", avg_error); |
| 228 | + |
| 229 | + if (errors == 0) { |
| 230 | + printf(" ✓ PASS: All values match within tolerance!\n"); |
| 231 | + } else if (avg_error < 1e-4) { |
| 232 | + printf(" ⚠ MARGINAL: Small numerical differences\n"); |
| 233 | + } else { |
| 234 | + printf(" ✗ FAIL: Significant numerical errors\n"); |
| 235 | + } |
| 236 | + |
| 237 | + // Performance comparison |
| 238 | + printf("\n=== Performance Comparison ===\n"); |
| 239 | + if (cpu_duration.count() > 0 && gpu_duration.count() > 0) { |
| 240 | + double speedup = (double)cpu_duration.count() / gpu_duration.count(); |
| 241 | + printf(" CPU time: %.2f ms\n", cpu_duration.count() / 1000.0); |
| 242 | + printf(" GPU time: %.2f ms\n", gpu_duration.count() / 1000.0); |
| 243 | + printf(" Speedup: %.2fx %s\n", speedup, speedup > 1.0 ? "(GPU faster)" : "(CPU faster)"); |
| 244 | + |
| 245 | + // Throughput: state-update elements processed per microsecond == millions of elements per second |
| 246 | + double total_ops = (double)(d_state * head_dim * n_head * n_seq_tokens * n_seqs); |
| 247 | + double cpu_meps = total_ops / (double)cpu_duration.count(); |
| 248 | + double gpu_meps = total_ops / (double)gpu_duration.count(); |
| 249 | + printf(" CPU throughput: %.2f million elements/sec\n", cpu_meps); |
| 250 | + printf(" GPU throughput: %.2f million elements/sec\n", gpu_meps); |
| 251 | + } |
| 252 | + |
| 253 | + ggml_backend_buffer_free(output_buffer); |
| 254 | + ggml_backend_buffer_free(buffer); |
| 255 | + ggml_backend_free(backend); |
| 256 | + ggml_free(ctx_gpu); |
| 257 | +#else |
| 258 | + printf("Vulkan not enabled in build\n"); |
| 259 | +#endif |
| 260 | + |
| 261 | + ggml_free(ctx); |
| 262 | +} |
| 263 | + |
| 264 | +// Small test case for debugging SSM scan |
| 265 | +static void test_ssm_scan_small() { |
| 266 | + test_ssm_scan_config(16, 2, 2, 2, 1, "Small test (original)"); |
| 267 | +} |
| 268 | + |
| 269 | +static void test_ssm_scan_performance() { |
| 270 | + printf("=== SSM SCAN PERFORMANCE TESTING ===\n"); |
| 271 | + printf("Testing supported configurations based on CUDA implementation\n"); |
| 272 | + |
| 273 | + // Group 1: Mamba-2 configurations (d_state=128/256) |
| 274 | + printf("\n--- Group 1: Mamba-2 (d_state=128) ---\n"); |
| 275 | + test_ssm_scan_config(128, 16, 8, 16, 1, "Mamba-2 small: d_state=128, head_dim=16"); |
| 276 | + test_ssm_scan_config(128, 32, 16, 32, 1, "Mamba-2 medium: d_state=128, head_dim=32"); |
| 277 | + test_ssm_scan_config(128, 64, 32, 64, 1, "Mamba-2 large: d_state=128, head_dim=64"); |
| 278 | + |
| 279 | + // Group 2: Very large test (user requested) |
| 280 | + printf("\n--- Group 2: Mamba-2 Stress Test ---\n"); |
| 281 | + test_ssm_scan_config(128, 64, 128, 512, 1, "MASSIVE: d_state=128, head_dim=64, n_head=128, n_seq_tokens=512"); |
| 282 | + |
| 283 | + // Group 3: Mamba-2 with d_state=256 |
| 284 | + printf("\n--- Group 3: Mamba-2 (d_state=256) ---\n"); |
| 285 | + test_ssm_scan_config(256, 16, 8, 16, 1, "Mamba-2 d_state=256 small"); |
| 286 | + test_ssm_scan_config(256, 32, 16, 32, 1, "Mamba-2 d_state=256 medium"); |
| 287 | + |
| 288 | + // Group 4: Multiple sequences |
| 289 | + printf("\n--- Group 4: Multiple Sequences ---\n"); |
| 290 | + test_ssm_scan_config(128, 32, 16, 64, 2, "Mamba-2 with 2 sequences"); |
| 291 | + test_ssm_scan_config(128, 32, 16, 64, 4, "Mamba-2 with 4 sequences"); |
| 292 | + |
| 293 | + printf("\n=== ALL PERFORMANCE TESTS COMPLETED ===\n"); |
| 294 | + printf("Note: Only testing Mamba-2 configurations (d_state=128/256) as these are the primary use case\n"); |
| 295 | + printf("Mamba-1 (d_state=16) requires different tensor setup and is less commonly used\n"); |
| 296 | +} |
| 297 | + |
| 298 | +int main() { |
| 299 | + // Run the specifically requested massive case first |
| 300 | + printf("=== FOCUSED LARGE MATRIX TEST ===\n"); |
| 301 | + test_ssm_scan_config(128, 64, 128, 512, 1, "MASSIVE: d_state=128, head_dim=64, n_head=128, n_seq_tokens=512"); |
| 302 | + |
| 303 | + // Also run the full performance suite |
| 304 | + printf("\n"); |
| 305 | + test_ssm_scan_performance(); |
| 306 | + return 0; |
| 307 | +} |
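
As a hand-check for the small configuration above, the recurrence this scan is modelled on can be written out directly. The sketch below is a standalone scalar reference for the Mamba-2 style update, under the assumption that the op applies softplus to `dt` internally and leaves the D skip connection and any gating to the caller; it is not the ggml kernel itself, only a way to reproduce the expected `y` values for the simple inputs used in the test (zero initial state, `x = 0.1, 0.2, ...`, `dt = 1`, `A = -1`, `B = C = 1`).

```cpp
// Scalar reference for a Mamba-2 style selective scan (single sequence, n_group = 1).
// Assumption: the op computes s' = exp(softplus(dt)*A)*s + softplus(dt)*B*x and y = C.s',
// with no D term or gating. Constants mirror test_ssm_scan_small (16, 2, 2, 2, 1).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int d_state = 16, head_dim = 2, n_head = 2, n_tokens = 2;

    // per-(head, dim) state vectors, zero-initialized like the test
    std::vector<float> state(n_head * head_dim * d_state, 0.0f);

    for (int t = 0; t < n_tokens; ++t) {
        for (int h = 0; h < n_head; ++h) {
            const float dt_sp = std::log1p(std::exp(1.0f)); // softplus(dt) with dt = 1.0
            const float dA    = std::exp(dt_sp * -1.0f);    // A = -1.0 per head
            for (int d = 0; d < head_dim; ++d) {
                // x follows the test's flat 0.1, 0.2, 0.3, ... pattern over {head_dim, n_head, n_tokens}
                const float x = 0.1f * (d + head_dim * (h + n_head * t) + 1);
                float * s = &state[(h * head_dim + d) * d_state];
                float y = 0.0f;
                for (int n = 0; n < d_state; ++n) {
                    s[n] = dA * s[n] + dt_sp * x; // B = 1.0
                    y   += s[n];                  // C = 1.0
                }
                printf("y[t=%d, h=%d, d=%d] = %.6f\n", t, h, d, y);
            }
        }
    }
    return 0;
}
```

With these inputs every entry of a given state vector is identical, so each `y` is just `d_state` times a running sum, which makes it easy to localize where the CPU and Vulkan outputs start to diverge.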