|
| 1 | +#include "ggml.h" |
| 2 | +#include "ggml-backend.h" |
| 3 | +#include "ggml-cpu.h" |
| 4 | +#include <cstdio> |
| 5 | +#include <cstdlib> |
| 6 | +#include <cstring> |
| 7 | +#include <vector> |
| 8 | +#include <cmath> |
| 9 | +#include <chrono> |
| | +#include <algorithm> // std::min, std::max used below |
| 10 | + |
| 11 | +#ifdef GGML_USE_VULKAN |
| 12 | +#include "ggml-vulkan.h" |
| 13 | +#endif |
| 14 | + |
| 15 | +// Test function with configurable parameters |
| 16 | +// Note: d_inner is head_dim in CUDA terminology for Mamba-2 |
| 17 | +static void test_ssm_scan_config(int64_t d_state, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, const char* test_name) { |
| 18 | + std::chrono::microseconds cpu_duration(0), gpu_duration(0); |
| 19 | + printf("\n=== Test: %s ===\n", test_name); |
| 20 | + printf("d_state=%ld, head_dim=%ld, n_head=%ld, n_seq_tokens=%ld, n_seqs=%ld\n", |
| 21 | + (long)d_state, (long)head_dim, (long)n_head, (long)n_seq_tokens, (long)n_seqs); |
| 22 | + const int64_t n_group = 1; |
| 23 | + |
| 24 | + struct ggml_init_params params = { |
| 25 | + /*.mem_size =*/ 256*1024*1024, // Increased for larger tests |
| 26 | + /*.mem_buffer =*/ NULL, |
| 27 | + /*.no_alloc =*/ false, |
| 28 | + }; |
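| | + // no_alloc = false: tensor data is allocated directly inside this context's host memory pool |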
| 29 | + |
| 30 | + struct ggml_context * ctx = ggml_init(params); |
| 31 | + |
| 32 | + // Create input tensors (updated for Mamba-2 configuration) |
| 33 | + struct ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); |
| 34 | + struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); |
| 35 | + struct ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); |
| 36 | + struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_head); // Mamba-2: {1, n_head} |
| 37 | + struct ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 38 | + struct ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 39 | + struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs); |
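| | + // Shape conventions for ggml_ssm_scan in the Mamba-2 case: |
| | + //   s {d_state, head_dim, n_head, n_seqs}, x {head_dim, n_head, n_seq_tokens, n_seqs}, |
| | + //   dt {n_head, n_seq_tokens, n_seqs}, A {1, n_head}, B/C {d_state, n_group, n_seq_tokens, n_seqs} |
| | + // ids selects which slot of s each sequence reads and updates; here it is simply the identity mapping |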
| 40 | + |
| 41 | + // Initialize with simple values for debugging |
| 42 | + // s: initial state (all zeros) |
| 43 | + for (int i = 0; i < ggml_nelements(s); i++) { |
| 44 | + ((float*)s->data)[i] = 0.0f; |
| 45 | + } |
| 46 | + |
| 47 | + // x: input (simple pattern: 0.1, 0.2, 0.3, ...) |
| 48 | + for (int i = 0; i < ggml_nelements(x); i++) { |
| 49 | + ((float*)x->data)[i] = 0.1f * (i + 1); |
| 50 | + } |
| 51 | + |
| 52 | + // dt: timestep (all 1.0 for simplicity) |
| 53 | + for (int i = 0; i < ggml_nelements(dt); i++) { |
| 54 | + ((float*)dt->data)[i] = 1.0f; |
| 55 | + } |
| 56 | + |
| 57 | + // A: decay (all -1.0 for simple exp) |
| 58 | + for (int i = 0; i < ggml_nelements(A); i++) { |
| 59 | + ((float*)A->data)[i] = -1.0f; |
| 60 | + } |
| 61 | + |
| 62 | + // B: input matrix (all 1.0) |
| 63 | + for (int i = 0; i < ggml_nelements(B); i++) { |
| 64 | + ((float*)B->data)[i] = 1.0f; |
| 65 | + } |
| 66 | + |
| 67 | + // C: output matrix (all 1.0) |
| 68 | + for (int i = 0; i < ggml_nelements(C); i++) { |
| 69 | + ((float*)C->data)[i] = 1.0f; |
| 70 | + } |
| 71 | + |
| 72 | + // ids: sequence ids (0, 1, 2, ...) |
| 73 | + for (int i = 0; i < n_seqs; i++) { |
| 74 | + ((int32_t*)ids->data)[i] = i; |
| 75 | + } |
| 76 | + |
| 77 | + printf("=== Input dimensions ===\n"); |
| 78 | + printf("d_state=%ld, head_dim=%ld, n_head=%ld, n_seq_tokens=%ld, n_seqs=%ld\n", |
| 79 | + (long)d_state, (long)head_dim, (long)n_head, (long)n_seq_tokens, (long)n_seqs); |
| 80 | + |
| 81 | + printf("\n=== Input x (first 16 values) ===\n"); |
| 82 | + for (int i = 0; i < std::min(16, (int)ggml_nelements(x)); i++) { |
| 83 | + printf("x[%d] = %.3f\n", i, ((float*)x->data)[i]); |
| 84 | + } |
| 85 | + |
| 86 | + // Run CPU version with timing |
| 87 | + printf("\n=== Running CPU version ===\n"); |
| 88 | + struct ggml_tensor * out_cpu = ggml_ssm_scan(ctx, s, x, dt, A, B, C, ids); |
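| | + // Note: the ssm_scan result is expected to be a flat F32 tensor holding the per-token outputs y |
| | + // (same element count as x) followed by the updated states, so the comparison below also covers |
| | + // the final state; the seq/t/h/d breakdown further down only applies to the y portion |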
| 89 | + |
| 90 | + struct ggml_cgraph * gf = ggml_new_graph(ctx); |
| 91 | + ggml_build_forward_expand(gf, out_cpu); |
| 92 | + |
| 93 | + ggml_backend_t cpu_backend = ggml_backend_cpu_init(); |
| 94 | + |
| 95 | + auto cpu_start = std::chrono::high_resolution_clock::now(); |
| 96 | + ggml_backend_graph_compute(cpu_backend, gf); |
| 97 | + auto cpu_end = std::chrono::high_resolution_clock::now(); |
| 98 | + |
| 99 | + ggml_backend_free(cpu_backend); |
| 100 | + |
| 101 | + cpu_duration = std::chrono::duration_cast<std::chrono::microseconds>(cpu_end - cpu_start); |
| 102 | + printf("CPU computation time: %.2f ms\n", cpu_duration.count() / 1000.0); |
| 103 | + |
| 104 | + printf("CPU output (first 32 values):\n"); |
| 105 | + for (int i = 0; i < std::min(32, (int)ggml_nelements(out_cpu)); i++) { |
| 106 | + printf(" [%d] = %.6f\n", i, ((float*)out_cpu->data)[i]); |
| 107 | + } |
| 108 | + |
| 109 | + // Run GPU version |
| 110 | + printf("\n=== Running GPU version ===\n"); |
| 111 | + |
| 112 | +#ifdef GGML_USE_VULKAN |
| 113 | + ggml_backend_t backend = ggml_backend_vk_init(0); |
| 114 | + if (!backend) { |
| 115 | + printf("Vulkan backend not available\n"); |
| 116 | + ggml_free(ctx); |
| 117 | + return; |
| 118 | + } |
| 119 | + |
| 120 | + // Create new context for GPU tensors with no_alloc=true |
| 121 | + struct ggml_init_params params_gpu = { |
| 122 | + /*.mem_size =*/ 128*1024*1024, |
| 123 | + /*.mem_buffer =*/ NULL, |
| 124 | + /*.no_alloc =*/ true, |
| 125 | + }; |
| 126 | + struct ggml_context * ctx_gpu = ggml_init(params_gpu); |
| 127 | + |
| 128 | + // Create tensor descriptors (no data allocated yet) |
| 129 | + struct ggml_tensor * s_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs); |
| 130 | + struct ggml_tensor * x_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs); |
| 131 | + struct ggml_tensor * dt_gpu = ggml_new_tensor_3d(ctx_gpu, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs); |
| 132 | + struct ggml_tensor * A_gpu = ggml_new_tensor_2d(ctx_gpu, GGML_TYPE_F32, 1, n_head); |
| 133 | + struct ggml_tensor * B_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 134 | + struct ggml_tensor * C_gpu = ggml_new_tensor_4d(ctx_gpu, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs); |
| 135 | + struct ggml_tensor * ids_gpu = ggml_new_tensor_1d(ctx_gpu, GGML_TYPE_I32, n_seqs); |
| 136 | + |
| 137 | + // Allocate backend buffer for all tensors |
| 138 | + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_gpu, backend); |
| 139 | + if (!buffer) { |
| 140 | + printf("Failed to allocate backend buffer\n"); |
| 141 | + ggml_backend_free(backend); |
| 142 | + ggml_free(ctx_gpu); |
| 143 | + ggml_free(ctx); |
| 144 | + return; |
| 145 | + } |
| 146 | + |
| 147 | + // Copy data to GPU using backend tensor_set |
| 148 | + ggml_backend_tensor_set(s_gpu, s->data, 0, ggml_nbytes(s)); |
| 149 | + ggml_backend_tensor_set(x_gpu, x->data, 0, ggml_nbytes(x)); |
| 150 | + ggml_backend_tensor_set(dt_gpu, dt->data, 0, ggml_nbytes(dt)); |
| 151 | + ggml_backend_tensor_set(A_gpu, A->data, 0, ggml_nbytes(A)); |
| 152 | + ggml_backend_tensor_set(B_gpu, B->data, 0, ggml_nbytes(B)); |
| 153 | + ggml_backend_tensor_set(C_gpu, C->data, 0, ggml_nbytes(C)); |
| 154 | + ggml_backend_tensor_set(ids_gpu, ids->data, 0, ggml_nbytes(ids)); |
| 155 | + |
| 156 | + struct ggml_tensor * out_gpu = ggml_ssm_scan(ctx_gpu, s_gpu, x_gpu, dt_gpu, A_gpu, B_gpu, C_gpu, ids_gpu); |
| 157 | + |
| 158 | + struct ggml_cgraph * gf_gpu = ggml_new_graph(ctx_gpu); |
| 159 | + ggml_build_forward_expand(gf_gpu, out_gpu); |
| 160 | + |
| 161 | + // Allocate a second buffer for tensors created after the first allocation (the ssm_scan output); |
| | + // ggml_backend_alloc_ctx_tensors only allocates tensors that do not yet have a buffer |
| 162 | + ggml_backend_buffer_t output_buffer = ggml_backend_alloc_ctx_tensors(ctx_gpu, backend); |
| 163 | + if (!output_buffer) { |
| 164 | + printf("Failed to allocate output buffer\n"); |
| 165 | + ggml_backend_buffer_free(buffer); |
| 166 | + ggml_backend_free(backend); |
| 167 | + ggml_free(ctx_gpu); |
| 168 | + ggml_free(ctx); |
| 169 | + return; |
| 170 | + } |
| 171 | + |
| 172 | + auto gpu_start = std::chrono::high_resolution_clock::now(); |
| 173 | + ggml_backend_graph_compute(backend, gf_gpu); |
| 174 | + auto gpu_end = std::chrono::high_resolution_clock::now(); |
| 175 | + |
| 176 | + gpu_duration = std::chrono::duration_cast<std::chrono::microseconds>(gpu_end - gpu_start); |
| 177 | + printf("GPU computation time: %.2f ms\n", gpu_duration.count() / 1000.0); |
| 178 | + |
| 179 | + // Copy results back |
| 180 | + std::vector<float> gpu_result(ggml_nelements(out_gpu)); |
| 181 | + ggml_backend_tensor_get(out_gpu, gpu_result.data(), 0, ggml_nbytes(out_gpu)); |
| 182 | + |
| 183 | + printf("GPU output (first 32 values):\n"); |
| 184 | + for (int i = 0; i < std::min(32, (int)ggml_nelements(out_gpu)); i++) { |
| 185 | + printf(" [%d] = %.6f\n", i, gpu_result[i]); |
| 186 | + } |
| 187 | + |
| 188 | + // Compare with detailed analysis |
| 189 | + printf("\n=== Detailed Comparison (CPU vs GPU) ===\n"); |
| 190 | + int errors = 0; |
| 191 | + float max_error = 0.0f; |
| 192 | + float avg_error = 0.0f; |
| 193 | + int total_elements = ggml_nelements(out_cpu); |
| 194 | + |
| 195 | + for (int i = 0; i < total_elements; i++) { |
| 196 | + float cpu_val = ((float*)out_cpu->data)[i]; |
| 197 | + float gpu_val = gpu_result[i]; |
| 198 | + float diff = std::abs(cpu_val - gpu_val); |
| 199 | + avg_error += diff; |
| 200 | + max_error = std::max(max_error, diff); |
| 201 | + |
| 202 | + if (diff > 1e-5) { |
| 203 | + if (errors < 10) { // Show first 10 errors |
| 204 | + // Calculate which timestep, head, and dimension this is |
| 205 | + int elements_per_timestep = n_head * head_dim; |
| 206 | + int elements_per_seq = n_seq_tokens * elements_per_timestep; |
| 207 | + int seq_idx = i / elements_per_seq; |
| 208 | + int remainder = i % elements_per_seq; |
| 209 | + int timestep = remainder / elements_per_timestep; |
| 210 | + int head_dim_idx = remainder % elements_per_timestep; |
| 211 | + int head = head_dim_idx / head_dim; |
| 212 | + int dim = head_dim_idx % head_dim; |
| 213 | + |
| 214 | + printf(" [%d] CPU=%.6f GPU=%.6f DIFF=%.6f (seq=%d, t=%d, h=%d, d=%d)\n", |
| 215 | + i, cpu_val, gpu_val, diff, seq_idx, timestep, head, dim); |
| 216 | + } |
| 217 | + errors++; |
| 218 | + } |
| 219 | + } |
| 220 | + |
| 221 | + avg_error /= total_elements; |
| 222 | + |
| 223 | + printf("Summary:\n"); |
| 224 | + printf(" Total elements: %d\n", total_elements); |
| 225 | + printf(" Errors (>1e-5): %d (%.2f%%)\n", errors, 100.0f * errors / total_elements); |
| 226 | + printf(" Max error: %.9f\n", max_error); |
| 227 | + printf(" Average error: %.9f\n", avg_error); |
| 228 | + |
| 229 | + if (errors == 0) { |
| 230 | + printf(" ✓ PASS: All values match within tolerance!\n"); |
| 231 | + } else if (avg_error < 1e-4) { |
| 232 | + printf(" ⚠ MARGINAL: Small numerical differences\n"); |
| 233 | + } else { |
| 234 | + printf(" ✗ FAIL: Significant numerical errors\n"); |
| 235 | + } |
| 236 | + |
| 237 | + // Performance comparison |
| 238 | + printf("\n=== Performance Comparison ===\n"); |
| 239 | + if (cpu_duration.count() > 0 && gpu_duration.count() > 0) { |
| 240 | + double speedup = (double)cpu_duration.count() / gpu_duration.count(); |
| 241 | + printf(" CPU time: %.2f ms\n", cpu_duration.count() / 1000.0); |
| 242 | + printf(" GPU time: %.2f ms\n", gpu_duration.count() / 1000.0); |
| 243 | + printf(" Speedup: %.2fx %s\n", speedup, speedup > 1.0 ? "(GPU faster)" : "(CPU faster)"); |
| 244 | + |
| 245 | + // Throughput: state-update elements processed per microsecond == millions of elements per second |
| 246 | + double total_ops = (double)(d_state * head_dim * n_head * n_seq_tokens * n_seqs); |
| 247 | + double cpu_meps = total_ops / (double)cpu_duration.count(); |
| 248 | + double gpu_meps = total_ops / (double)gpu_duration.count(); |
| 249 | + printf(" CPU throughput: %.2f million elements/sec\n", cpu_meps); |
| 250 | + printf(" GPU throughput: %.2f million elements/sec\n", gpu_meps); |
| 251 | + } |
| 252 | + |
| 253 | + ggml_backend_buffer_free(output_buffer); |
| 254 | + ggml_backend_buffer_free(buffer); |
| 255 | + ggml_backend_free(backend); |
| 256 | + ggml_free(ctx_gpu); |
| 257 | +#else |
| 258 | + printf("Vulkan not enabled in build\n"); |
| 259 | +#endif |
| 260 | + |
| 261 | + ggml_free(ctx); |
| 262 | +} |
| 263 | + |
| 264 | +// Small test case for debugging SSM scan |
| 265 | +static void test_ssm_scan_small() { |
| 266 | + test_ssm_scan_config(16, 2, 2, 2, 1, "Small test (original)"); |
| 267 | +} |
| 268 | + |
| 269 | +static void test_ssm_scan_performance() { |
| 270 | + printf("=== SSM SCAN PERFORMANCE TESTING ===\n"); |
| 271 | + printf("Testing supported configurations based on CUDA implementation\n"); |
| 272 | + |
| 273 | + // Group 1: Mamba-2 configurations (d_state=128/256) |
| 274 | + printf("\n--- Group 1: Mamba-2 (d_state=128) ---\n"); |
| 275 | + test_ssm_scan_config(128, 16, 8, 16, 1, "Mamba-2 small: d_state=128, head_dim=16"); |
| 276 | + test_ssm_scan_config(128, 32, 16, 32, 1, "Mamba-2 medium: d_state=128, head_dim=32"); |
| 277 | + test_ssm_scan_config(128, 64, 32, 64, 1, "Mamba-2 large: d_state=128, head_dim=64"); |
| 278 | + |
| 279 | + // Group 2: Very large test (user requested) |
| 280 | + printf("\n--- Group 2: Mamba-2 Stress Test ---\n"); |
| 281 | + test_ssm_scan_config(128, 64, 128, 512, 1, "MASSIVE: d_state=128, head_dim=64, n_head=128, n_seq_tokens=512"); |
| 282 | + |
| 283 | + // Group 3: Mamba-2 with d_state=256 |
| 284 | + printf("\n--- Group 3: Mamba-2 (d_state=256) ---\n"); |
| 285 | + test_ssm_scan_config(256, 16, 8, 16, 1, "Mamba-2 d_state=256 small"); |
| 286 | + test_ssm_scan_config(256, 32, 16, 32, 1, "Mamba-2 d_state=256 medium"); |
| 287 | + |
| 288 | + // Group 4: Multiple sequences |
| 289 | + printf("\n--- Group 4: Multiple Sequences ---\n"); |
| 290 | + test_ssm_scan_config(128, 32, 16, 64, 2, "Mamba-2 with 2 sequences"); |
| 291 | + test_ssm_scan_config(128, 32, 16, 64, 4, "Mamba-2 with 4 sequences"); |
| 292 | + |
| 293 | + printf("\n=== ALL PERFORMANCE TESTS COMPLETED ===\n"); |
| 294 | + printf("Note: Only testing Mamba-2 configurations (d_state=128/256) as these are the primary use case\n"); |
| 295 | + printf("Mamba-1 (d_state=16) requires different tensor setup and is less commonly used\n"); |
| 296 | +} |
| 297 | + |
| 298 | +int main() { |
| 299 | + // Run the specifically requested massive case first |
| 300 | + printf("=== FOCUSED LARGE MATRIX TEST ===\n"); |
| 301 | + test_ssm_scan_config(128, 64, 128, 512, 1, "MASSIVE: d_state=128, head_dim=64, n_head=128, n_seq_tokens=512"); |
| 302 | + |
| 303 | + // Also run the full performance suite |
| 304 | + printf("\n"); |
| 305 | + test_ssm_scan_performance(); |
| 306 | + return 0; |
| 307 | +} |
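
As a hand-check for the small configuration above, the recurrence this scan is modelled on can be written out directly. The sketch below is a standalone scalar reference for the Mamba-2 style update, under the assumption that the op applies softplus to `dt` internally and leaves the D skip connection and any gating to the caller; it is not the ggml kernel itself, only a way to reproduce the expected `y` values for the simple inputs used in the test (zero initial state, `x = 0.1, 0.2, ...`, `dt = 1`, `A = -1`, `B = C = 1`).

```cpp
// Scalar reference for a Mamba-2 style selective scan (single sequence, n_group = 1).
// Assumption: the op computes s' = exp(softplus(dt)*A)*s + softplus(dt)*B*x and y = C.s',
// with no D term or gating. Constants mirror test_ssm_scan_small (16, 2, 2, 2, 1).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int d_state = 16, head_dim = 2, n_head = 2, n_tokens = 2;

    // per-(head, dim) state vectors, zero-initialized like the test
    std::vector<float> state(n_head * head_dim * d_state, 0.0f);

    for (int t = 0; t < n_tokens; ++t) {
        for (int h = 0; h < n_head; ++h) {
            const float dt_sp = std::log1p(std::exp(1.0f)); // softplus(dt) with dt = 1.0
            const float dA    = std::exp(dt_sp * -1.0f);    // A = -1.0 per head
            for (int d = 0; d < head_dim; ++d) {
                // x follows the test's flat 0.1, 0.2, 0.3, ... pattern over {head_dim, n_head, n_tokens}
                const float x = 0.1f * (d + head_dim * (h + n_head * t) + 1);
                float * s = &state[(h * head_dim + d) * d_state];
                float y = 0.0f;
                for (int n = 0; n < d_state; ++n) {
                    s[n] = dA * s[n] + dt_sp * x; // B = 1.0
                    y   += s[n];                  // C = 1.0
                }
                printf("y[t=%d, h=%d, d=%d] = %.6f\n", t, h, d, y);
            }
        }
    }
    return 0;
}
```

With these inputs every entry of a given state vector is identical, so each `y` is just `d_state` times a running sum, which makes it easy to localize where the CPU and Vulkan outputs start to diverge.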