@@ -1160,6 +1160,9 @@ struct test_case {
 
     std::vector<ggml_tensor *> sentinels;
 
+    // Track weight tensors for separate buffer allocation with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+    std::vector<ggml_tensor *> weight_tensors;
+
     std::string current_op_name;
 
     void add_sentinel(ggml_context * ctx) {
@@ -1238,6 +1241,8 @@ struct test_case {
                         const char * op_names_filter,
                         printer * output_printer) {
         mode = MODE_TEST;
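+        // build_graph() repopulates these lists, so clear entries left over from a previous run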
+        weight_tensors.clear();
+        sentinels.clear();
12411246
12421247 ggml_init_params params = {
12431248 /* .mem_size = */ ggml_tensor_overhead ()*128 + ggml_graph_overhead (),
@@ -1288,10 +1293,35 @@ struct test_case {
12881293 // post-graph sentinel
12891294 add_sentinel (ctx);
12901295
1291- // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_t weights_buf = nullptr;
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend1);
+            const size_t align = ggml_backend_buft_get_alignment(buft);
+            // calculate the total size needed for the weight tensors, padding each tensor
+            // to the buffer alignment since ggml_tallocr_alloc() aligns every allocation
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), align);
+            }
+
+            weights_buf = ggml_backend_buft_alloc_buffer(buft, weight_size);
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors [%s] ", ggml_backend_name(backend1));
+                ggml_free(ctx);
+                return test_status_t::FAIL;
+            }
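+            // set the usage before any tensor data is written, so that backends which
+            // special-case weight buffers see the flag from the start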
+            ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf);
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
 
-        if (buf == NULL) {
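+        // ggml_backend_alloc_ctx_tensors() returns NULL when every tensor in the context
+        // is already allocated, which is expected here if all inputs went into weights_buf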
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
             ggml_free(ctx);
             return test_status_t::FAIL;
@@ -1385,6 +1415,9 @@ struct test_case {
 
         const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);
 
+        if (weights_buf) {
+            ggml_backend_buffer_free(weights_buf);
+        }
         ggml_backend_buffer_free(buf);
 
         ggml_free(ctx);
@@ -1404,6 +1437,7 @@ struct test_case {
 
     bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_PERF;
+        weight_tensors.clear();
 
         static const size_t graph_nodes = 8192;
 
@@ -1432,10 +1466,34 @@ struct test_case {
             return true;
         }
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
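+        // the deleter of ggml_backend_buffer_ptr calls ggml_backend_buffer_free(), so the
+        // buffer is released automatically on the early-return paths below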
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            const size_t align = ggml_backend_buft_get_alignment(buft);
+            // total size needed for the weight tensors (padded per tensor, as in eval())
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), align);
+            }
+
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors\n");
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
 
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors\n");
             return false;
         }
@@ -1534,6 +1592,7 @@ struct test_case {
 
     bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_SUPPORT;
+        weight_tensors.clear();
 
         static const size_t graph_nodes = 8192;
 
@@ -1569,6 +1628,7 @@ struct test_case {
 
     bool eval_grad(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_GRAD;
+        weight_tensors.clear();
         const std::vector<float> expect = grad_expect();
 
         ggml_init_params params = {
@@ -1679,9 +1739,35 @@ struct test_case {
             return true;
        }
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            const size_t align = ggml_backend_buft_get_alignment(buft);
+            // total size needed for the weight tensors (padded per tensor, as in eval())
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), align);
+            }
+
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
+                info.set_error("weight allocation", "");
+                output_printer->print_operation(info);
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
             info.set_error("allocation", "");
             output_printer->print_operation(info);
@@ -3606,6 +3692,7 @@ struct test_mul_mat : public test_case {
 
             a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
             b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
                     ggml_set_param(a);
@@ -3623,6 +3710,7 @@ struct test_mul_mat : public test_case {
             const int64_t k_physical = k_v == 0 ? k : k_v;
             a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
             b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
@@ -3716,6 +3804,7 @@ struct test_mul_mat_id : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3776,6 +3865,7 @@ struct test_mul_mat_id_fusion : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3792,6 +3882,7 @@ struct test_mul_mat_id_fusion : public test_case {
 
         for (uint32_t i = 1; i < o; ++i) {
             ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            weight_tensors.push_back(a2); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
             ggml_set_name(out2, "out2");
             out = ggml_add(ctx, out, out2);
@@ -7861,9 +7952,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
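+    // additional shapes: non-square right-hand sides, plus widths one below/above
+    // the system size (79/81 around 80) to cover boundary handling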
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
 
     for (bool v : {false, true}) {
         for (bool circular : {false, true}) {
@@ -8064,12 +8170,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
 
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
     // qwen3next with CHUNK_SIZE 64
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
     // qwen3next with CHUNK_SIZE 128
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));
 
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));