@@ -1160,6 +1160,9 @@ struct test_case {
 
     std::vector<ggml_tensor *> sentinels;
 
+    // Track weight tensors for separate buffer allocation with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+    std::vector<ggml_tensor *> weight_tensors;
+
     std::string current_op_name;
 
     void add_sentinel(ggml_context * ctx) {
@@ -1238,6 +1241,8 @@ struct test_case {
                         const char * op_names_filter,
                         printer * output_printer) {
         mode = MODE_TEST;
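+        // build_graph() repopulates these lists, so clear entries left over from a previous run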
+        weight_tensors.clear();
+        sentinels.clear();
12411246
12421247 ggml_init_params params = {
12431248 /* .mem_size = */ ggml_tensor_overhead ()*128 + ggml_graph_overhead (),
@@ -1288,10 +1293,35 @@ struct test_case {
12881293 // post-graph sentinel
12891294 add_sentinel (ctx);
12901295
1291- // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_t weights_buf = nullptr;
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend1);
+            const size_t align = ggml_backend_buft_get_alignment(buft);
+            // calculate the total size needed for the weight tensors, padding each tensor
+            // to the buffer alignment since ggml_tallocr_alloc() aligns every allocation
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), align);
+            }
+
+            weights_buf = ggml_backend_buft_alloc_buffer(buft, weight_size);
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors [%s] ", ggml_backend_name(backend1));
+                ggml_free(ctx);
+                return test_status_t::FAIL;
+            }
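+            // set the usage before any tensor data is written, so that backends which
+            // special-case weight buffers see the flag from the start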
+            ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf);
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
 
-        if (buf == NULL) {
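+        // ggml_backend_alloc_ctx_tensors() returns NULL when every tensor in the context
+        // is already allocated, which is expected here if all inputs went into weights_buf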
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
             ggml_free(ctx);
             return test_status_t::FAIL;
@@ -1385,6 +1415,9 @@ struct test_case {
 
         const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);
 
+        if (weights_buf) {
+            ggml_backend_buffer_free(weights_buf);
+        }
         ggml_backend_buffer_free(buf);
 
         ggml_free(ctx);
@@ -1404,6 +1437,7 @@ struct test_case {
 
     bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_PERF;
+        weight_tensors.clear();
 
         static const size_t graph_nodes = 8192;
 
@@ -1432,10 +1466,34 @@ struct test_case {
             return true;
         }
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
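+        // the deleter of ggml_backend_buffer_ptr calls ggml_backend_buffer_free(), so the
+        // buffer is released automatically on the early-return paths below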
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            const size_t align = ggml_backend_buft_get_alignment(buft);
+            // total size needed for the weight tensors (padded per tensor, as in eval())
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), align);
+            }
+
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors\n");
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
 
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors\n");
             return false;
         }
@@ -1534,6 +1592,7 @@ struct test_case {
 
     bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_SUPPORT;
+        weight_tensors.clear();
 
         static const size_t graph_nodes = 8192;
 
@@ -1569,6 +1628,7 @@ struct test_case {
 
     bool eval_grad(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_GRAD;
+        weight_tensors.clear();
         const std::vector<float> expect = grad_expect();
 
         ggml_init_params params = {
@@ -1679,9 +1739,35 @@ struct test_case {
             return true;
        }
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            const size_t align = ggml_backend_buft_get_alignment(buft);
+            // total size needed for the weight tensors (padded per tensor, as in eval())
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), align);
+            }
+
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
+                info.set_error("weight allocation", "");
+                output_printer->print_operation(info);
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
             info.set_error("allocation", "");
             output_printer->print_operation(info);
@@ -3606,6 +3692,7 @@ struct test_mul_mat : public test_case {
 
             a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
             b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
                     ggml_set_param(a);
@@ -3623,6 +3710,7 @@ struct test_mul_mat : public test_case {
             const int64_t k_physical = k_v == 0 ? k : k_v;
             a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
             b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
@@ -3716,6 +3804,7 @@ struct test_mul_mat_id : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3776,6 +3865,7 @@ struct test_mul_mat_id_fusion : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3792,6 +3882,7 @@ struct test_mul_mat_id_fusion : public test_case {
 
         for (uint32_t i = 1; i < o; ++i) {
             ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            weight_tensors.push_back(a2); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
             ggml_set_name(out2, "out2");
             out = ggml_add(ctx, out, out2);
@@ -7861,9 +7952,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
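+    // additional shapes: non-square right-hand sides, plus widths one below/above
+    // the system size (79/81 around 80) to cover boundary handling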
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
 
     for (bool v : {false, true}) {
         for (bool circular : {false, true}) {
@@ -8064,12 +8170,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
 
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
     // qwen3next with CHUNK_SIZE 64
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
     // qwen3next with CHUNK_SIZE 128
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));
 
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));