@@ -219,6 +219,7 @@ struct cmd_params {
219219 std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
220220 std::vector<bool > use_mmap;
221221 std::vector<bool > embeddings;
222+ std::vector<bool > disable_op_offload;
222223 ggml_numa_strategy numa;
223224 int reps;
224225 ggml_sched_priority prio;
@@ -253,6 +254,7 @@ static const cmd_params cmd_params_defaults = {
253254 /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr ,nullptr }} },
254255 /* use_mmap */ { true },
255256 /* embeddings */ { false },
257+ /* disable_op_offload */ { false },
256258 /* numa */ GGML_NUMA_STRATEGY_DISABLED,
257259 /* reps */ 5 ,
258260 /* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -311,6 +313,7 @@ static void print_usage(int /* argc */, char ** argv) {
311313 join (cmd_params_defaults.embeddings , " ," ).c_str ());
312314 printf (" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n " );
313315 printf (" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n " );
316+ printf (" -dopo, --disable-op-offload <i> (default: 0)\n " );
314317 printf (" -r, --repetitions <n> (default: %d)\n " , cmd_params_defaults.reps );
315318 printf (" --prio <0|1|2|3> (default: %d)\n " , cmd_params_defaults.prio );
316319 printf (" --delay <0...N> (seconds) (default: %d)\n " , cmd_params_defaults.delay );
@@ -588,6 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
588591 }
589592 auto p = string_split<bool >(argv[i], split_delim);
590593 params.embeddings .insert (params.embeddings .end (), p.begin (), p.end ());
594+ } else if (arg == " -dopo" || arg == " --disable-op-offload" ) {
595+ if (++i >= argc) {
596+ invalid_param = true ;
597+ break ;
598+ }
599+ auto p = string_split<bool >(argv[i], split_delim);
600+ params.disable_op_offload .insert (params.disable_op_offload .end (), p.begin (), p.end ());
591601 } else if (arg == " -ts" || arg == " --tensor-split" ) {
592602 if (++i >= argc) {
593603 invalid_param = true ;
@@ -794,6 +804,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
794804 if (params.embeddings .empty ()) {
795805 params.embeddings = cmd_params_defaults.embeddings ;
796806 }
807+ if (params.disable_op_offload .empty ()) {
808+ params.disable_op_offload = cmd_params_defaults.disable_op_offload ;
809+ }
797810 if (params.n_threads .empty ()) {
798811 params.n_threads = cmd_params_defaults.n_threads ;
799812 }
@@ -833,6 +846,7 @@ struct cmd_params_instance {
833846 std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
834847 bool use_mmap;
835848 bool embeddings;
849+ bool disable_op_offload;
836850
837851 llama_model_params to_llama_mparams () const {
838852 llama_model_params mparams = llama_model_default_params ();
@@ -894,14 +908,15 @@ struct cmd_params_instance {
// to_llama_cparams: builds the llama_context_params for this benchmark instance.
// Starts from llama_context_default_params() and then overrides individual fields
// with this instance's settings:
//   - n_ctx is sized as n_prompt + n_gen + n_depth
//   - offload_kqv is the negation of no_kv_offload
//   - the remaining fields are copied through unchanged
// In this hunk the '-' lines are the previous assignments; the '+' lines re-emit
// them with wider alignment and add the new cparams.disable_op_offload assignment.
894908 llama_context_params to_llama_cparams () const {
895909 llama_context_params cparams = llama_context_default_params ();
896910
897- cparams.n_ctx = n_prompt + n_gen + n_depth;
898- cparams.n_batch = n_batch;
899- cparams.n_ubatch = n_ubatch;
900- cparams.type_k = type_k;
901- cparams.type_v = type_v;
902- cparams.offload_kqv = !no_kv_offload;
903- cparams.flash_attn = flash_attn;
904- cparams.embeddings = embeddings;
911+ cparams.n_ctx = n_prompt + n_gen + n_depth;
912+ cparams.n_batch = n_batch;
913+ cparams.n_ubatch = n_ubatch;
914+ cparams.type_k = type_k;
915+ cparams.type_v = type_v;
916+ cparams.offload_kqv = !no_kv_offload;
917+ cparams.flash_attn = flash_attn;
918+ cparams.embeddings = embeddings;
919+ cparams.disable_op_offload = disable_op_offload;
905920
906921 return cparams;
907922 }
@@ -921,6 +936,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
921936 for (const auto & ot : params.tensor_buft_overrides )
922937 for (const auto & mmp : params.use_mmap )
923938 for (const auto & embd : params.embeddings )
939+ for (const auto & dopo : params.disable_op_offload )
924940 for (const auto & nb : params.n_batch )
925941 for (const auto & nub : params.n_ubatch )
926942 for (const auto & tk : params.type_k )
@@ -959,6 +975,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
959975 /* .tensor_buft_overrides = */ ot,
960976 /* .use_mmap = */ mmp,
961977 /* .embeddings = */ embd,
978+ /* .disable_op_offload= */ dopo,
962979 };
963980 instances.push_back (instance);
964981 }
@@ -990,6 +1007,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
9901007 /* .tensor_buft_overrides = */ ot,
9911008 /* .use_mmap = */ mmp,
9921009 /* .embeddings = */ embd,
1010+ /* .disable_op_offload= */ dopo,
9931011 };
9941012 instances.push_back (instance);
9951013 }
@@ -1021,6 +1039,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10211039 /* .tensor_buft_overrides = */ ot,
10221040 /* .use_mmap = */ mmp,
10231041 /* .embeddings = */ embd,
1042+ /* .disable_op_offload= */ dopo,
10241043 };
10251044 instances.push_back (instance);
10261045 }
@@ -1056,6 +1075,7 @@ struct test {
10561075 std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
10571076 bool use_mmap;
10581077 bool embeddings;
1078+ bool disable_op_offload;
10591079 int n_prompt;
10601080 int n_gen;
10611081 int n_depth;
@@ -1089,6 +1109,7 @@ struct test {
10891109 tensor_buft_overrides = inst.tensor_buft_overrides ;
10901110 use_mmap = inst.use_mmap ;
10911111 embeddings = inst.embeddings ;
1112+ disable_op_offload = inst.disable_op_offload ;
10921113 n_prompt = inst.n_prompt ;
10931114 n_gen = inst.n_gen ;
10941115 n_depth = inst.n_depth ;
@@ -1134,7 +1155,7 @@ struct test {
11341155 " model_type" , " model_size" , " model_n_params" , " n_batch" , " n_ubatch" , " n_threads" ,
11351156 " cpu_mask" , " cpu_strict" , " poll" , " type_k" , " type_v" , " n_gpu_layers" ,
11361157 " split_mode" , " main_gpu" , " no_kv_offload" , " flash_attn" , " tensor_split" , " tensor_buft_overrides" ,
1137- " use_mmap" , " embeddings" , " n_prompt" , " n_gen" , " n_depth" , " test_time" ,
1158+ " use_mmap" , " embeddings" , " disable_op_offload " , " n_prompt" , " n_gen" , " n_depth" , " test_time" ,
11381159 " avg_ns" , " stddev_ns" , " avg_ts" , " stddev_ts" ,
11391160 };
11401161 return fields;
@@ -1146,7 +1167,7 @@ struct test {
11461167 if (field == " build_number" || field == " n_batch" || field == " n_ubatch" || field == " n_threads" ||
11471168 field == " poll" || field == " model_size" || field == " model_n_params" || field == " n_gpu_layers" ||
11481169 field == " main_gpu" || field == " n_prompt" || field == " n_gen" || field == " n_depth" ||
1149- field == " avg_ns" || field == " stddev_ns" ) {
1170+ field == " avg_ns" || field == " stddev_ns" || field == " disable_op_offload " ) {
11501171 return INT;
11511172 }
11521173 if (field == " f16_kv" || field == " no_kv_offload" || field == " cpu_strict" || field == " flash_attn" ||
@@ -1222,6 +1243,7 @@ struct test {
12221243 tensor_buft_overrides_str,
12231244 std::to_string (use_mmap),
12241245 std::to_string (embeddings),
1246+ std::to_string (disable_op_offload),
12251247 std::to_string (n_prompt),
12261248 std::to_string (n_gen),
12271249 std::to_string (n_depth),
@@ -1404,6 +1426,9 @@ struct markdown_printer : public printer {
14041426 if (field == " test" ) {
14051427 return 15 ;
14061428 }
1429+ if (field == " disable_op_offload" ) {
1430+ return 4 ;
1431+ }
14071432
14081433 int width = std::max ((int ) field.length (), 10 );
14091434
@@ -1435,6 +1460,9 @@ struct markdown_printer : public printer {
14351460 if (field == " embeddings" ) {
14361461 return " embd" ;
14371462 }
1463+ if (field == " disable_op_offload" ) {
1464+ return " dopo" ;
1465+ }
14381466 if (field == " tensor_split" ) {
14391467 return " ts" ;
14401468 }
@@ -1503,6 +1531,9 @@ struct markdown_printer : public printer {
15031531 if (params.embeddings .size () > 1 || params.embeddings != cmd_params_defaults.embeddings ) {
15041532 fields.emplace_back (" embeddings" );
15051533 }
1534+ if (params.disable_op_offload .size () > 1 || params.disable_op_offload != cmd_params_defaults.disable_op_offload ) {
1535+ fields.emplace_back (" disable_op_offload" );
1536+ }
15061537 fields.emplace_back (" test" );
15071538 fields.emplace_back (" t/s" );
15081539
0 commit comments