@@ -219,6 +219,7 @@ struct cmd_params {
219219    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
220220    std::vector<bool >                use_mmap;
221221    std::vector<bool >                embeddings;
222+     std::vector<bool >                no_op_offload;
222223    ggml_numa_strategy               numa;
223224    int                               reps;
224225    ggml_sched_priority              prio;
@@ -253,6 +254,7 @@ static const cmd_params cmd_params_defaults = {
253254    /*  tensor_buft_overrides*/   { std::vector<llama_model_tensor_buft_override>{{nullptr ,nullptr }} },
254255    /*  use_mmap             */   { true  },
255256    /*  embeddings           */   { false  },
257+     /*  no_op_offload        */   { false  },
256258    /*  numa                 */   GGML_NUMA_STRATEGY_DISABLED,
257259    /*  reps                 */   5 ,
258260    /*  prio                 */   GGML_SCHED_PRIO_NORMAL,
@@ -311,6 +313,7 @@ static void print_usage(int /* argc */, char ** argv) {
311313           join (cmd_params_defaults.embeddings , " ,"  ).c_str ());
312314    printf ("   -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n "  );
313315    printf ("   -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n "  );
316+     printf ("   -nopo, --no-op-offload <i>                (default: 0)\n "  );
314317    printf ("   -r, --repetitions <n>                     (default: %d)\n "  , cmd_params_defaults.reps );
315318    printf ("   --prio <0|1|2|3>                          (default: %d)\n "  , cmd_params_defaults.prio );
316319    printf ("   --delay <0...N> (seconds)                 (default: %d)\n "  , cmd_params_defaults.delay );
@@ -588,6 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
588591            }
589592            auto  p = string_split<bool >(argv[i], split_delim);
590593            params.embeddings .insert (params.embeddings .end (), p.begin (), p.end ());
594+         } else  if  (arg == " -nopo"   || arg == " --no-op-offload"  ) {
595+             if  (++i >= argc) {
596+                 invalid_param = true ;
597+                 break ;
598+             }
599+             auto  p = string_split<bool >(argv[i], split_delim);
600+             params.no_op_offload .insert (params.no_op_offload .end (), p.begin (), p.end ());
591601        } else  if  (arg == " -ts"   || arg == " --tensor-split"  ) {
592602            if  (++i >= argc) {
593603                invalid_param = true ;
@@ -794,6 +804,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
794804    if  (params.embeddings .empty ()) {
795805        params.embeddings  = cmd_params_defaults.embeddings ;
796806    }
807+     if  (params.no_op_offload .empty ()) {
808+         params.no_op_offload  = cmd_params_defaults.no_op_offload ;
809+     }
797810    if  (params.n_threads .empty ()) {
798811        params.n_threads  = cmd_params_defaults.n_threads ;
799812    }
@@ -833,6 +846,7 @@ struct cmd_params_instance {
833846    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
834847    bool                use_mmap;
835848    bool                embeddings;
849+     bool                no_op_offload;
836850
837851    llama_model_params to_llama_mparams () const  {
838852        llama_model_params mparams = llama_model_default_params ();
@@ -902,6 +916,7 @@ struct cmd_params_instance {
902916        cparams.offload_kqv  = !no_kv_offload;
903917        cparams.flash_attn   = flash_attn;
904918        cparams.embeddings   = embeddings;
919+         cparams.op_offload   = !no_op_offload;
905920
906921        return  cparams;
907922    }
@@ -921,6 +936,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
921936    for  (const  auto  & ot : params.tensor_buft_overrides )
922937    for  (const  auto  & mmp : params.use_mmap )
923938    for  (const  auto  & embd : params.embeddings )
939+     for  (const  auto  & nopo : params.no_op_offload )
924940    for  (const  auto  & nb : params.n_batch )
925941    for  (const  auto  & nub : params.n_ubatch )
926942    for  (const  auto  & tk : params.type_k )
@@ -959,6 +975,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
959975                /*  .tensor_buft_overrides = */   ot,
960976                /*  .use_mmap     = */   mmp,
961977                /*  .embeddings   = */   embd,
978+                 /*  .no_op_offload= */   nopo,
962979            };
963980            instances.push_back (instance);
964981        }
@@ -990,6 +1007,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
9901007                /*  .tensor_buft_overrides = */   ot,
9911008                /*  .use_mmap     = */   mmp,
9921009                /*  .embeddings   = */   embd,
1010+                 /*  .no_op_offload= */   nopo,
9931011            };
9941012            instances.push_back (instance);
9951013        }
@@ -1021,6 +1039,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10211039                /*  .tensor_buft_overrides = */   ot,
10221040                /*  .use_mmap     = */   mmp,
10231041                /*  .embeddings   = */   embd,
1042+                 /*  .no_op_offload= */   nopo,
10241043            };
10251044            instances.push_back (instance);
10261045        }
@@ -1056,6 +1075,7 @@ struct test {
10561075    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
10571076    bool                      use_mmap;
10581077    bool                      embeddings;
1078+     bool                      no_op_offload;
10591079    int                       n_prompt;
10601080    int                       n_gen;
10611081    int                       n_depth;
@@ -1089,6 +1109,7 @@ struct test {
10891109        tensor_buft_overrides = inst.tensor_buft_overrides ;
10901110        use_mmap       = inst.use_mmap ;
10911111        embeddings     = inst.embeddings ;
1112+         no_op_offload  = inst.no_op_offload ;
10921113        n_prompt       = inst.n_prompt ;
10931114        n_gen          = inst.n_gen ;
10941115        n_depth        = inst.n_depth ;
@@ -1134,7 +1155,7 @@ struct test {
11341155            " model_type"  ,   " model_size"  ,   " model_n_params"  , " n_batch"  ,    " n_ubatch"  ,     " n_threads"  ,
11351156            " cpu_mask"  ,     " cpu_strict"  ,   " poll"  ,           " type_k"  ,     " type_v"  ,       " n_gpu_layers"  ,
11361157            " split_mode"  ,   " main_gpu"  ,     " no_kv_offload"  ,  " flash_attn"  , " tensor_split"  , " tensor_buft_overrides"  ,
1137-             " use_mmap"  ,     " embeddings"  ,   " n_prompt"  ,       " n_gen"  ,      " n_depth"  ,      " test_time"  ,
1158+             " use_mmap"  ,     " embeddings"  ,   " no_op_offload " ,    " n_prompt"  ,       " n_gen"  ,      " n_depth"  ,      " test_time"  ,
11381159            " avg_ns"  ,       " stddev_ns"  ,    " avg_ts"  ,         " stddev_ts"  ,
11391160        };
11401161        return  fields;
@@ -1146,7 +1167,7 @@ struct test {
11461167        if  (field == " build_number"   || field == " n_batch"   || field == " n_ubatch"   || field == " n_threads"   ||
11471168            field == " poll"   || field == " model_size"   || field == " model_n_params"   || field == " n_gpu_layers"   ||
11481169            field == " main_gpu"   || field == " n_prompt"   || field == " n_gen"   || field == " n_depth"   ||
1149-             field == " avg_ns"   || field == " stddev_ns"  ) {
1170+             field == " avg_ns"   || field == " stddev_ns"  || field ==  " no_op_offload "  ) {
11501171            return  INT;
11511172        }
11521173        if  (field == " f16_kv"   || field == " no_kv_offload"   || field == " cpu_strict"   || field == " flash_attn"   ||
@@ -1222,6 +1243,7 @@ struct test {
12221243                                            tensor_buft_overrides_str,
12231244                                            std::to_string (use_mmap),
12241245                                            std::to_string (embeddings),
1246+                                             std::to_string (no_op_offload),
12251247                                            std::to_string (n_prompt),
12261248                                            std::to_string (n_gen),
12271249                                            std::to_string (n_depth),
@@ -1404,6 +1426,9 @@ struct markdown_printer : public printer {
14041426        if  (field == " test"  ) {
14051427            return  15 ;
14061428        }
1429+         if  (field == " no_op_offload"  ) {
1430+             return  4 ;
1431+         }
14071432
14081433        int  width = std::max ((int ) field.length (), 10 );
14091434
@@ -1435,6 +1460,9 @@ struct markdown_printer : public printer {
14351460        if  (field == " embeddings"  ) {
14361461            return  " embd"  ;
14371462        }
1463+         if  (field == " no_op_offload"  ) {
1464+             return  " nopo"  ;
1465+         }
14381466        if  (field == " tensor_split"  ) {
14391467            return  " ts"  ;
14401468        }
@@ -1503,6 +1531,9 @@ struct markdown_printer : public printer {
15031531        if  (params.embeddings .size () > 1  || params.embeddings  != cmd_params_defaults.embeddings ) {
15041532            fields.emplace_back (" embeddings"  );
15051533        }
1534+         if  (params.no_op_offload .size () > 1  || params.no_op_offload  != cmd_params_defaults.no_op_offload ) {
1535+             fields.emplace_back (" no_op_offload"  );
1536+         }
15061537        fields.emplace_back (" test"  );
15071538        fields.emplace_back (" t/s"  );
15081539
0 commit comments