@@ -36,6 +36,46 @@ static uint64_t get_time_ns() {
     return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }
 
+bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
+    if (a.pattern != b.pattern) {
+        // C string comparison; either pattern may be null
+        if (a.pattern == nullptr || b.pattern == nullptr) {
+            return false;
+        }
+        if (strcmp(a.pattern, b.pattern) != 0) {
+            return false;
+        }
+    }
+    if (a.buft != b.buft) {
+        return false;
+    }
+    return true;
+}
+
+bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
+    if (a.size() != b.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < a.size(); i++) {
+        if (!tensor_buft_override_equal(a[i], b[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
+    if (a.size() != b.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < a.size(); i++) {
+        if (!vec_tensor_buft_override_equal(a[i], b[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
 template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
     std::ostringstream str;
     for (size_t i = 0; i < values.size(); i++) {
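
A note on the three helpers added above: `llama_model_tensor_buft_override` carries its pattern as a raw C string, so a member-wise comparison would compare pointers rather than pattern text; `tensor_buft_override_equal` therefore falls back to `strcmp` only when the pointers differ and both are non-null. A minimal sketch of the intended semantics (not part of the patch; the pattern text is made up, and the CPU buffer type merely stands in for any `ggml_backend_buffer_type_t`, assuming the usual llama.h / ggml-backend declarations and <cassert>/<string> are in scope):

    std::string pat = "blk\\.0\\.ffn_up";  // same text as the literal below, different storage
    llama_model_tensor_buft_override x { "blk\\.0\\.ffn_up", ggml_backend_cpu_buffer_type() };
    llama_model_tensor_buft_override y { pat.c_str(),        ggml_backend_cpu_buffer_type() };
    assert(x.pattern != y.pattern);            // the raw pointers differ...
    assert(tensor_buft_override_equal(x, y));  // ...but the helper compares pattern text and buft
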
@@ -175,13 +215,13 @@ struct cmd_params {
     std::vector<bool>               no_kv_offload;
     std::vector<bool>               flash_attn;
     std::vector<std::vector<float>> tensor_split;
+    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool>               use_mmap;
     std::vector<bool>               embeddings;
     ggml_numa_strategy              numa;
     int                             reps;
     ggml_sched_priority             prio;
     int                             delay;
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool                            verbose;
     bool                            progress;
     output_formats                  output_format;
@@ -208,13 +248,13 @@ static const cmd_params cmd_params_defaults = {
     /* no_kv_offload         */ { false },
     /* flash_attn            */ { false },
     /* tensor_split          */ { std::vector<float>(llama_max_devices(), 0.0f) },
+    /* tensor_buft_overrides */ {},
     /* use_mmap              */ { true },
     /* embeddings            */ { false },
     /* numa                  */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps                  */ 5,
     /* prio                  */ GGML_SCHED_PRIO_NORMAL,
     /* delay                 */ 0,
-    /* tensor_buft_overrides */ {},
     /* verbose               */ false,
     /* progress              */ false,
     /* output_format         */ MARKDOWN,
@@ -267,10 +307,10 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -embd, --embeddings <0|1>                        (default: %s)\n",
            join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>                 (default: 0)\n");
+    printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
     printf("  -r, --repetitions <n>                            (default: %d)\n", cmd_params_defaults.reps);
     printf("  --prio <0|1|2|3>                                 (default: %d)\n", cmd_params_defaults.prio);
     printf("  --delay <0...N> (seconds)                        (default: %d)\n", cmd_params_defaults.delay);
-    printf("  -ot --override-tensors <tensor name pattern>=<buffer type>,... (default: disabled)\n");
     printf("  -o, --output <csv|json|jsonl|md|sql>             (default: %s)\n",
            output_format_str(cmd_params_defaults.output_format));
     printf("  -oe, --output-err <csv|json|jsonl|md|sql>        (default: %s)\n",
@@ -560,24 +600,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 params.tensor_split.push_back(tensor_split);
             }
-        } else if (arg == "-r" || arg == "--repetitions") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.reps = std::stoi(argv[i]);
-        } else if (arg == "--prio") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
-        } else if (arg == "--delay") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.delay = std::stoi(argv[i]);
         } else if (arg == "-ot" || arg == "--override-tensor") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -595,39 +617,73 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                     }
                 }
             }
-            auto override_span_len = std::strcspn(value, ",");
-            while (override_span_len > 0) {
+            auto override_group_span_len = std::strcspn(value, ",");
+            while (override_group_span_len > 0) {
                 // Stamps null terminators into the argv
                 // value for this option to avoid the
                 // memory leak present in the implementation
-                // over in arg.cpp. Maybe allowable because we
+                // over in arg.cpp. Acceptable because we
                 // only parse these args once in this program.
-                auto override = value;
-                if (value[override_span_len] != '\0') {
-                    value[override_span_len] = '\0';
-                    value = &value[override_span_len + 1];
+                auto override_group = value;
+                if (value[override_group_span_len] != '\0') {
+                    value[override_group_span_len] = '\0';
+                    value = &value[override_group_span_len + 1];
                 } else {
-                    value = &value[override_span_len];
-                }
-                auto tensor_name_span_len = std::strcspn(override, "=");
-                if (tensor_name_span_len >= override_span_len) {
-                    invalid_param = true;
-                    break;
+                    value = &value[override_group_span_len];
                 }
-                override[tensor_name_span_len] = '\0';
-                auto tensor_name = override;
-                auto buffer_type = &override[tensor_name_span_len + 1];
-                if (buft_list.find(buffer_type) == buft_list.end()) {
-                    printf("Available buffer types:\n");
-                    for (const auto & it : buft_list) {
-                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
+                auto override_span_len = std::strcspn(override_group, ";");
+                while (override_span_len > 0) {
+                    auto override = override_group;
+                    if (override_group[override_span_len] != '\0') {
+                        override_group[override_span_len] = '\0';
+                        override_group = &override_group[override_span_len + 1];
+                    } else {
+                        override_group = &override_group[override_span_len];
                     }
-                    invalid_param = true;
+                    auto tensor_name_span_len = std::strcspn(override, "=");
+                    if (tensor_name_span_len >= override_span_len) {
+                        invalid_param = true;
+                        break;
+                    }
+                    override[tensor_name_span_len] = '\0';
+                    auto tensor_name = override;
+                    auto buffer_type = &override[tensor_name_span_len + 1];
+                    if (buft_list.find(buffer_type) == buft_list.end()) {
+                        printf("Available buffer types:\n");
+                        for (const auto & it : buft_list) {
+                            printf("  %s\n", ggml_backend_buft_name(it.second));
+                        }
+                        invalid_param = true;
+                        break;
+                    }
+                    group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
+                    override_span_len = std::strcspn(override_group, ";");
+                }
+                if (invalid_param) {
                     break;
                 }
-                params.tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
-                override_span_len = std::strcspn(value, ",");
+                params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
+                override_group_span_len = std::strcspn(value, ",");
+            }
+        } else if (arg == "-r" || arg == "--repetitions") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
             }
+            params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
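
For reference, the grammar the rewritten loop implements: `,` separates override groups (each group becomes its own benchmark configuration), `;` separates overrides within a group, and `=` splits a tensor-name pattern from a buffer-type name. With a hypothetical `-ot "a=CPU;b=CPU,a=CPU"` (pattern names and the "CPU" buffer-type name are illustrative; valid names are whatever `buft_list` reports), the parsed result is equivalent to:

    // Sketch of params.tensor_buft_overrides for the hypothetical argument above:
    std::vector<std::vector<llama_model_tensor_buft_override>> expected = {
        { { "a", buft_list.at("CPU") }, { "b", buft_list.at("CPU") } },  // group 1: "a=CPU;b=CPU"
        { { "a", buft_list.at("CPU") } },                                // group 2: "a=CPU"
    };
    // A {nullptr, nullptr} terminator is appended to each non-empty group later
    // (see the "Attach terminators" hunk below) before the model params consume it.
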
@@ -701,6 +757,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.tensor_split.empty()) {
         params.tensor_split = cmd_params_defaults.tensor_split;
     }
+    if (params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
+    }
     if (params.use_mmap.empty()) {
         params.use_mmap = cmd_params_defaults.use_mmap;
     }
@@ -721,8 +780,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     }
 
     // Attach terminators to options that require them
-    if (!params.tensor_buft_overrides.empty()) {
-        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    for (auto & tensor_buft_override_list : params.tensor_buft_overrides) {
+        if (!tensor_buft_override_list.empty()) {
+            tensor_buft_override_list.push_back({nullptr, nullptr});
+        }
     }
 
     return params;
@@ -805,7 +866,7 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
                split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
-               tensor_split == other.tensor_split;
+               tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 
     llama_context_params to_llama_cparams() const {
@@ -835,6 +896,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
+    for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
@@ -870,7 +932,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .no_kv_offload= */ nkvo,
             /* .flash_attn   = */ fa,
             /* .tensor_split = */ ts,
-            /* .tensor_buft_overrides = */ params.tensor_buft_overrides,
+            /* .tensor_buft_overrides = */ ot,
             /* .use_mmap     = */ mmp,
             /* .embeddings   = */ embd,
         };
@@ -900,7 +962,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .no_kv_offload= */ nkvo,
             /* .flash_attn   = */ fa,
             /* .tensor_split = */ ts,
-            /* .tensor_buft_overrides = */ params.tensor_buft_overrides,
+            /* .tensor_buft_overrides = */ ot,
             /* .use_mmap     = */ mmp,
             /* .embeddings   = */ embd,
         };
@@ -930,7 +992,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .no_kv_offload= */ nkvo,
             /* .flash_attn   = */ fa,
             /* .tensor_split = */ ts,
-            /* .tensor_buft_overrides = */ params.tensor_buft_overrides,
+            /* .tensor_buft_overrides = */ ot,
             /* .use_mmap     = */ mmp,
             /* .embeddings   = */ embd,
         };
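
Because `ot` now participates in the same nested cross-product as `tensor_split`, `use_mmap`, and the rest, each comma-separated override group multiplies the number of generated test instances:

    // Hypothetical count: 1 model * 2 tensor_split values * 3 override groups ("-ot g1,g2,g3")
    // yields 1 * 2 * 3 = 6 distinct model-parameter combinations; equal_mparams(), which now
    // calls vec_tensor_buft_override_equal(), decides whether consecutive instances can reuse
    // the already-loaded model.
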
@@ -965,6 +1027,7 @@ struct test {
     bool no_kv_offload;
     bool flash_attn;
     std::vector<float> tensor_split;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
     int n_prompt;
@@ -996,6 +1059,7 @@ struct test {
         no_kv_offload = inst.no_kv_offload;
         flash_attn = inst.flash_attn;
         tensor_split = inst.tensor_split;
+        tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
         embeddings = inst.embeddings;
         n_prompt = inst.n_prompt;
@@ -1041,9 +1105,9 @@ struct test {
             "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
             "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
             "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
-            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
-            "embeddings",   "n_prompt",     "n_gen",          "test_time",  "avg_ns",       "stddev_ns",
-            "avg_ts",       "stddev_ts",
+            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
+            "use_mmap",     "embeddings",   "n_prompt",       "n_gen",      "test_time",    "avg_ns",
+            "stddev_ns",    "avg_ts",       "stddev_ts",
         };
         return fields;
     }
@@ -1069,6 +1133,7 @@ struct test {
 
     std::vector<std::string> get_values() const {
         std::string tensor_split_str;
+        std::string tensor_buft_overrides_str;
         int max_nonzero = 0;
         for (size_t i = 0; i < llama_max_devices(); i++) {
             if (tensor_split[i] > 0) {
@@ -1083,6 +1148,19 @@ struct test {
                 tensor_split_str += "/";
             }
         }
+        for (int i = 0; i < (int) tensor_buft_overrides.size() - 1; i++) {
+            // Last element of tensor_buft_overrides is always a null pattern
+            if (tensor_buft_overrides[i].pattern == nullptr) {
+                tensor_buft_overrides_str += "none";
+            } else {
+                tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
+                tensor_buft_overrides_str += "=";
+                tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
+            }
+            if (i + 2 < (int) tensor_buft_overrides.size()) {
+                tensor_buft_overrides_str += ";";
+            }
+        }
         std::vector<std::string> values = { build_commit,
                                             std::to_string(build_number),
                                             cpu_info,
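
A short worked example of the string this loop builds (hypothetical patterns; the format mirrors the `-ot` syntax, with `;` between entries and the trailing `{nullptr, nullptr}` terminator skipped):

    // Given tensor_buft_overrides = { { "blk\\..*",        <CPU buft> },
    //                                 { "output\\.weight", <CPU buft> },
    //                                 { nullptr, nullptr } }
    // the loop yields:
    //   tensor_buft_overrides_str == "blk\\..*=CPU;output\\.weight=CPU"
    // assuming ggml_backend_buft_name() reports the CPU buffer type as "CPU".
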
@@ -1106,6 +1184,7 @@ struct test {
                                             std::to_string(no_kv_offload),
                                             std::to_string(flash_attn),
                                             tensor_split_str,
+                                            tensor_buft_overrides_str,
                                             std::to_string(use_mmap),
                                             std::to_string(embeddings),
                                             std::to_string(n_prompt),
@@ -1323,6 +1402,9 @@ struct markdown_printer : public printer {
         if (field == "tensor_split") {
             return "ts";
         }
+        if (field == "tensor_buft_overrides") {
+            return "ot";
+        }
         return field;
     }
 
@@ -1376,6 +1458,9 @@ struct markdown_printer : public printer {
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }
+        if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
+            fields.emplace_back("tensor_buft_overrides");
+        }
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }