@@ -135,6 +135,86 @@ static std::string get_gpu_info() {
     return join(gpu_list, ", ");
 }
 
+static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    std::string trimmed = string_strip(value);
+    if (trimmed.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (trimmed == "auto") {
+        return devices;
+    }
+
+    auto dev_names = string_split<std::string>(trimmed, '/');
+    if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
+        devices.push_back(nullptr);
+        return devices;
+    }
+
+    for (auto & name : dev_names) {
+        std::string dev_name = string_strip(name);
+        if (dev_name.empty()) {
+            throw std::invalid_argument("invalid device specification");
+        }
+        auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
+        if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
+        }
+        devices.push_back(dev);
+    }
+
+    devices.push_back(nullptr);
+    return devices;
+}
+
+[[noreturn]] static void print_available_devices_and_exit() {
+    std::vector<ggml_backend_dev_t> devices;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto   ty  = ggml_backend_dev_type(dev);
+        if (ty == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            continue;
+        }
+        devices.push_back(dev);
+    }
+
+    printf("Available devices:\n");
+    if (devices.empty()) {
+        printf("  (none)\n");
+    }
+    for (auto * dev : devices) {
+        size_t free  = 0;
+        size_t total = 0;
+        ggml_backend_dev_memory(dev, &free, &total);
+        printf("  %s: %s (%zu MiB, %zu MiB free)\n",
+               ggml_backend_dev_name(dev),
+               ggml_backend_dev_description(dev),
+               total / 1024 / 1024,
+               free / 1024 / 1024);
+    }
+    exit(0);
+}
+
+static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
+    if (devices.empty()) {
+        return "auto";
+    }
+
+    if (devices.size() == 1 && devices[0] == nullptr) {
+        return "none";
+    }
+
+    std::vector<std::string> names;
+    for (auto * dev : devices) {
+        if (dev == nullptr) {
+            break;
+        }
+        names.push_back(ggml_backend_dev_name(dev));
+    }
+
+    return join(names, "/");
+}
+
 // command line params
 enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
 
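
The three helpers above agree on one representation for a device selection: an empty vector means `auto` (let the backends decide), a vector holding a single `nullptr` means `none`, and an explicit selection is a `nullptr`-terminated list of device handles. The sketch below is not part of the patch; it mirrors that round trip with `const char *` standing in for `ggml_backend_dev_t` and made-up device names, so it compiles without ggml.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// empty vector        <-> "auto"  (backend picks the devices)
// { nullptr }         <-> "none"  (no GPU devices)
// { d0, d1, nullptr } <-> "d0/d1" (explicit, nullptr-terminated list)
static std::string to_string(const std::vector<const char *> & devices) {
    if (devices.empty()) {
        return "auto";
    }
    if (devices.size() == 1 && devices[0] == nullptr) {
        return "none";
    }
    std::string out;
    for (const char * dev : devices) {
        if (dev == nullptr) {
            break; // the trailing nullptr is a terminator, not a device
        }
        if (!out.empty()) {
            out += "/";
        }
        out += dev;
    }
    return out;
}

int main() {
    std::vector<std::vector<const char *>> cases = {
        {},                          // auto
        { nullptr },                 // none
        { "GPU0", "GPU1", nullptr }, // explicit list (illustrative names)
    };
    for (const auto & devices : cases) {
        std::printf("%s\n", to_string(devices).c_str());
    }
    return 0;
}
```
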
@@ -256,6 +336,7 @@ struct cmd_params {
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
     std::vector<bool> flash_attn;
+    std::vector<std::vector<ggml_backend_dev_t>> devices;
     std::vector<std::vector<float>> tensor_split;
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool> use_mmap;
@@ -293,6 +374,7 @@ static const cmd_params cmd_params_defaults = {
     /* main_gpu              */ { 0 },
     /* no_kv_offload         */ { false },
     /* flash_attn            */ { false },
+    /* devices               */ { std::vector<ggml_backend_dev_t>() },
     /* tensor_split          */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides */ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
     /* use_mmap              */ { true },
@@ -325,6 +407,7 @@ static void print_usage(int /* argc */, char ** argv) {
            output_format_str(cmd_params_defaults.output_format));
     printf("  -oe, --output-err <csv|json|jsonl|md|sql>  output format printed to stderr (default: %s)\n",
            output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  --list-devices                             list available devices and exit\n");
     printf("  -v, --verbose                              verbose output\n");
     printf("  --progress                                 print test progress indicators\n");
     printf("  --no-warmup                                skip warmup runs before benchmarking\n");
@@ -369,6 +452,7 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -fa, --flash-attn <0|1>                    (default: %s)\n",
            join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -dev, --device <dev0/dev1/...>             (default: auto)\n");
     printf("  -mmp, --mmap <0|1>                         (default: %s)\n",
            join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -embd, --embeddings <0|1>                  (default: %s)\n",
@@ -533,6 +617,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+        } else if (arg == "-dev" || arg == "--device") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto combos = string_split<std::string>(argv[i], split_delim);
+            for (const auto & combo : combos) {
+                try {
+                    params.devices.push_back(parse_devices_arg(combo));
+                } catch (const std::exception & e) {
+                    fprintf(stderr, "error: %s\n", e.what());
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                break;
+            }
+        } else if (arg == "--list-devices") {
+            print_available_devices_and_exit();
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -870,6 +974,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty()) {
         params.flash_attn = cmd_params_defaults.flash_attn;
     }
+    if (params.devices.empty()) {
+        params.devices = cmd_params_defaults.devices;
+    }
     if (params.tensor_split.empty()) {
         params.tensor_split = cmd_params_defaults.tensor_split;
     }
@@ -921,6 +1028,7 @@ struct cmd_params_instance {
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -931,7 +1039,9 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers_str.empty()) {
+        if (!devices.empty()) {
+            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
+        } else if (!rpc_servers_str.empty()) {
             auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
 
             // add RPC devices
@@ -948,13 +1058,13 @@ struct cmd_params_instance {
                     fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
                     exit(1);
                 }
-                static std::vector<ggml_backend_dev_t> devices;
-                devices.clear();
+                static std::vector<ggml_backend_dev_t> rpc_devices;
+                rpc_devices.clear();
                 // RPC devices should always come first for performance reasons
                 for (const std::string & server : rpc_servers) {
                     ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
                     if (dev) {
-                        devices.push_back(dev);
+                        rpc_devices.push_back(dev);
                     } else {
                         fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
                         exit(1);
@@ -971,16 +1081,16 @@ struct cmd_params_instance {
                             break;

                         case GGML_BACKEND_DEVICE_TYPE_GPU:
-                            devices.push_back(dev);
+                            rpc_devices.push_back(dev);
                             break;

                         case GGML_BACKEND_DEVICE_TYPE_IGPU:
                             // iGPUs are not used when there are RPC servers
                             break;
                     }
                 }
-                devices.push_back(nullptr);
-                mparams.devices = devices.data();
+                rpc_devices.push_back(nullptr);
+                mparams.devices = rpc_devices.data();
             }
         }
         mparams.split_mode = split_mode;
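
Renaming the static vector to `rpc_devices` keeps it from colliding with the new `devices` member, and it stays `static` because `mparams.devices` only stores a pointer: the backing array has to remain valid after `to_llama_mparams()` returns, until the model is actually loaded. Below is a rough standalone illustration of that `nullptr`-terminated hand-off; `load_model()` and the device names are hypothetical stand-ins, not the llama.cpp API.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical consumer standing in for the model loader: it only keeps the
// pointer and walks the array until the nullptr terminator.
static void load_model(const char ** devices) {
    for (const char ** p = devices; *p != nullptr; ++p) {
        std::printf("offloading to %s\n", *p);
    }
}

int main() {
    // The vector backing the pointer must stay alive while load_model() runs;
    // in llama-bench the RPC path uses a function-level static for this, and
    // the -dev path stores the list in cmd_params_instance itself.
    std::vector<const char *> devices = { "RPC[host:50052]", "GPU0", nullptr };
    load_model(devices.data());
    return 0;
}
```
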
@@ -1031,6 +1141,7 @@ struct cmd_params_instance {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
                rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
                main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+               devices == other.devices &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 
@@ -1063,6 +1174,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
+    for (const auto & devs : params.devices)
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
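
With `params.devices` added to the loop nest above, every device selection becomes one more axis of the benchmark matrix, so a value such as `-dev GPU0,GPU0/GPU1,none` runs each selection against every other swept parameter. A standalone sketch of that expansion follows; the device names and the pairing with `-ngl` are illustrative, and the comma split assumes `split_delim` is the same `','` used for the other list-valued flags.

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Split on a delimiter, the way llama-bench splits repeated flag values.
static std::vector<std::string> split(const std::string & s, char delim) {
    std::vector<std::string> out;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, delim)) {
        out.push_back(item);
    }
    return out;
}

int main() {
    // Illustrative only: three device selections crossed with two -ngl values
    // yield 3 * 2 = 6 benchmark instances.
    auto device_combos = split("GPU0,GPU0/GPU1,none", ',');
    auto gpu_layers    = split("0,99", ',');

    int instances = 0;
    for (const auto & devs : device_combos) {
        for (const auto & ngl : gpu_layers) {
            std::printf("instance %d: -dev %s -ngl %s\n", ++instances, devs.c_str(), ngl.c_str());
        }
    }
    return 0;
}
```
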
@@ -1103,6 +1215,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1136,6 +1249,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1169,6 +1283,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1206,6 +1321,7 @@ struct test {
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -1241,6 +1357,7 @@ struct test {
         main_gpu              = inst.main_gpu;
         no_kv_offload         = inst.no_kv_offload;
         flash_attn            = inst.flash_attn;
+        devices               = inst.devices;
         tensor_split          = inst.tensor_split;
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap              = inst.use_mmap;
@@ -1292,9 +1409,9 @@ struct test {
12921409 " n_ubatch" , " n_threads" , " cpu_mask" , " cpu_strict" , " poll" ,
12931410 " type_k" , " type_v" , " n_gpu_layers" , " n_cpu_moe" , " split_mode" ,
12941411 " main_gpu" , " no_kv_offload" , " flash_attn" , " tensor_split" , " tensor_buft_overrides" ,
1295- " use_mmap " , " embeddings " , " no_op_offload " , " n_prompt " , " n_gen " ,
1296- " n_depth " , " test_time " , " avg_ns " , " stddev_ns " , " avg_ts " ,
1297- " stddev_ts"
1412+ " devices " , " use_mmap " , " embeddings " , " no_op_offload " , " n_prompt " ,
1413+ " n_gen " , " n_depth " , " test_time " , " avg_ns " , " stddev_ns " ,
1414+ " avg_ts " , " stddev_ts"
12981415 };
12991416 return fields;
13001417 }
@@ -1378,6 +1495,7 @@ struct test {
             std::to_string(main_gpu),
             std::to_string(no_kv_offload),
             std::to_string(flash_attn),
+            devices_to_string(devices),
             tensor_split_str,
             tensor_buft_overrides_str,
             std::to_string(use_mmap),
@@ -1559,6 +1677,9 @@ struct markdown_printer : public printer {
         if (field == "flash_attn") {
             return 2;
         }
+        if (field == "devices") {
+            return -12;
+        }
         if (field == "use_mmap") {
             return 4;
         }
@@ -1602,6 +1723,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "devices") {
+            return "dev";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -1661,6 +1785,9 @@ struct markdown_printer : public printer {
         if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
             fields.emplace_back("flash_attn");
         }
+        if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
+            fields.emplace_back("devices");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }