@@ -200,6 +200,9 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+//
+// Arg utils
+//
 common_webui common_webui_from_name(const std::string& format) {
     if (format == "none") {
         return COMMON_WEBUI_NONE;
@@ -224,6 +227,14 @@ static std::string read_file(const std::string& fname) {
     file.close();
     return content;
 }
+
+static std::string parse_device_list(const std::string& value) {
+    if (value == "" || value.find("-") != std::string::npos) {
+        throw std::invalid_argument("no devices specified");
+    }
+    return value;
+}
+
 //
 // CLI argument parsing
 //
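
A minimal standalone sketch (not part of this diff) of what the new parse_device_list() helper checks: an empty value, or one containing "-", is rejected before the comma-separated device list is stored. The helper's name and check are taken from the hunk above; the main() driver below is purely illustrative.

#include <cstdio>
#include <stdexcept>
#include <string>

// Mirrors the validation added above: reject an empty value or one containing
// "-", which presumably guards against accidentally consuming the next flag.
static std::string parse_device_list(const std::string& value) {
    if (value == "" || value.find("-") != std::string::npos) {
        throw std::invalid_argument("no devices specified");
    }
    return value; // e.g. "CUDA0,CUDA1,RPC[192.168.0.1:8080]"
}

int main() {
    printf("ok: %s\n", parse_device_list("CUDA0,CUDA1").c_str());
    try {
        parse_device_list(""); // throws: no devices specified
    } catch (const std::invalid_argument & e) {
        printf("error: %s\n", e.what());
    }
    return 0;
}
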
@@ -1066,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1213,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "-dev" || arg == "--device") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices = parse_device_list(value);
+        return true;
+    }
+    if (arg == "-devd" || arg == "--device-draft") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices_draft = parse_device_list(value);
+        return true;
+    }
     if (arg == "-v" || arg == "--verbose") {
         params.verbosity = 1;
         return true;
@@ -2002,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
             "  - row: split rows across GPUs" });
         options.push_back({ "*", "-ts, --tensor-split SPLIT",
             "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+        options.push_back({ "*", "-dev, --device dev1,dev2",
+            "comma-separated list of devices to use for offloading (none = don't offload)\n"
+            "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
+        options.push_back({ "*", "-devd, --device-draft dev1,dev2",
+            "comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
+            "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
         options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
             "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
     }
@@ -2575,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
-
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -2692,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
+    mparams.devices = params.devices.c_str();
 
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;