
Commit 6dc5bd8

Support --device and --device-draft parameter (ikawrakow#866)
* add --device and --device-draft parameter
* don't print debug message in release mode
* fix
* bug fix to throw exception when no device specified
* add const

Co-authored-by: firecoperana <firecoperana>
1 parent bdf4f0d commit 6dc5bd8
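
Example (hypothetical invocation: only -dev/--device and -devd/--device-draft come from this commit; the binary name, model paths and the remaining offload flags are the usual llama.cpp options and serve as placeholders here):

    ./llama-server -m target.gguf -md draft.gguf \
        -ngl 99 -ngld 99 \
        --device CUDA0,CUDA1,RPC[192.168.0.1:8080] \
        --device-draft CUDA0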

12 files changed (+282, -39 lines)

common/common.cpp

Lines changed: 32 additions & 2 deletions
@@ -200,6 +200,9 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+//
+// Arg utils
+//
 common_webui common_webui_from_name(const std::string& format) {
     if (format == "none") {
         return COMMON_WEBUI_NONE;
@@ -224,6 +227,14 @@ static std::string read_file(const std::string& fname) {
     file.close();
     return content;
 }
+
+static std::string parse_device_list(const std::string& value) {
+    if (value == " " || value.find("-") != std::string::npos) {
+        throw std::invalid_argument("no devices specified");
+    }
+    return value;
+}
+
 //
 // CLI argument parsing
 //
@@ -1066,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1213,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "-dev" || arg == "--device") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices = parse_device_list(value);
+        return true;
+    }
+    if (arg == "-devd" || arg == "--device-draft") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices_draft = parse_device_list(value);
+        return true;
+    }
     if (arg == "-v" || arg == "--verbose") {
         params.verbosity = 1;
         return true;
@@ -2002,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                        " - row: split rows across GPUs" });
     options.push_back({ "*",           "-ts, --tensor-split SPLIT",
                                        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+    options.push_back({ "*",           "-dev, --device dev1,dev2",
+                                       "comma-separated list of devices to use for offloading (none = don't offload)\n"
+                                       "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
+    options.push_back({ "*",           "-devd, --device-draft dev1,dev2",
+                                       "comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
+                                       "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
     options.push_back({ "*",           "-mg, --main-gpu i",               "the GPU to use for the model (with split-mode = none),\n"
                                        "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
@@ -2575,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
-
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -2692,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
+    mparams.devices = params.devices.c_str();
 
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
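
The new flags keep the device list as a single comma-separated string on gpt_params (devices / devices_draft) and hand it to the model loader as a const char*; parse_device_list only does a sanity check and passes the value through. Below is a minimal standalone sketch (not the fork's code) of that check plus the kind of comma splitting the loader presumably performs later; the split_devices helper is an assumption about the downstream plumbing, which lives in files not shown in this excerpt.

// Standalone sketch: committed validation + an assumed comma-split step.
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Mirrors the parse_device_list added in common/common.cpp: reject obviously
// bad values, otherwise return the string unchanged.
static std::string parse_device_list(const std::string & value) {
    if (value == " " || value.find("-") != std::string::npos) {
        throw std::invalid_argument("no devices specified");
    }
    return value;
}

// Hypothetical helper: split "CUDA0,CUDA1,RPC[192.168.0.1:8080]" into device names.
static std::vector<std::string> split_devices(const std::string & value) {
    std::vector<std::string> out;
    std::stringstream ss(value);
    for (std::string item; std::getline(ss, item, ',');) {
        out.push_back(item);
    }
    return out;
}

int main() {
    const std::string devices = parse_device_list("CUDA0,CUDA1,RPC[192.168.0.1:8080]");
    for (const auto & d : split_devices(devices)) {
        std::cout << d << "\n"; // CUDA0, CUDA1, RPC[192.168.0.1:8080]
    }
    return 0;
}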

common/common.h

Lines changed: 5 additions & 0 deletions
@@ -126,6 +126,9 @@ struct model_paths {
 };
 
 struct gpt_params {
+    std::string devices;
+    std::string devices_draft;
+
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
@@ -193,6 +196,7 @@ struct gpt_params {
     std::string logits_file = ""; // file for saving *all* logits
     std::string rpc_servers = ""; // comma separated list of RPC servers
 
+
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -440,6 +444,7 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
 
+
 //
 // Model utils
 //

common/speculative.cpp

Lines changed: 6 additions & 7 deletions
@@ -91,10 +91,10 @@ bool llama_speculative_are_compatible(
     const struct llama_vocab * vocab_dft = llama_get_model_vocab(model_dft);
 
     const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-    LLAMA_LOG_INFO("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+    LLAMA_LOG_DEBUG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
 
     const bool vocab_type_dft = llama_vocab_type(model_dft);
-    LLAMA_LOG_INFO("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+    LLAMA_LOG_DEBUG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
         LLAMA_LOG_INFO("%s: draft model vocab type must match target model to use speculation but ", __func__);
@@ -203,13 +203,13 @@ std::vector<llama_token> llama_speculative_gen_draft(
         std::string text;
         text = llama_detokenize(ctx_tgt, prompt_tgt_main_model, true);
         text = replace_to_dft(spec, text);
-        LLAMA_LOG_INFO("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
+        LLAMA_LOG_DEBUG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
         prompt_tgt_draft_model = llama_tokenize(ctx_dft, text, false, true);
 
         // convert id_last to draft vocab
         std::vector<llama_token> id_last_vec(1, id_last);
         text = llama_detokenize(ctx_tgt, id_last_vec);
-        LLAMA_LOG_INFO("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
+        LLAMA_LOG_DEBUG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
         id_last = llama_tokenize(ctx_dft, text, false, true)[0];
     }
     // prompt_tgt's tokens will always be compatible with ctx_dft
@@ -233,8 +233,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
             reuse_n = cur;
         }
     }
-
-    LLAMA_LOG_INFO("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
+    LLAMA_LOG_DEBUG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
 
     std::vector<llama_token> result;
     result.reserve(params.n_draft);
@@ -344,7 +343,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
     if (!spec->vocab_dft_compatible) {
         std::string detokenized = llama_detokenize(ctx_dft, result, true);
         detokenized = replace_to_tgt(spec, detokenized);
-        LLAMA_LOG_INFO("draft->main detokenized string: '%s'\n", detokenized.c_str());
+        LLAMA_LOG_DEBUG("draft->main detokenized string: '%s'\n", detokenized.c_str());
         result = llama_tokenize(ctx_tgt, detokenized, false, true);
         if (result.size() > (size_t)params.n_draft) {
             result.resize(params.n_draft);

examples/server/server.cpp

Lines changed: 2 additions & 1 deletion
@@ -1249,6 +1249,7 @@ struct server_context {
             LOG_INFO("loading draft model", {{"model", params.model_draft}});
 
             gpt_params params_dft;
+            params_dft.devices = params.devices_draft;
             params_dft.model = params.model_draft;
             params_dft.n_ctx = params.n_ctx_draft == 0 ? params.n_ctx / params.n_parallel : params.n_ctx_draft;
             params_dft.n_gpu_layers = params.n_gpu_layers_draft;
@@ -1273,7 +1274,7 @@ struct server_context {
 
             cparams_dft = llama_context_params_from_gpt_params(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
-
+
             model_draft = llama_init_dft.model;
             ctx_draft = llama_init_dft.context;
         }

examples/speculative/speculative.cpp

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.devices_draft;
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
     if (params.n_threads_draft > 0) {

ggml/src/ggml-backend.cpp

Lines changed: 12 additions & 1 deletion
@@ -9,6 +9,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <string>
 #include <vector>
 #include <set>
 
@@ -528,6 +529,16 @@ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn ini
     ggml_backend_registry_count++;
 }
 
+// Backend (reg) enumeration
+static bool striequals(const char* a, const char* b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_get_count(void) {
     ggml_backend_registry_init();
 
@@ -539,7 +550,7 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
 
     for (size_t i = 0; i < ggml_backend_registry_count; i++) {
         // TODO: case insensitive in a portable way
-        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
+        if (striequals(ggml_backend_registry[i].name, name)) {
             return i;
         }
     }

include/llama.h

Lines changed: 3 additions & 0 deletions
@@ -342,6 +342,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // comma separated list of devices to use for offloading
+        const char* devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t mla;          // MLA implementation to use (only applicable to DeepSeek models at this point)
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
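
For code that goes through the C API directly rather than through common.cpp, the new field can be set on llama_model_params before loading. A minimal sketch; the model path is a placeholder, error handling is omitted, and llama_free_model is assumed to be the matching cleanup call (llama_model_default_params and llama_load_model_from_file appear elsewhere in this diff):

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.devices = "CUDA0,RPC[192.168.0.1:8080]"; // comma separated, as documented above
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_free_model(model); // assumed cleanup call
    return 0;
}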

src/llama-context.h

Lines changed: 3 additions & 0 deletions
@@ -202,4 +202,7 @@ struct llama_context {
     struct ggml_tensor * inp_embd_enc;        // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross;   // F32 [n_outputs_enc, n_batch]
     struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
+
+    ggml_backend_t ggml_backend_by_name(const char * name);
+
 };

src/llama-cparams.h

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ struct llama_cparams {
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
+    std::vector<std::string> devices;
+    std::vector<std::string> devices_draft;
+
     float rope_freq_base;
     float rope_freq_scale;
 

src/llama-impl.h

Lines changed: 6 additions & 0 deletions
@@ -38,6 +38,12 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG , __VA_ARGS__)
+#ifdef NDEBUG
+// Release mode - make LLAMA_LOG_DEBUG a no-op
+#define LLAMA_LOG_DEBUG(...) ((void)0)
+#else
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#endif
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
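
The NDEBUG guard means the LLAMA_LOG_DEBUG calls introduced in common/speculative.cpp compile to nothing in release builds, so their arguments are never evaluated. A standalone illustration of the same pattern, with printf standing in for llama_log_internal:

#include <cstdio>

#ifdef NDEBUG
// release build: the macro is a no-op and its arguments are discarded
#define MY_LOG_DEBUG(...) ((void)0)
#else
// debug build: forward to a real logging call
#define MY_LOG_DEBUG(...) std::printf(__VA_ARGS__)
#endif

int main() {
    MY_LOG_DEBUG("debug: devices = %s\n", "CUDA0,CUDA1"); // printed only without -DNDEBUG
    std::printf("always printed\n");
    return 0;
}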
