
Commit 6dc5bd8

Support --device and --device-draft parameter (ikawrakow#866)
* add --device and --device-draft parameter
* don't print debug message in release mode
* fix
* bug fix to throw exception when no device specified
* add const

Co-authored-by: firecoperana <firecoperana>
1 parent bdf4f0d commit 6dc5bd8
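
Example (hypothetical invocation: only -dev/--device and -devd/--device-draft come from this commit; the binary name, model paths and the remaining offload flags are the usual llama.cpp options and serve as placeholders here):

    ./llama-server -m target.gguf -md draft.gguf \
        -ngl 99 -ngld 99 \
        --device CUDA0,CUDA1,RPC[192.168.0.1:8080] \
        --device-draft CUDA0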

12 files changed (+282, -39 lines)

common/common.cpp

Lines changed: 32 additions & 2 deletions
@@ -200,6 +200,9 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+//
+// Arg utils
+//
 common_webui common_webui_from_name(const std::string& format) {
     if (format == "none") {
         return COMMON_WEBUI_NONE;
@@ -224,6 +227,14 @@ static std::string read_file(const std::string& fname) {
     file.close();
     return content;
 }
+
+static std::string parse_device_list(const std::string& value) {
+    if (value == " " || value.find("-") != std::string::npos) {
+        throw std::invalid_argument("no devices specified");
+    }
+    return value;
+}
+
 //
 // CLI argument parsing
 //
@@ -1066,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1213,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "-dev" || arg == "--device") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices = parse_device_list(value);
+        return true;
+    }
+    if (arg == "-devd" || arg == "--device-draft") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices_draft = parse_device_list(value);
+        return true;
+    }
     if (arg == "-v" || arg == "--verbose") {
         params.verbosity = 1;
         return true;
@@ -2002,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                        " - row: split rows across GPUs" });
     options.push_back({ "*",           "-ts, --tensor-split SPLIT",
                                        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+    options.push_back({ "*",           "-dev, --device dev1,dev2",
+                                       "comma-separated list of devices to use for offloading (none = don't offload)\n"
+                                       "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
+    options.push_back({ "*",           "-devd, --device-draft dev1,dev2",
+                                       "comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
+                                       "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
     options.push_back({ "*",           "-mg, --main-gpu i",               "the GPU to use for the model (with split-mode = none),\n"
                                        "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
@@ -2575,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
-
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -2692,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
+    mparams.devices = params.devices.c_str();
 
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
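
The new flags keep the device list as a single comma-separated string on gpt_params (devices / devices_draft) and hand it to the model loader as a const char*; parse_device_list only does a sanity check and passes the value through. Below is a minimal standalone sketch (not the fork's code) of that check plus the kind of comma splitting the loader presumably performs later; the split_devices helper is an assumption about the downstream plumbing, which lives in files not shown in this excerpt.

// Standalone sketch: committed validation + an assumed comma-split step.
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Mirrors the parse_device_list added in common/common.cpp: reject obviously
// bad values, otherwise return the string unchanged.
static std::string parse_device_list(const std::string & value) {
    if (value == " " || value.find("-") != std::string::npos) {
        throw std::invalid_argument("no devices specified");
    }
    return value;
}

// Hypothetical helper: split "CUDA0,CUDA1,RPC[192.168.0.1:8080]" into device names.
static std::vector<std::string> split_devices(const std::string & value) {
    std::vector<std::string> out;
    std::stringstream ss(value);
    for (std::string item; std::getline(ss, item, ',');) {
        out.push_back(item);
    }
    return out;
}

int main() {
    const std::string devices = parse_device_list("CUDA0,CUDA1,RPC[192.168.0.1:8080]");
    for (const auto & d : split_devices(devices)) {
        std::cout << d << "\n"; // CUDA0, CUDA1, RPC[192.168.0.1:8080]
    }
    return 0;
}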

common/common.h

Lines changed: 5 additions & 0 deletions
@@ -126,6 +126,9 @@ struct model_paths {
 };
 
 struct gpt_params {
+    std::string devices;
+    std::string devices_draft;
+
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
@@ -193,6 +196,7 @@ struct gpt_params {
     std::string logits_file = ""; // file for saving *all* logits
     std::string rpc_servers = ""; // comma separated list of RPC servers
 
+
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -440,6 +444,7 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
 
+
 //
 // Model utils
 //

common/speculative.cpp

Lines changed: 6 additions & 7 deletions
@@ -91,10 +91,10 @@ bool llama_speculative_are_compatible(
     const struct llama_vocab * vocab_dft = llama_get_model_vocab(model_dft);
 
     const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-    LLAMA_LOG_INFO("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+    LLAMA_LOG_DEBUG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
 
     const bool vocab_type_dft = llama_vocab_type(model_dft);
-    LLAMA_LOG_INFO("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+    LLAMA_LOG_DEBUG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
         LLAMA_LOG_INFO("%s: draft model vocab type must match target model to use speculation but ", __func__);
@@ -203,13 +203,13 @@ std::vector<llama_token> llama_speculative_gen_draft(
         std::string text;
         text = llama_detokenize(ctx_tgt, prompt_tgt_main_model, true);
         text = replace_to_dft(spec, text);
-        LLAMA_LOG_INFO("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
+        LLAMA_LOG_DEBUG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
         prompt_tgt_draft_model = llama_tokenize(ctx_dft, text, false, true);
 
         // convert id_last to draft vocab
         std::vector<llama_token> id_last_vec(1, id_last);
         text = llama_detokenize(ctx_tgt, id_last_vec);
-        LLAMA_LOG_INFO("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
+        LLAMA_LOG_DEBUG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
         id_last = llama_tokenize(ctx_dft, text, false, true)[0];
     }
     // prompt_tgt's tokens will always be compatible with ctx_dft
@@ -233,8 +233,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
             reuse_n = cur;
         }
     }
-
-    LLAMA_LOG_INFO("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
+    LLAMA_LOG_DEBUG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
 
     std::vector<llama_token> result;
     result.reserve(params.n_draft);
@@ -344,7 +343,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
     if (!spec->vocab_dft_compatible) {
         std::string detokenized = llama_detokenize(ctx_dft, result, true);
         detokenized = replace_to_tgt(spec, detokenized);
-        LLAMA_LOG_INFO("draft->main detokenized string: '%s'\n", detokenized.c_str());
+        LLAMA_LOG_DEBUG("draft->main detokenized string: '%s'\n", detokenized.c_str());
         result = llama_tokenize(ctx_tgt, detokenized, false, true);
         if (result.size() > (size_t)params.n_draft) {
             result.resize(params.n_draft);

examples/server/server.cpp

Lines changed: 2 additions & 1 deletion
@@ -1249,6 +1249,7 @@ struct server_context {
             LOG_INFO("loading draft model", {{"model", params.model_draft}});
 
             gpt_params params_dft;
+            params_dft.devices = params.devices_draft;
             params_dft.model = params.model_draft;
             params_dft.n_ctx = params.n_ctx_draft == 0 ? params.n_ctx / params.n_parallel : params.n_ctx_draft;
             params_dft.n_gpu_layers = params.n_gpu_layers_draft;
@@ -1273,7 +1274,7 @@ struct server_context {
 
             cparams_dft = llama_context_params_from_gpt_params(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
-
+
             model_draft = llama_init_dft.model;
             ctx_draft = llama_init_dft.context;
         }

examples/speculative/speculative.cpp

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.devices_draft;
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
     if (params.n_threads_draft > 0) {

ggml/src/ggml-backend.cpp

Lines changed: 12 additions & 1 deletion
@@ -9,6 +9,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <string>
 #include <vector>
 #include <set>
 
@@ -528,6 +529,16 @@ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn ini
     ggml_backend_registry_count++;
 }
 
+// Backend (reg) enumeration
+static bool striequals(const char* a, const char* b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_get_count(void) {
     ggml_backend_registry_init();
 
@@ -539,7 +550,7 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
 
     for (size_t i = 0; i < ggml_backend_registry_count; i++) {
         // TODO: case insensitive in a portable way
-        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
+        if (striequals(ggml_backend_registry[i].name, name)) {
             return i;
         }
     }

include/llama.h

Lines changed: 3 additions & 0 deletions
@@ -342,6 +342,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // comma separated list of devices to use for offloading
+        const char* devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t mla;          // MLA implementation to use (only applicable to DeepSeek models at this point)
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
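
For code that goes through the C API directly rather than through common.cpp, the new field can be set on llama_model_params before loading. A minimal sketch; the model path is a placeholder, error handling is omitted, and llama_free_model is assumed to be the matching cleanup call (llama_model_default_params and llama_load_model_from_file appear elsewhere in this diff):

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.devices = "CUDA0,RPC[192.168.0.1:8080]"; // comma separated, as documented above
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_free_model(model); // assumed cleanup call
    return 0;
}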

src/llama-context.h

Lines changed: 3 additions & 0 deletions
@@ -202,4 +202,7 @@ struct llama_context {
     struct ggml_tensor * inp_embd_enc;        // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross;   // F32 [n_outputs_enc, n_batch]
     struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
+
+    ggml_backend_t ggml_backend_by_name(const char * name);
+
 };

src/llama-cparams.h

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ struct llama_cparams {
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
+    std::vector<std::string> devices;
+    std::vector<std::string> devices_draft;
+
     float rope_freq_base;
     float rope_freq_scale;
 

src/llama-impl.h

Lines changed: 6 additions & 0 deletions
@@ -38,6 +38,12 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG , __VA_ARGS__)
+#ifdef NDEBUG
+// Release mode - make LLAMA_LOG_DEBUG a no-op
+#define LLAMA_LOG_DEBUG(...) ((void)0)
+#else
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#endif
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
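
The NDEBUG guard means the LLAMA_LOG_DEBUG calls introduced in common/speculative.cpp compile to nothing in release builds, so their arguments are never evaluated. A standalone illustration of the same pattern, with printf standing in for llama_log_internal:

#include <cstdio>

#ifdef NDEBUG
// release build: the macro is a no-op and its arguments are discarded
#define MY_LOG_DEBUG(...) ((void)0)
#else
// debug build: forward to a real logging call
#define MY_LOG_DEBUG(...) std::printf(__VA_ARGS__)
#endif

int main() {
    MY_LOG_DEBUG("debug: devices = %s\n", "CUDA0,CUDA1"); // printed only without -DNDEBUG
    std::printf("always printed\n");
    return 0;
}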
