52 changes: 43 additions & 9 deletions examples/bench/bench.cpp
@@ -11,6 +11,9 @@ struct whisper_params {
int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat

std::string model = "models/ggml-base.en.bin";
#ifdef GGML_BACKEND_DL
std::string device = "";
#endif

bool use_gpu = true;
bool flash_attn = false;
@@ -28,6 +31,9 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
#ifdef GGML_BACKEND_DL
else if (arg == "-d" || arg == "--device") { params.device = argv[++i]; }
#endif
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
@@ -46,24 +52,52 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
#ifdef GGML_BACKEND_DL
fprintf(stderr, " -d DEVICE, --device DEVICE [%-7s] device type\n" , params.device.c_str());
fprintf(stderr, " valid devices : blas, cann, cpu, cuda, hip, kompute,\n");
fprintf(stderr, " musa, opencl, rpc, sycl and vulkan\n");
fprintf(stderr, " Optional libraries must be supplied\n");
#endif
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, "\n");
}

static int whisper_bench_full(const whisper_params & params) {
// whisper init

// In a GGML_BACKEND_DL build the backends must be loaded before the model is
// initialised in whisper_init_from_file_with_params, otherwise later code
// will attempt to query null devices
#ifdef GGML_BACKEND_DL
// If params.device is empty, load all available backends; otherwise load only
// the named backend. Ideally the name would be validated against the list of
// known backend names
if (params.device.empty()) {
ggml_backend_load_all();
} else {
if (ggml_backend_load_best(params.device.c_str(), true, nullptr) == nullptr) {
fprintf(stderr, "error: could not load device %s\n", params.device.c_str());
return 5;
}
@slaren (Member) commented on Apr 17, 2025:
This function is not currently available to applications, but I agree that it should. I will make the change necessary to make this function public, but at the moment it cannot be used here.

There is also an important distinction between devices and backends. Backends may have multiple devices, e.g. in a system with multiple GPUs, and it would be good to add the ability to whisper.cpp to choose what device to use, but that would need to be done in a different way (e.g. by making whisper.cpp accept a ggml_backend_dev_t object in whisper_context_params). The implementation in llama.cpp may be useful to use as a guide, although it is a bit more complicated since llama.cpp can use multiple devices at the same time.

In conclusion:

  • Adding a --backend parameter to choose which backend to load would be good, but either needs to use ggml_backend_load to load specifically the file given by the user, or it would need to wait until ggml_backend_load_best is made public
  • Adding a --device parameter to choose which device to use would also be good, but it must be a separate setting

Probably better left for a separate PR.
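
For illustration only, a rough sketch of the device-selection side of that suggestion (not part of this PR): enumerating the devices of the currently loaded backends and picking one by name. The pick_device helper and the idea of passing its result through a new whisper_context_params field are hypothetical; only the registry calls themselves are existing ggml API.

#include "ggml-backend.h"
#include <cstdio>

// Hypothetical helper: list the devices of all loaded backends and pick one by name.
static ggml_backend_dev_t pick_device(const char * name) {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        fprintf(stderr, "device %zu: %s (%s)\n", i,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    return ggml_backend_dev_by_name(name); // nullptr if no such device is registered
}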

The PR author (Contributor) replied:
So in the meantime, can I safely expose ggml_backend_load_best before it's made public? That's a good start from my point of view (I'm fairly lost without it loading the CPU backend on older machines).

I'll wrap it in a whisper_load_device function, making sure that there's at least one CPU backend at the end of the list, which should be OK?

The two modifications I made to ggml-backend.cpp deal with nullptr being passed to functions that want to return a member of the passed parameter.
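
A rough sketch of what such a wrapper might look like (whisper_load_device is the hypothetical name from the comment above; it assumes ggml_backend_load_best has been made public, and the CPU fallback is an assumption rather than part of this PR):

// Hypothetical wrapper: load the requested backend, then make sure a CPU
// backend is also loaded so there is always a CPU device to fall back on.
static bool whisper_load_device(const char * name) {
    bool ok = true;
    if (name != nullptr && name[0] != '\0') {
        ok = ggml_backend_load_best(name, /*silent =*/ true, /*user_search_path =*/ nullptr) != nullptr;
    }
    if (ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU) == nullptr) {
        ggml_backend_load_best("cpu", /*silent =*/ true, /*user_search_path =*/ nullptr);
    }
    return ok;
}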

}
#endif

struct whisper_context_params cparams = whisper_context_default_params();

#ifdef GGML_BACKEND_DL
// With GGML_BACKEND_DL always allow the GPU, since it can be overridden or may be the only choice
cparams.use_gpu = true;
#else
cparams.use_gpu = params.use_gpu;
#endif
cparams.flash_attn = params.flash_attn;

struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
7 changes: 7 additions & 0 deletions examples/cli/cli.cpp
@@ -1002,6 +1002,13 @@ int main(int argc, char ** argv) {
whisper_log_set(cb_log_disable, NULL);
}

// In a GGML_BACKEND_DL build the backends must be loaded before the model is
// initialised in whisper_init_from_file_with_params, otherwise later code
// will attempt to query null devices
#ifdef GGML_BACKEND_DL
ggml_backend_load_all();
#endif

// whisper init

struct whisper_context_params cparams = whisper_context_default_params();
7 changes: 7 additions & 0 deletions examples/stream/stream.cpp
@@ -155,6 +155,13 @@ int main(int argc, char ** argv) {
exit(0);
}

// In a GGML_BACKEND_DL build the backends must be loaded before the model is
// initialised in whisper_init_from_file_with_params, otherwise later code
// will attempt to query null devices
#ifdef GGML_BACKEND_DL
ggml_backend_load_all();
#endif

struct whisper_context_params cparams = whisper_context_default_params();

cparams.use_gpu = params.use_gpu;
7 changes: 6 additions & 1 deletion ggml/include/ggml-backend.h
@@ -348,7 +348,12 @@ extern "C" {
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);


// ggml_backend_load_best is exposed to allow loading a specific type of backend.
// Notably this allows loading only one specific backend and ignoring all
// others (e.g. load only cuda, without cpu)
GGML_API ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path);

#ifdef __cplusplus
}
#endif
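
For reference, a minimal sketch of what a caller can do with the registration returned by ggml_backend_load_best, listing the devices the loaded backend provides (a hypothetical standalone example; "cuda" is an arbitrary choice of backend name and a GGML_BACKEND_DL build is assumed):

#include "ggml-backend.h"
#include <cstdio>

int main(void) {
    // Load only the CUDA backend, ignoring all others
    ggml_backend_reg_t reg = ggml_backend_load_best("cuda", /*silent =*/ true, /*user_search_path =*/ nullptr);
    if (reg == nullptr) {
        fprintf(stderr, "could not load backend\n");
        return 1;
    }
    // List the devices this backend registered (e.g. one entry per GPU)
    for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        printf("%s: %s\n", ggml_backend_reg_name(reg), ggml_backend_dev_name(dev));
    }
    return 0;
}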
18 changes: 18 additions & 0 deletions ggml/src/ggml-backend.cpp
@@ -346,7 +346,18 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor *
return ggml_backend_dev_offload_op(backend->device, op);
}

static void print_error_no_device(void) {
fprintf(stderr, "You are attempting to use a null backend.\n");
fprintf(stderr, "Please verify that a backend has been loaded before trying to use one\n");
fprintf(stderr, "See bench.cpp / cli.cpp / stream.cpp for examples\n");
}

ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
#ifdef GGML_BACKEND_DL
if (backend == nullptr) {
print_error_no_device();
}
#endif
return backend->device;
}

@@ -469,6 +480,11 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
}

ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
#ifdef GGML_BACKEND_DL
if (device == nullptr) {
print_error_no_device();
}
#endif
return device->reg;
}

@@ -1455,7 +1471,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
bool parallel) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
#ifndef GGML_BACKEND_DL // with GGML_BACKEND_DL the CPU backend may not be loaded, so the last backend is not necessarily a CPU device
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
#endif

struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

25 changes: 10 additions & 15 deletions src/whisper.cpp
@@ -208,15 +208,6 @@ static bool ggml_graph_compute_helper(
return t;
}

static void whisper_load_backends() {
#ifdef GGML_BACKEND_DL
static std::once_flag flag;
std::call_once(flag, []() {
ggml_backend_load_all();
});
#endif
}

// TODO: move these functions to ggml-base with support for ggml-backend?

static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
@@ -1313,8 +1304,6 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);

whisper_load_backends();

ggml_backend_dev_t dev = nullptr;

int cnt = 0;
@@ -1372,6 +1361,10 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa

ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (backend_cpu == nullptr) {
#ifdef GGML_BACKEND_DL
// If ggml_backend_load_all was not called it is possible that no CPU backend is available
return result;
#endif
throw std::runtime_error("failed to initialize CPU backend");
}
result.push_back(backend_cpu);
@@ -1407,6 +1400,12 @@ static buft_list_t make_buft_list(whisper_context_params & params) {

// CPU Extra
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
#ifdef GGML_BACKEND_DL
// If ggml_backend_load_all was not called it is possible that no CPU device is available
if (cpu_dev == nullptr) {
return buft_list;
}
#endif
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -4321,8 +4320,6 @@ static int whisper_has_openvino(void) {
const char * whisper_print_system_info(void) {
static std::string s;

whisper_load_backends();

s = "";
s += "WHISPER : ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
@@ -6776,8 +6773,6 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
}

WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
whisper_load_backends();

static std::string s;
s = "";
char strbuf[256];