52 changes: 43 additions & 9 deletions examples/bench/bench.cpp
@@ -11,6 +11,9 @@ struct whisper_params {
int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat

std::string model = "models/ggml-base.en.bin";
#ifdef GGML_BACKEND_DL
std::string device = "";
#endif

bool use_gpu = true;
bool flash_attn = false;
@@ -28,6 +31,9 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
#ifdef GGML_BACKEND_DL
else if (arg == "-d" || arg == "--device") { params.device = argv[++i]; }
#endif
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
@@ -46,24 +52,52 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
#ifdef GGML_BACKEND_DL
fprintf(stderr, " -d DEVICE, --device DEVICE [%-7s] device type\n" , params.device.c_str());
fprintf(stderr, " valid devices : blas, cann, cpu, cuda, hip, kompute,\n");
fprintf(stderr, " musa, opencl, rpc, sycl and vulkan\n");
fprintf(stderr, " Optional libraries must be supplied\n");
#endif
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, "\n");
}

static int whisper_bench_full(const whisper_params & params) {
// whisper init

// In a GGML_BACKEND_DL build the backends must be loaded before the model is
// initialised in whisper_init_from_file_with_params, otherwise later code
// will attempt to query null devices
#ifdef GGML_BACKEND_DL
// If params.device is empty, load all available backends; otherwise load only
// the named backend. Ideally the name would be validated against the list of
// known backend names
if (params.device.empty()) {
ggml_backend_load_all();
} else {
if (ggml_backend_load_best(params.device.c_str(), true, nullptr) == nullptr) {
fprintf(stderr, "error: could not load device %s\n", params.device.c_str());
return 5;
}
@slaren (Member) commented on Apr 17, 2025:
This function is not currently available to applications, but I agree that it should. I will make the change necessary to make this function public, but at the moment it cannot be used here.

There is also an important distinction between devices and backends. Backends may have multiple devices, e.g. in a system with multiple GPUs, and it would be good to add the ability to whisper.cpp to choose what device to use, but that would need to be done in a different way (e.g. by making whisper.cpp accept a ggml_backend_dev_t object in whisper_context_params). The implementation in llama.cpp may be useful to use as a guide, although it is a bit more complicated since llama.cpp can use multiple devices at the same time.

In conclusion:

  • Adding a --backend parameter to choose which backend to load would be good, but either needs to use ggml_backend_load to load specifically the file given by the user, or it would need to wait until ggml_backend_load_best is made public
  • Adding a --device parameter to choose which device to use would also be good, but it must be a separate setting

Probably better left for a separate PR.
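
For illustration only, a rough sketch of the device-selection side of that suggestion (not part of this PR): enumerating the devices of the currently loaded backends and picking one by name. The pick_device helper and the idea of passing its result through a new whisper_context_params field are hypothetical; only the registry calls themselves are existing ggml API.

#include "ggml-backend.h"
#include <cstdio>

// Hypothetical helper: list the devices of all loaded backends and pick one by name.
static ggml_backend_dev_t pick_device(const char * name) {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        fprintf(stderr, "device %zu: %s (%s)\n", i,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
    return ggml_backend_dev_by_name(name); // nullptr if no such device is registered
}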

The PR author (Contributor) replied:
So in the meantime, can I safely expose ggml_backend_load_best before it's made public? That's a good start from my point of view (I'm fairly lost without it loading the CPU backend on older machines).

I'll wrap it in a whisper_load_device function, making sure that there's at least one CPU backend at the end of the list, which should be OK?

The two modifications I made to ggml-backend.cpp deal with nullptr being passed to functions that want to return a member of the passed parameter.
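
A rough sketch of what such a wrapper might look like (whisper_load_device is the hypothetical name from the comment above; it assumes ggml_backend_load_best has been made public, and the CPU fallback is an assumption rather than part of this PR):

// Hypothetical wrapper: load the requested backend, then make sure a CPU
// backend is also loaded so there is always a CPU device to fall back on.
static bool whisper_load_device(const char * name) {
    bool ok = true;
    if (name != nullptr && name[0] != '\0') {
        ok = ggml_backend_load_best(name, /*silent =*/ true, /*user_search_path =*/ nullptr) != nullptr;
    }
    if (ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU) == nullptr) {
        ggml_backend_load_best("cpu", /*silent =*/ true, /*user_search_path =*/ nullptr);
    }
    return ok;
}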

}
#endif

struct whisper_context_params cparams = whisper_context_default_params();

#ifdef GGML_BACKEND_DL
// With GGML_BACKEND_DL always allow the GPU, since it can be overridden or may be the only choice
cparams.use_gpu = true;
#else
cparams.use_gpu = params.use_gpu;
#endif
cparams.flash_attn = params.flash_attn;

struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
7 changes: 7 additions & 0 deletions examples/cli/cli.cpp
@@ -1002,6 +1002,13 @@ int main(int argc, char ** argv) {
whisper_log_set(cb_log_disable, NULL);
}

// In a GGML_BACKEND_DL build the backends must be loaded before the model is
// initialised in whisper_init_from_file_with_params, otherwise later code
// will attempt to query null devices
#ifdef GGML_BACKEND_DL
ggml_backend_load_all();
#endif

// whisper init

struct whisper_context_params cparams = whisper_context_default_params();
7 changes: 7 additions & 0 deletions examples/stream/stream.cpp
@@ -155,6 +155,13 @@ int main(int argc, char ** argv) {
exit(0);
}

// In a GGML_BACKEND_DL build the backends must be loaded before the model is
// initialised in whisper_init_from_file_with_params, otherwise later code
// will attempt to query null devices
#ifdef GGML_BACKEND_DL
ggml_backend_load_all();
#endif

struct whisper_context_params cparams = whisper_context_default_params();

cparams.use_gpu = params.use_gpu;
7 changes: 6 additions & 1 deletion ggml/include/ggml-backend.h
@@ -348,7 +348,12 @@ extern "C" {
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);


// ggml_backend_load_best is exposed to allow loading a specific type of backend.
// Notably this allows loading only one specific backend and ignoring all
// others (e.g. load only cuda, without cpu)
GGML_API ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path);

#ifdef __cplusplus
}
#endif
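
For reference, a minimal sketch of what a caller can do with the registration returned by ggml_backend_load_best, listing the devices the loaded backend provides (a hypothetical standalone example; "cuda" is an arbitrary choice of backend name and a GGML_BACKEND_DL build is assumed):

#include "ggml-backend.h"
#include <cstdio>

int main(void) {
    // Load only the CUDA backend, ignoring all others
    ggml_backend_reg_t reg = ggml_backend_load_best("cuda", /*silent =*/ true, /*user_search_path =*/ nullptr);
    if (reg == nullptr) {
        fprintf(stderr, "could not load backend\n");
        return 1;
    }
    // List the devices this backend registered (e.g. one entry per GPU)
    for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        printf("%s: %s\n", ggml_backend_reg_name(reg), ggml_backend_dev_name(dev));
    }
    return 0;
}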
18 changes: 18 additions & 0 deletions ggml/src/ggml-backend.cpp
@@ -346,7 +346,18 @@ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor *
return ggml_backend_dev_offload_op(backend->device, op);
}

static void print_error_no_device(void) {
fprintf(stderr, "You are attempting to use a null backend.\n");
fprintf(stderr, "Please verify that a backend has been loaded before trying to use one\n");
fprintf(stderr, "See bench.cpp / cli.cpp / stream.cpp for examples\n");
}

ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
#ifdef GGML_BACKEND_DL
if (backend == nullptr) {
print_error_no_device();
}
#endif
return backend->device;
}

@@ -469,6 +480,11 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
}

ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
#ifdef GGML_BACKEND_DL
if (device == nullptr) {
print_error_no_device();
}
#endif
return device->reg;
}

@@ -1455,7 +1471,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
bool parallel) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
#ifndef GGML_BACKEND_DL // with GGML_BACKEND_DL the CPU backend may not be loaded, so the last backend is not necessarily a CPU device
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
#endif

struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

25 changes: 10 additions & 15 deletions src/whisper.cpp
@@ -208,15 +208,6 @@ static bool ggml_graph_compute_helper(
return t;
}

static void whisper_load_backends() {
#ifdef GGML_BACKEND_DL
static std::once_flag flag;
std::call_once(flag, []() {
ggml_backend_load_all();
});
#endif
}

// TODO: move these functions to ggml-base with support for ggml-backend?

static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
@@ -1313,8 +1304,6 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);

whisper_load_backends();

ggml_backend_dev_t dev = nullptr;

int cnt = 0;
@@ -1372,6 +1361,10 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa

ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (backend_cpu == nullptr) {
#ifdef GGML_BACKEND_DL
// If ggml_backend_load_all was not called it is possible that no CPU backend is available
return result;
#endif
throw std::runtime_error("failed to initialize CPU backend");
}
result.push_back(backend_cpu);
@@ -1407,6 +1400,12 @@ static buft_list_t make_buft_list(whisper_context_params & params) {

// CPU Extra
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
#ifdef GGML_BACKEND_DL
// If ggml_backend_load_all was not called it is possible that no CPU device is available
if (cpu_dev == nullptr) {
return buft_list;
}
#endif
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -4321,8 +4320,6 @@ static int whisper_has_openvino(void) {
const char * whisper_print_system_info(void) {
static std::string s;

whisper_load_backends();

s = "";
s += "WHISPER : ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
@@ -6776,8 +6773,6 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
}

WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
whisper_load_backends();

static std::string s;
s = "";
char strbuf[256];