Commit 579539a

whisper : support GGML_BACKEND_DL
1 parent dfc6ca6 commit 579539a

3 files changed: +121, -57 lines changed

examples/bench/bench.cpp

Lines changed: 2 additions & 2 deletions

@@ -50,11 +50,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
     fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
     fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
-    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
-    fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
     fprintf(stderr, " %-7s 0 - whisper\n", "");
     fprintf(stderr, " %-7s 1 - memcpy\n", "");
     fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
+    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
+    fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
     fprintf(stderr, "\n");
 }

ggml/src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions

@@ -226,6 +226,9 @@ add_library(ggml-base
             gguf.cpp)
 
 target_include_directories(ggml-base PRIVATE .)
+if (GGML_BACKEND_DL)
+    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
+endif()
 
 add_library(ggml
             ggml-backend-reg.cpp)
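
Because the definition is added with PUBLIC visibility, it propagates to every target that links ggml-base, which is what lets src/whisper.cpp below guard its one-time ggml_backend_load_all() call with #ifdef GGML_BACKEND_DL. A minimal sketch (not part of this commit; the standalone program and output format are illustrative) of how a consumer can check what the registry actually picked up after loading:

#include "ggml-backend.h"

#include <cstdio>

int main() {
#ifdef GGML_BACKEND_DL
    // with dynamic loading, each backend is a separate shared library that has
    // to be loaded before it shows up in the registry
    ggml_backend_load_all();
#endif

    // enumerate the devices that are now registered (CPU, plus any GPUs whose
    // backend libraries were found)
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }

    return 0;
}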

src/whisper.cpp

Lines changed: 116 additions & 55 deletions

@@ -1,8 +1,7 @@
 #include "whisper.h"
 
-#include "ggml-cpu.h"
-
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
@@ -19,19 +18,20 @@
 #include <cassert>
 #define _USE_MATH_DEFINES
 #include <cmath>
-#include <cstdio>
+#include <codecvt>
 #include <cstdarg>
+#include <cstdio>
 #include <cstring>
 #include <fstream>
+#include <functional>
 #include <map>
+#include <mutex>
+#include <random>
+#include <regex>
 #include <set>
 #include <string>
 #include <thread>
 #include <vector>
-#include <regex>
-#include <random>
-#include <functional>
-#include <codecvt>
 
 // dummy

@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
 
 static bool ggml_graph_compute_helper(
         struct ggml_cgraph * graph,
-        std::vector<uint8_t> & buf,
         int n_threads,
         ggml_abort_callback abort_callback,
         void * abort_callback_data) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
-    plan.abort_callback = abort_callback;
-    plan.abort_callback_data = abort_callback_data;
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
 
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
+    auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+    if (set_abort_callback_fn) {
+        set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
     }
 
-    return ggml_graph_compute(graph, &plan);
+    auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+    if (ggml_backend_set_n_threads_fn) {
+        ggml_backend_set_n_threads_fn(backend.get(), n_threads);
+    }
+
+    return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
 }
 
 static bool ggml_graph_compute_helper(
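
The rewritten helper above no longer takes a caller-provided work buffer: ggml_backend_graph_compute() plans the graph and keeps its scratch memory inside the CPU backend, and the optional thread-count and abort-callback setters are resolved by name through ggml_backend_reg_get_proc_address() rather than linked directly, which is what makes the code work when the CPU backend is a dynamically loaded library. As a hedged usage sketch, assuming it sits in the same translation unit as the helper (the function name tiny_mul_mat, the 16 MB context and the 64x64 shapes are illustrative, not from this commit):

#include "ggml.h"
#include "ggml-cpp.h"

#include <cstdint>
#include <vector>

// build a small f32 mat-mul graph in a malloc-backed context and run it
// through the helper, the same way the benchmark code further down does
static bool tiny_mul_mat(int n_threads) {
    std::vector<uint8_t> mem(16u*1024*1024);

    struct ggml_init_params params = {
        /*.mem_size   =*/ mem.size(),
        /*.mem_buffer =*/ mem.data(),
        /*.no_alloc   =*/ false, // tensor data lives inside `mem`
    };
    ggml_context_ptr ctx { ggml_init(params) };

    ggml_tensor * a = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, 64, 64);
    ggml_tensor * b = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, 64, 64);
    ggml_tensor * c = ggml_mul_mat(ctx.get(), a, b);

    ggml_cgraph * gf = ggml_new_graph(ctx.get());
    ggml_build_forward_expand(gf, c);

    // no work buffer is passed any more - the CPU backend plans the graph and
    // manages its own scratch memory internally
    return ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
}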
@@ -187,6 +191,51 @@ static bool ggml_graph_compute_helper(
     return t;
 }
 
+static void whisper_load_backends() {
+#ifdef GGML_BACKEND_DL
+    static std::once_flag flag;
+    std::call_once(flag, []() {
+        ggml_backend_load_all();
+    });
+#endif
+}
+
+// TODO: move these functions to ggml-base with support for ggml-backend?
+
+static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(t));
+    size_t nels = ggml_nelements(t);
+    for (int64_t i = 0; i < nels; ++i) {
+        ((float *) t->data)[i] = v;
+    }
+    return t;
+}
+
+static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(float *) data;
+}
+
+static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
+    GGML_ASSERT(t->type == GGML_TYPE_F32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(float *) data = v;
+}
+
+static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    return *(int32_t *) data;
+}
+
+static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
+    GGML_ASSERT(t->type == GGML_TYPE_I32);
+    void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+    *(int32_t *) data = v;
+}
+
 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
 // the idea is to represent the original matrix multiplication:
 //
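
These local accessors replace ggml_set_f32 / ggml_get_f32_nd / ggml_set_i32_nd and friends, which are implemented alongside the CPU backend and are no longer a safe link-time dependency once backends can be loaded dynamically; they walk the tensor's byte strides (nb[0..3]) directly, so they only work on tensors whose data lives in host memory. A short hedged sketch, assuming it lives next to the helpers above (the function name accessor_demo and the 4x3 shape are illustrative only):

#include "ggml.h"

// exercise the local accessors on a small host-resident f32 tensor
static float accessor_demo(struct ggml_context * ctx) {
    // allocated inside a no_alloc == false context, so t->data is host memory
    ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);

    whisper_set_f32(t, 0.0f);                 // fill all 12 elements
    whisper_set_f32_nd(t, 2, 1, 0, 0, 5.0f);  // element at i0 = 2, i1 = 1
    return whisper_get_f32_nd(t, 2, 1, 0, 0); // reads back 5.0f
}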
@@ -1237,6 +1286,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
     ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
 
+    whisper_load_backends();
+
     ggml_backend_dev_t dev = nullptr;
 
     int cnt = 0;
@@ -1294,7 +1345,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
 
     GGML_UNUSED(params);
 
-    result.push_back(ggml_backend_cpu_init());
+    result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
 
     return result;
 }
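
whisper_backend_init() now obtains the CPU backend through the registry instead of calling ggml_backend_cpu_init() directly, so whisper.cpp no longer needs a hard link-time dependency on the CPU backend. A hedged sketch of the same registry-based selection with a GPU-first fallback (the function name init_default_backend is illustrative; the commit's actual device selection lives in whisper_backend_init_gpu(), which is not shown in this hunk):

#include "ggml-backend.h"

// prefer a GPU device if one was registered/loaded, otherwise fall back to CPU
static ggml_backend_t init_default_backend() {
    ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
    if (backend == nullptr) {
        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
    }
    return backend; // caller releases it with ggml_backend_free()
}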
@@ -3516,7 +3567,7 @@ struct whisper_context_params whisper_context_default_params() {
             /*.n_heads =*/ 0,
             /*.heads =*/ NULL,
         },
-        /*.dtw_mem_size =*/ 1024*1024*128,
+        /*.dtw_mem_size =*/ 1024*1024*128, // TODO: probably can be removed now
     };
     return result;
 }
@@ -4206,22 +4257,28 @@ static int whisper_has_openvino(void) {
 const char * whisper_print_system_info(void) {
     static std::string s;
 
+    whisper_load_backends();
+
     s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "WHISPER : ";
     s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
     s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
 
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
     return s.c_str();
 }
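
Instead of the fixed ggml_cpu_has_* list, CPU features (and any other backend capabilities) are now reported per registered backend through the optional "ggml_backend_get_features" entry point. A hedged sketch of querying a single feature programmatically (the helper name ggml_any_backend_has_feature and the "AVX2" example are illustrative, not part of the commit):

#include "ggml-backend.h"

#include <cstring>

// returns true if any registered backend reports a feature with this name
static bool ggml_any_backend_has_feature(const char * name) {
    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        auto get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (!get_features_fn) {
            continue; // this backend does not expose a feature list
        }
        // the feature array is terminated by an entry with a null name
        for (ggml_backend_feature * f = get_features_fn(reg); f->name; ++f) {
            if (strcmp(f->name, name) == 0) {
                return true;
            }
        }
    }
    return false;
}

// usage: ggml_any_backend_has_feature("AVX2")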

@@ -6653,6 +6710,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
 }
 
 WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+    whisper_load_backends();
+
     static std::string s;
     s = "";
     char strbuf[256];
@@ -6672,7 +6731,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
     std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
-    std::vector<uint8_t> work;
 
     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -6729,12 +6787,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
         double tsum = 0.0;
 
         // heat-up
-        ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+        ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
 
         for (int i = 0; i < n_max; ++i) {
            const int64_t t0 = ggml_time_us();
 
-            ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+            ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
 
            const int64_t t1 = ggml_time_us();

@@ -7111,18 +7169,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
     struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
 
-    cost = ggml_set_f32(cost, INFINITY);
-    trace = ggml_set_f32(trace, -1);
-    ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
+    cost = whisper_set_f32(cost, INFINITY);
+    trace = whisper_set_f32(trace, -1);
+    whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
 
     // dtw
     // supposedly can be optmized by computing diagonals in parallel ?
     // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
     for (int64_t j = 1; j < M + 1; ++j) {
         for (int64_t i = 1; i < N + 1; ++i) {
-            float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
-            float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
-            float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);
+            float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
+            float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
+            float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
 
             float c;
             int32_t t;
@@ -7137,9 +7195,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
                 t = 2;
             }
 
-            c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
-            ggml_set_f32_nd(cost, i, j, 0, 0, c);
-            ggml_set_i32_nd(trace, i, j, 0, 0, t);
+            c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
+            whisper_set_f32_nd(cost, i, j, 0, 0, c);
+            whisper_set_i32_nd(trace, i, j, 0, 0, t);
         }
     }
 
@@ -7148,19 +7206,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
     // trace[0, :] = 2;
     for (int64_t i = 0; i < M + 1; ++i)
-        ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
+        whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
     //trace[:, 0] = 1;
     for (int64_t i = 0; i < N + 1; ++i)
-        ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
+        whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
     int bt_row_idx = BT_MAX_ROWS - 1;
     int64_t i = N;
     int64_t j = M;
     while (i > 0 || j > 0) {
-        ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
-        ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
+        whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
         --bt_row_idx;
 
-        int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
+        int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
         if (t == 0) {
             --i;
             --j;
@@ -7181,8 +7239,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
     ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
     for (int64_t i = 0; i < 2; ++i) {
         for (int64_t j = 0; j < result_n_cols; ++j) {
-            int32_t v = ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
-            ggml_set_i32_nd(r, i, j, 0, 0, v);
+            int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
+            whisper_set_i32_nd(r, i, j, 0, 0, v);
         }
     }
 
@@ -7217,11 +7275,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
                     idx = 2*(a->ne[2] - 1) - idx;
                 }
 
-                filter.push_back(ggml_get_f32_nd(a, i, j, idx, 0));
+                filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
             }
             std::sort(filter.begin(), filter.end());
             const float v = filter[filter.size()/2];
-            ggml_set_f32_nd(dst, i, j, k, 0, v);
+            whisper_set_f32_nd(dst, i, j, k, 0, v);
             filter.clear();
         }
     }
@@ -7248,9 +7306,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     // Our ggml buffer should be pre-allocated somewhere during init and reused
     // when we call this function
     struct ggml_init_params gparams = {
-        /*.mem_size =*/ ctx->params.dtw_mem_size,
+        /*.mem_size =*/ ggml_tensor_overhead()*1024 + ggml_graph_overhead(),
         /*.mem_buffer =*/ NULL,
-        /*.no_alloc =*/ false,
+        /*.no_alloc =*/ true,
     };
     struct ggml_context * gctx = ggml_init(gparams);

@@ -7343,7 +7401,10 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     // Compute
     struct ggml_cgraph * gf = ggml_new_graph(gctx);
     ggml_build_forward_expand(gf, w);
-    ggml_graph_compute_with_ctx(gctx, gf, n_threads);
+
+    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+    ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors(gctx, backend.get()) };
+    ggml_backend_graph_compute(backend.get(), gf);
 
     ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
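
With no_alloc set, the context above only holds tensor metadata; the real buffers come from ggml_backend_alloc_ctx_tensors() and the graph is evaluated through the backend rather than ggml_graph_compute_with_ctx(). A hedged end-to-end sketch of the same pattern (the function name scale_on_cpu and the scale operation are placeholders, not code from this commit):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpp.h"

#include <vector>

// y = 2*x computed on the CPU backend with a metadata-only ggml context
static std::vector<float> scale_on_cpu(const std::vector<float> & src) {
    struct ggml_init_params gparams = {
        /*.mem_size =*/ ggml_tensor_overhead()*16 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true, // metadata only - data goes into a backend buffer
    };
    ggml_context_ptr ctx { ggml_init(gparams) };

    ggml_tensor * x = ggml_new_tensor_1d(ctx.get(), GGML_TYPE_F32, (int64_t) src.size());
    ggml_tensor * y = ggml_scale(ctx.get(), x, 2.0f);

    ggml_cgraph * gf = ggml_new_graph(ctx.get());
    ggml_build_forward_expand(gf, y);

    ggml_backend_ptr        backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
    ggml_backend_buffer_ptr buf     { ggml_backend_alloc_ctx_tensors(ctx.get(), backend.get()) };

    // copy the input into the backend buffer, compute, copy the result back
    ggml_backend_tensor_set(x, src.data(), 0, ggml_nbytes(x));
    ggml_backend_graph_compute(backend.get(), gf);

    std::vector<float> out(src.size());
    ggml_backend_tensor_get(y, out.data(), 0, ggml_nbytes(y));
    return out;
}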

@@ -7352,9 +7413,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     auto seg_i = state->result_all.begin() + i_segment;
     auto tok_i = seg_i->tokens.begin();
     for (int i = 0; i < alignment->ne[1]; ++i) {
-        int32_t v = ggml_get_i32_nd(alignment, 0, i, 0, 0);
+        int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
         if (v != last_v) {
-            int32_t time_index = ggml_get_i32_nd(alignment, 1, i, 0, 0);
+            int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
             int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
             last_v = v;
