11#include " whisper.h"
22
3- #include " ggml-cpu.h"
4-
53#include " ggml.h"
4+ #include " ggml-cpp.h"
65#include " ggml-alloc.h"
76#include " ggml-backend.h"
87
1918#include < cassert>
2019#define _USE_MATH_DEFINES
2120#include < cmath>
22- #include < cstdio >
21+ #include < codecvt >
2322#include < cstdarg>
23+ #include < cstdio>
2424#include < cstring>
2525#include < fstream>
26+ #include < functional>
2627#include < map>
28+ #include < mutex>
29+ #include < random>
30+ #include < regex>
2731#include < set>
2832#include < string>
2933#include < thread>
3034#include < vector>
31- #include < regex>
32- #include < random>
33- #include < functional>
34- #include < codecvt>
3535
3636// dummy
3737
@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
149149
150150static bool ggml_graph_compute_helper (
151151 struct ggml_cgraph * graph,
152- std::vector<uint8_t > & buf,
153152 int n_threads,
154153 ggml_abort_callback abort_callback,
155154 void * abort_callback_data) {
156- struct ggml_cplan plan = ggml_graph_plan (graph, n_threads, nullptr );
157155
158- plan.abort_callback = abort_callback;
159- plan.abort_callback_data = abort_callback_data;
156+ ggml_backend_ptr backend { ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ) };
157+
158+ auto * reg = ggml_backend_dev_backend_reg (ggml_backend_get_device (backend.get ()));
160159
161- if (plan. work_size > 0 ) {
162- buf. resize (plan. work_size );
163- plan. work_data = buf. data ( );
160+ auto * set_abort_callback_fn = ( ggml_backend_set_abort_callback_t ) ggml_backend_reg_get_proc_address (reg, " ggml_backend_set_abort_callback " );
161+ if (set_abort_callback_fn) {
162+ set_abort_callback_fn (backend. get (), abort_callback, abort_callback_data );
164163 }
165164
166- return ggml_graph_compute (graph, &plan);
165+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t ) ggml_backend_reg_get_proc_address (reg, " ggml_backend_set_n_threads" );
166+ if (ggml_backend_set_n_threads_fn) {
167+ ggml_backend_set_n_threads_fn (backend.get (), n_threads);
168+ }
169+
170+ return ggml_backend_graph_compute (backend.get (), graph) == GGML_STATUS_SUCCESS;
167171}
168172
169173static bool ggml_graph_compute_helper (
@@ -187,6 +191,51 @@ static bool ggml_graph_compute_helper(
187191 return t;
188192}
189193
// Register dynamically-loadable ggml backends exactly once per process.
// When GGML_BACKEND_DL is not defined the backends are linked statically
// and this function is a no-op.
static void whisper_load_backends() {
#ifdef GGML_BACKEND_DL
    static std::once_flag flag;
    std::call_once(flag, ggml_backend_load_all);
#endif
}
202+
203+ // TODO: move these functions to ggml-base with support for ggml-backend?
204+
205+ static ggml_tensor * whisper_set_f32 (struct ggml_tensor * t, float v) {
206+ GGML_ASSERT (t->type == GGML_TYPE_F32);
207+ GGML_ASSERT (ggml_is_contiguous (t));
208+ size_t nels = ggml_nelements (t);
209+ for (int64_t i = 0 ; i < nels; ++i) {
210+ ((float *) t->data )[i] = v;
211+ }
212+ return t;
213+ }
214+
215+ static float whisper_get_f32_nd (const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
216+ GGML_ASSERT (t->type == GGML_TYPE_F32);
217+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
218+ return *(float *) data;
219+ }
220+
221+ static void whisper_set_f32_nd (struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
222+ GGML_ASSERT (t->type == GGML_TYPE_F32);
223+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
224+ *(float *) data = v;
225+ }
226+
227+ static int32_t whisper_get_i32_nd (const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
228+ GGML_ASSERT (t->type == GGML_TYPE_I32);
229+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
230+ return *(int32_t *) data;
231+ }
232+
233+ static void whisper_set_i32_nd (struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
234+ GGML_ASSERT (t->type == GGML_TYPE_I32);
235+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
236+ *(int32_t *) data = v;
237+ }
238+
190239// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
191240// the idea is to represent the original matrix multiplication:
192241//
@@ -1237,6 +1286,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
12371286static ggml_backend_t whisper_backend_init_gpu (const whisper_context_params & params) {
12381287 ggml_log_set (g_state.log_callback , g_state.log_callback_user_data );
12391288
1289+ whisper_load_backends ();
1290+
12401291 ggml_backend_dev_t dev = nullptr ;
12411292
12421293 int cnt = 0 ;
@@ -1294,7 +1345,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
12941345
12951346 GGML_UNUSED (params);
12961347
1297- result.push_back (ggml_backend_cpu_init ( ));
1348+ result.push_back (ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ));
12981349
12991350 return result;
13001351}
@@ -3516,7 +3567,7 @@ struct whisper_context_params whisper_context_default_params() {
35163567 /* .n_heads =*/ 0 ,
35173568 /* .heads =*/ NULL ,
35183569 },
3519- /* .dtw_mem_size =*/ 1024 *1024 *128 ,
3570+ /* .dtw_mem_size =*/ 1024 *1024 *128 , // TODO: probably can be removed now
35203571 };
35213572 return result;
35223573}
@@ -4206,22 +4257,28 @@ static int whisper_has_openvino(void) {
42064257const char * whisper_print_system_info (void ) {
42074258 static std::string s;
42084259
4260+ whisper_load_backends ();
4261+
42094262 s = " " ;
4210- s += " AVX = " + std::to_string (ggml_cpu_has_avx ()) + " | " ;
4211- s += " AVX2 = " + std::to_string (ggml_cpu_has_avx2 ()) + " | " ;
4212- s += " AVX512 = " + std::to_string (ggml_cpu_has_avx512 ()) + " | " ;
4213- s += " FMA = " + std::to_string (ggml_cpu_has_fma ()) + " | " ;
4214- s += " NEON = " + std::to_string (ggml_cpu_has_neon ()) + " | " ;
4215- s += " ARM_FMA = " + std::to_string (ggml_cpu_has_arm_fma ()) + " | " ;
4216- s += " F16C = " + std::to_string (ggml_cpu_has_f16c ()) + " | " ;
4217- s += " FP16_VA = " + std::to_string (ggml_cpu_has_fp16_va ()) + " | " ;
4218- s += " WASM_SIMD = " + std::to_string (ggml_cpu_has_wasm_simd ()) + " | " ;
4219- s += " SSE3 = " + std::to_string (ggml_cpu_has_sse3 ()) + " | " ;
4220- s += " SSSE3 = " + std::to_string (ggml_cpu_has_ssse3 ()) + " | " ;
4221- s += " VSX = " + std::to_string (ggml_cpu_has_vsx ()) + " | " ;
4263+ s += " WHISPER : " ;
42224264 s += " COREML = " + std::to_string (whisper_has_coreml ()) + " | " ;
42234265 s += " OPENVINO = " + std::to_string (whisper_has_openvino ()) + " | " ;
42244266
4267+ for (size_t i = 0 ; i < ggml_backend_reg_count (); i++) {
4268+ auto * reg = ggml_backend_reg_get (i);
4269+ auto * get_features_fn = (ggml_backend_get_features_t ) ggml_backend_reg_get_proc_address (reg, " ggml_backend_get_features" );
4270+ if (get_features_fn) {
4271+ ggml_backend_feature * features = get_features_fn (reg);
4272+ s += ggml_backend_reg_name (reg);
4273+ s += " : " ;
4274+ for (; features->name ; features++) {
4275+ s += features->name ;
4276+ s += " = " ;
4277+ s += features->value ;
4278+ s += " | " ;
4279+ }
4280+ }
4281+ }
42254282 return s.c_str ();
42264283}
42274284
@@ -6653,6 +6710,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
66536710}
66546711
66556712WHISPER_API const char * whisper_bench_ggml_mul_mat_str (int n_threads) {
6713+ whisper_load_backends ();
6714+
66566715 static std::string s;
66576716 s = " " ;
66586717 char strbuf[256 ];
@@ -6672,7 +6731,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
66726731 // c: N*N*sizeof(float)
66736732 // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
66746733 std::vector<uint8_t > buf (3llu*N_max*N_max*sizeof (float ) + 3 *ggml_tensor_overhead () + ggml_graph_overhead ());
6675- std::vector<uint8_t > work;
66766734
66776735 // put a bunch of random data in the buffer
66786736 for (size_t i = 0 ; i < buf.size (); i++) buf[i] = i;
@@ -6729,12 +6787,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
67296787 double tsum = 0.0 ;
67306788
67316789 // heat-up
6732- ggml_graph_compute_helper (gf, work, n_threads, nullptr , nullptr );
6790+ ggml_graph_compute_helper (gf, n_threads, nullptr , nullptr );
67336791
67346792 for (int i = 0 ; i < n_max; ++i) {
67356793 const int64_t t0 = ggml_time_us ();
67366794
6737- ggml_graph_compute_helper (gf, work, n_threads, nullptr , nullptr );
6795+ ggml_graph_compute_helper (gf, n_threads, nullptr , nullptr );
67386796
67396797 const int64_t t1 = ggml_time_us ();
67406798
@@ -7111,18 +7169,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71117169 struct ggml_tensor * cost = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, N + 1 , M + 1 );
71127170 struct ggml_tensor * trace = ggml_new_tensor_2d (ctx, GGML_TYPE_I32, N + 1 , M + 1 );
71137171
7114- cost = ggml_set_f32 (cost, INFINITY);
7115- trace = ggml_set_f32 (trace, -1 );
7116- ggml_set_f32_nd (cost, 0 , 0 , 0 , 0 , 0.0 );
7172+ cost = whisper_set_f32 (cost, INFINITY);
7173+ trace = whisper_set_f32 (trace, -1 );
7174+ whisper_set_f32_nd (cost, 0 , 0 , 0 , 0 , 0.0 );
71177175
71187176 // dtw
71197177 // supposedly can be optmized by computing diagonals in parallel ?
71207178 // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
71217179 for (int64_t j = 1 ; j < M + 1 ; ++j) {
71227180 for (int64_t i = 1 ; i < N + 1 ; ++i) {
7123- float c0 = ggml_get_f32_nd (cost, i - 1 , j - 1 , 0 , 0 );
7124- float c1 = ggml_get_f32_nd (cost, i - 1 , j, 0 , 0 );
7125- float c2 = ggml_get_f32_nd (cost, i, j - 1 , 0 , 0 );
7181+ float c0 = whisper_get_f32_nd (cost, i - 1 , j - 1 , 0 , 0 );
7182+ float c1 = whisper_get_f32_nd (cost, i - 1 , j, 0 , 0 );
7183+ float c2 = whisper_get_f32_nd (cost, i, j - 1 , 0 , 0 );
71267184
71277185 float c;
71287186 int32_t t;
@@ -7137,9 +7195,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71377195 t = 2 ;
71387196 }
71397197
7140- c = ggml_get_f32_nd (x, i - 1 , j - 1 , 0 , 0 ) + c;
7141- ggml_set_f32_nd (cost, i, j, 0 , 0 , c);
7142- ggml_set_i32_nd (trace, i, j, 0 , 0 , t);
7198+ c = whisper_get_f32_nd (x, i - 1 , j - 1 , 0 , 0 ) + c;
7199+ whisper_set_f32_nd (cost, i, j, 0 , 0 , c);
7200+ whisper_set_i32_nd (trace, i, j, 0 , 0 , t);
71437201 }
71447202 }
71457203
@@ -7148,19 +7206,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71487206 struct ggml_tensor * bt = ggml_new_tensor_2d (ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2 );
71497207 // trace[0, :] = 2;
71507208 for (int64_t i = 0 ; i < M + 1 ; ++i)
7151- ggml_set_i32_nd (trace, 0 , i, 0 , 0 , 2 );
7209+ whisper_set_i32_nd (trace, 0 , i, 0 , 0 , 2 );
71527210 // trace[:, 0] = 1;
71537211 for (int64_t i = 0 ; i < N + 1 ; ++i)
7154- ggml_set_i32_nd (trace, i, 0 , 0 , 0 , 1 );
7212+ whisper_set_i32_nd (trace, i, 0 , 0 , 0 , 1 );
71557213 int bt_row_idx = BT_MAX_ROWS - 1 ;
71567214 int64_t i = N;
71577215 int64_t j = M;
71587216 while (i > 0 || j > 0 ) {
7159- ggml_set_i32_nd (bt, bt_row_idx, 0 , 0 , 0 , i - 1 );
7160- ggml_set_i32_nd (bt, bt_row_idx, 1 , 0 , 0 , j - 1 );
7217+ whisper_set_i32_nd (bt, bt_row_idx, 0 , 0 , 0 , i - 1 );
7218+ whisper_set_i32_nd (bt, bt_row_idx, 1 , 0 , 0 , j - 1 );
71617219 --bt_row_idx;
71627220
7163- int32_t t = ggml_get_i32_nd (trace, i, j, 0 , 0 );
7221+ int32_t t = whisper_get_i32_nd (trace, i, j, 0 , 0 );
71647222 if (t == 0 ) {
71657223 --i;
71667224 --j;
@@ -7181,8 +7239,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71817239 ggml_tensor * r = ggml_new_tensor_2d (ctx, GGML_TYPE_I32, 2 , result_n_cols);
71827240 for (int64_t i = 0 ; i < 2 ; ++i) {
71837241 for (int64_t j = 0 ; j < result_n_cols; ++j) {
7184- int32_t v = ggml_get_i32_nd (bt, j+bt_row_idx+1 , i, 0 , 0 );
7185- ggml_set_i32_nd (r, i, j, 0 , 0 , v);
7242+ int32_t v = whisper_get_i32_nd (bt, j+bt_row_idx+1 , i, 0 , 0 );
7243+ whisper_set_i32_nd (r, i, j, 0 , 0 , v);
71867244 }
71877245 }
71887246
@@ -7217,11 +7275,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
72177275 idx = 2 *(a->ne [2 ] - 1 ) - idx;
72187276 }
72197277
7220- filter.push_back (ggml_get_f32_nd (a, i, j, idx, 0 ));
7278+ filter.push_back (whisper_get_f32_nd (a, i, j, idx, 0 ));
72217279 }
72227280 std::sort (filter.begin (), filter.end ());
72237281 const float v = filter[filter.size ()/2 ];
7224- ggml_set_f32_nd (dst, i, j, k, 0 , v);
7282+ whisper_set_f32_nd (dst, i, j, k, 0 , v);
72257283 filter.clear ();
72267284 }
72277285 }
@@ -7248,9 +7306,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
72487306 // Our ggml buffer should be pre-allocated somewhere during init and reused
72497307 // when we call this function
72507308 struct ggml_init_params gparams = {
7251- /* .mem_size =*/ ctx-> params . dtw_mem_size ,
7309+ /* .mem_size =*/ ggml_tensor_overhead ()* 1024 + ggml_graph_overhead () ,
72527310 /* .mem_buffer =*/ NULL ,
7253- /* .no_alloc =*/ false ,
7311+ /* .no_alloc =*/ true ,
72547312 };
72557313 struct ggml_context * gctx = ggml_init (gparams);
72567314
@@ -7343,7 +7401,10 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
73437401 // Compute
73447402 struct ggml_cgraph * gf = ggml_new_graph (gctx);
73457403 ggml_build_forward_expand (gf, w);
7346- ggml_graph_compute_with_ctx (gctx, gf, n_threads);
7404+
7405+ ggml_backend_ptr backend { ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ) };
7406+ ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors (gctx, backend.get ()) };
7407+ ggml_backend_graph_compute (backend.get (), gf);
73477408
73487409 ggml_tensor * alignment = dtw_and_backtrace (gctx, w);
73497410
@@ -7352,9 +7413,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
73527413 auto seg_i = state->result_all .begin () + i_segment;
73537414 auto tok_i = seg_i->tokens .begin ();
73547415 for (int i = 0 ; i < alignment->ne [1 ]; ++i) {
7355- int32_t v = ggml_get_i32_nd (alignment, 0 , i, 0 , 0 );
7416+ int32_t v = whisper_get_i32_nd (alignment, 0 , i, 0 , 0 );
73567417 if (v != last_v) {
7357- int32_t time_index = ggml_get_i32_nd (alignment, 1 , i, 0 , 0 );
7418+ int32_t time_index = whisper_get_i32_nd (alignment, 1 , i, 0 , 0 );
73587419 int64_t timestamp = (time_index * 2 ) + seek; // Each index on DTW result = 20mS audio
73597420 last_v = v;
73607421
0 commit comments