11#include " whisper.h"
22
3- #include " ggml-cpu.h"
4-
53#include " ggml.h"
4+ #include " ggml-cpp.h"
65#include " ggml-alloc.h"
76#include " ggml-backend.h"
87
1918#include < cassert>
2019#define _USE_MATH_DEFINES
2120#include < cmath>
22- #include < cstdio >
21+ #include < codecvt >
2322#include < cstdarg>
23+ #include < cstdio>
2424#include < cstring>
2525#include < fstream>
26+ #include < functional>
2627#include < map>
28+ #include < mutex>
29+ #include < random>
30+ #include < regex>
2731#include < set>
2832#include < string>
2933#include < thread>
3034#include < vector>
31- #include < regex>
32- #include < random>
33- #include < functional>
34- #include < codecvt>
3535
3636// dummy
3737
@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
149149
150150static bool ggml_graph_compute_helper (
151151 struct ggml_cgraph * graph,
152- std::vector<uint8_t > & buf,
153152 int n_threads,
154153 ggml_abort_callback abort_callback,
155154 void * abort_callback_data) {
156- struct ggml_cplan plan = ggml_graph_plan (graph, n_threads, nullptr );
157155
158- plan.abort_callback = abort_callback;
159- plan.abort_callback_data = abort_callback_data;
156+ ggml_backend_ptr backend { ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ) };
160157
161- if (plan.work_size > 0 ) {
162- buf.resize (plan.work_size );
163- plan.work_data = buf.data ();
158+ auto * reg = ggml_backend_dev_backend_reg (ggml_backend_get_device (backend.get ()));
159+
160+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t ) ggml_backend_reg_get_proc_address (reg, " ggml_backend_set_abort_callback" );
161+ if (set_abort_callback_fn) {
162+ set_abort_callback_fn (backend.get (), abort_callback, abort_callback_data);
163+ }
164+
165+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t ) ggml_backend_reg_get_proc_address (reg, " ggml_backend_set_n_threads" );
166+ if (ggml_backend_set_n_threads_fn) {
167+ ggml_backend_set_n_threads_fn (backend.get (), n_threads);
164168 }
165169
166- return ggml_graph_compute (graph, &plan) ;
170+ return ggml_backend_graph_compute (backend. get (), graph) == GGML_STATUS_SUCCESS ;
167171}
168172
169173static bool ggml_graph_compute_helper (
@@ -187,6 +191,61 @@ static bool ggml_graph_compute_helper(
187191 return t;
188192}
189193
194+ static void whisper_load_backends () {
195+ #ifdef GGML_BACKEND_DL
196+ static std::once_flag flag;
197+ std::call_once (flag, []() {
198+ ggml_backend_load_all ();
199+ });
200+ #endif
201+ }
202+
203+ // TODO: move these functions to ggml-base with support for ggml-backend?
204+
205+ static ggml_tensor * whisper_set_f32 (struct ggml_tensor * t, float v) {
206+ GGML_ASSERT (t->type == GGML_TYPE_F32);
207+ GGML_ASSERT (ggml_is_contiguous (t));
208+ size_t nels = ggml_nelements (t);
209+ for (int64_t i = 0 ; i < nels; ++i) {
210+ ((float *) t->data )[i] = v;
211+ }
212+ return t;
213+ }
214+
215+ static ggml_tensor * whisper_set_i32 (struct ggml_tensor * t, int32_t v) {
216+ GGML_ASSERT (t->type == GGML_TYPE_I32);
217+ GGML_ASSERT (ggml_is_contiguous (t));
218+ size_t nels = ggml_nelements (t);
219+ for (int64_t i = 0 ; i < nels; ++i) {
220+ ((int32_t *) t->data )[i] = v;
221+ }
222+ return t;
223+ }
224+
225+ static float whisper_get_f32_nd (const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
226+ GGML_ASSERT (t->type == GGML_TYPE_F32);
227+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
228+ return *(float *) data;
229+ }
230+
231+ static void whisper_set_f32_nd (struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
232+ GGML_ASSERT (t->type == GGML_TYPE_F32);
233+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
234+ *(float *) data = v;
235+ }
236+
237+ static int32_t whisper_get_i32_nd (const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
238+ GGML_ASSERT (t->type == GGML_TYPE_I32);
239+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
240+ return *(int32_t *) data;
241+ }
242+
243+ static void whisper_set_i32_nd (struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
244+ GGML_ASSERT (t->type == GGML_TYPE_I32);
245+ void * data = (char *) t->data + i0*t->nb [0 ] + i1*t->nb [1 ] + i2*t->nb [2 ] + i3*t->nb [3 ];
246+ *(int32_t *) data = v;
247+ }
248+
190249// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
191250// the idea is to represent the original matrix multiplication:
192251//
@@ -1237,6 +1296,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
12371296static ggml_backend_t whisper_backend_init_gpu (const whisper_context_params & params) {
12381297 ggml_log_set (g_state.log_callback , g_state.log_callback_user_data );
12391298
1299+ whisper_load_backends ();
1300+
12401301 ggml_backend_dev_t dev = nullptr ;
12411302
12421303 int cnt = 0 ;
@@ -1294,7 +1355,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
12941355
12951356 GGML_UNUSED (params);
12961357
1297- result.push_back (ggml_backend_cpu_init ( ));
1358+ result.push_back (ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ));
12981359
12991360 return result;
13001361}
@@ -4206,22 +4267,28 @@ static int whisper_has_openvino(void) {
42064267const char * whisper_print_system_info (void ) {
42074268 static std::string s;
42084269
4270+ whisper_load_backends ();
4271+
42094272 s = " " ;
4210- s += " AVX = " + std::to_string (ggml_cpu_has_avx ()) + " | " ;
4211- s += " AVX2 = " + std::to_string (ggml_cpu_has_avx2 ()) + " | " ;
4212- s += " AVX512 = " + std::to_string (ggml_cpu_has_avx512 ()) + " | " ;
4213- s += " FMA = " + std::to_string (ggml_cpu_has_fma ()) + " | " ;
4214- s += " NEON = " + std::to_string (ggml_cpu_has_neon ()) + " | " ;
4215- s += " ARM_FMA = " + std::to_string (ggml_cpu_has_arm_fma ()) + " | " ;
4216- s += " F16C = " + std::to_string (ggml_cpu_has_f16c ()) + " | " ;
4217- s += " FP16_VA = " + std::to_string (ggml_cpu_has_fp16_va ()) + " | " ;
4218- s += " WASM_SIMD = " + std::to_string (ggml_cpu_has_wasm_simd ()) + " | " ;
4219- s += " SSE3 = " + std::to_string (ggml_cpu_has_sse3 ()) + " | " ;
4220- s += " SSSE3 = " + std::to_string (ggml_cpu_has_ssse3 ()) + " | " ;
4221- s += " VSX = " + std::to_string (ggml_cpu_has_vsx ()) + " | " ;
4273+ s += " WHISPER : " ;
42224274 s += " COREML = " + std::to_string (whisper_has_coreml ()) + " | " ;
42234275 s += " OPENVINO = " + std::to_string (whisper_has_openvino ()) + " | " ;
42244276
4277+ for (size_t i = 0 ; i < ggml_backend_reg_count (); i++) {
4278+ auto * reg = ggml_backend_reg_get (i);
4279+ auto * get_features_fn = (ggml_backend_get_features_t ) ggml_backend_reg_get_proc_address (reg, " ggml_backend_get_features" );
4280+ if (get_features_fn) {
4281+ ggml_backend_feature * features = get_features_fn (reg);
4282+ s += ggml_backend_reg_name (reg);
4283+ s += " : " ;
4284+ for (; features->name ; features++) {
4285+ s += features->name ;
4286+ s += " = " ;
4287+ s += features->value ;
4288+ s += " | " ;
4289+ }
4290+ }
4291+ }
42254292 return s.c_str ();
42264293}
42274294
@@ -6653,6 +6720,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
66536720}
66546721
66556722WHISPER_API const char * whisper_bench_ggml_mul_mat_str (int n_threads) {
6723+ whisper_load_backends ();
6724+
66566725 static std::string s;
66576726 s = " " ;
66586727 char strbuf[256 ];
@@ -6672,7 +6741,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
66726741 // c: N*N*sizeof(float)
66736742 // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
66746743 std::vector<uint8_t > buf (3llu*N_max*N_max*sizeof (float ) + 3 *ggml_tensor_overhead () + ggml_graph_overhead ());
6675- std::vector<uint8_t > work;
66766744
66776745 // put a bunch of random data in the buffer
66786746 for (size_t i = 0 ; i < buf.size (); i++) buf[i] = i;
@@ -6729,12 +6797,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
67296797 double tsum = 0.0 ;
67306798
67316799 // heat-up
6732- ggml_graph_compute_helper (gf, work, n_threads, nullptr , nullptr );
6800+ ggml_graph_compute_helper (gf, n_threads, nullptr , nullptr );
67336801
67346802 for (int i = 0 ; i < n_max; ++i) {
67356803 const int64_t t0 = ggml_time_us ();
67366804
6737- ggml_graph_compute_helper (gf, work, n_threads, nullptr , nullptr );
6805+ ggml_graph_compute_helper (gf, n_threads, nullptr , nullptr );
67386806
67396807 const int64_t t1 = ggml_time_us ();
67406808
@@ -7111,18 +7179,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71117179 struct ggml_tensor * cost = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, N + 1 , M + 1 );
71127180 struct ggml_tensor * trace = ggml_new_tensor_2d (ctx, GGML_TYPE_I32, N + 1 , M + 1 );
71137181
7114- cost = ggml_set_f32 (cost, INFINITY);
7115- trace = ggml_set_f32 (trace, -1 );
7116- ggml_set_f32_nd (cost, 0 , 0 , 0 , 0 , 0.0 );
7182+ cost = whisper_set_f32 (cost, INFINITY);
7183+ trace = whisper_set_i32 (trace, -1 );
7184+ whisper_set_f32_nd (cost, 0 , 0 , 0 , 0 , 0.0 );
71177185
71187186 // dtw
71197187 // supposedly can be optimized by computing diagonals in parallel?
71207188 // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
71217189 for (int64_t j = 1 ; j < M + 1 ; ++j) {
71227190 for (int64_t i = 1 ; i < N + 1 ; ++i) {
7123- float c0 = ggml_get_f32_nd (cost, i - 1 , j - 1 , 0 , 0 );
7124- float c1 = ggml_get_f32_nd (cost, i - 1 , j, 0 , 0 );
7125- float c2 = ggml_get_f32_nd (cost, i, j - 1 , 0 , 0 );
7191+ float c0 = whisper_get_f32_nd (cost, i - 1 , j - 1 , 0 , 0 );
7192+ float c1 = whisper_get_f32_nd (cost, i - 1 , j, 0 , 0 );
7193+ float c2 = whisper_get_f32_nd (cost, i, j - 1 , 0 , 0 );
71267194
71277195 float c;
71287196 int32_t t;
@@ -7137,9 +7205,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71377205 t = 2 ;
71387206 }
71397207
7140- c = ggml_get_f32_nd (x, i - 1 , j - 1 , 0 , 0 ) + c;
7141- ggml_set_f32_nd (cost, i, j, 0 , 0 , c);
7142- ggml_set_i32_nd (trace, i, j, 0 , 0 , t);
7208+ c = whisper_get_f32_nd (x, i - 1 , j - 1 , 0 , 0 ) + c;
7209+ whisper_set_f32_nd (cost, i, j, 0 , 0 , c);
7210+ whisper_set_i32_nd (trace, i, j, 0 , 0 , t);
71437211 }
71447212 }
71457213
@@ -7148,19 +7216,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71487216 struct ggml_tensor * bt = ggml_new_tensor_2d (ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2 );
71497217 // trace[0, :] = 2;
71507218 for (int64_t i = 0 ; i < M + 1 ; ++i)
7151- ggml_set_i32_nd (trace, 0 , i, 0 , 0 , 2 );
7219+ whisper_set_i32_nd (trace, 0 , i, 0 , 0 , 2 );
71527220 // trace[:, 0] = 1;
71537221 for (int64_t i = 0 ; i < N + 1 ; ++i)
7154- ggml_set_i32_nd (trace, i, 0 , 0 , 0 , 1 );
7222+ whisper_set_i32_nd (trace, i, 0 , 0 , 0 , 1 );
71557223 int bt_row_idx = BT_MAX_ROWS - 1 ;
71567224 int64_t i = N;
71577225 int64_t j = M;
71587226 while (i > 0 || j > 0 ) {
7159- ggml_set_i32_nd (bt, bt_row_idx, 0 , 0 , 0 , i - 1 );
7160- ggml_set_i32_nd (bt, bt_row_idx, 1 , 0 , 0 , j - 1 );
7227+ whisper_set_i32_nd (bt, bt_row_idx, 0 , 0 , 0 , i - 1 );
7228+ whisper_set_i32_nd (bt, bt_row_idx, 1 , 0 , 0 , j - 1 );
71617229 --bt_row_idx;
71627230
7163- int32_t t = ggml_get_i32_nd (trace, i, j, 0 , 0 );
7231+ int32_t t = whisper_get_i32_nd (trace, i, j, 0 , 0 );
71647232 if (t == 0 ) {
71657233 --i;
71667234 --j;
@@ -7181,8 +7249,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
71817249 ggml_tensor * r = ggml_new_tensor_2d (ctx, GGML_TYPE_I32, 2 , result_n_cols);
71827250 for (int64_t i = 0 ; i < 2 ; ++i) {
71837251 for (int64_t j = 0 ; j < result_n_cols; ++j) {
7184- int32_t v = ggml_get_i32_nd (bt, j+bt_row_idx+1 , i, 0 , 0 );
7185- ggml_set_i32_nd (r, i, j, 0 , 0 , v);
7252+ int32_t v = whisper_get_i32_nd (bt, j+bt_row_idx+1 , i, 0 , 0 );
7253+ whisper_set_i32_nd (r, i, j, 0 , 0 , v);
71867254 }
71877255 }
71887256
@@ -7217,11 +7285,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
72177285 idx = 2 *(a->ne [2 ] - 1 ) - idx;
72187286 }
72197287
7220- filter.push_back (ggml_get_f32_nd (a, i, j, idx, 0 ));
7288+ filter.push_back (whisper_get_f32_nd (a, i, j, idx, 0 ));
72217289 }
72227290 std::sort (filter.begin (), filter.end ());
72237291 const float v = filter[filter.size ()/2 ];
7224- ggml_set_f32_nd (dst, i, j, k, 0 , v);
7292+ whisper_set_f32_nd (dst, i, j, k, 0 , v);
72257293 filter.clear ();
72267294 }
72277295 }
@@ -7343,7 +7411,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
73437411 // Compute
73447412 struct ggml_cgraph * gf = ggml_new_graph (gctx);
73457413 ggml_build_forward_expand (gf, w);
7346- ggml_graph_compute_with_ctx (gctx, gf, n_threads);
7414+
7415+ ggml_backend_ptr backend { ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ) };
7416+ ggml_backend_graph_compute (backend.get (), gf);
73477417
73487418 ggml_tensor * alignment = dtw_and_backtrace (gctx, w);
73497419
@@ -7352,9 +7422,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
73527422 auto seg_i = state->result_all .begin () + i_segment;
73537423 auto tok_i = seg_i->tokens .begin ();
73547424 for (int i = 0 ; i < alignment->ne [1 ]; ++i) {
7355- int32_t v = ggml_get_i32_nd (alignment, 0 , i, 0 , 0 );
7425+ int32_t v = whisper_get_i32_nd (alignment, 0 , i, 0 , 0 );
73567426 if (v != last_v) {
7357- int32_t time_index = ggml_get_i32_nd (alignment, 1 , i, 0 , 0 );
7427+ int32_t time_index = whisper_get_i32_nd (alignment, 1 , i, 0 , 0 );
73587428 int64_t timestamp = (time_index * 2 ) + seek; // Each index on DTW result = 20mS audio
73597429 last_v = v;
73607430
0 commit comments