@@ -170,7 +170,6 @@ static bool ggml_graph_compute_helper(
170170 int n_threads,
171171 ggml_abort_callback abort_callback,
172172 void * abort_callback_data) {
173-
174173 ggml_backend_ptr backend { ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ) };
175174
176175 auto * reg = ggml_backend_dev_backend_reg (ggml_backend_get_device (backend.get ()));
@@ -191,8 +190,8 @@ static bool ggml_graph_compute_helper(
191190static bool ggml_graph_compute_helper (
192191 ggml_backend_sched_t sched,
193192 struct ggml_cgraph * graph,
194- int n_threads) {
195-
193+ int n_threads,
194+ bool sched_reset = true ) {
196195 for (int i = 0 ; i < ggml_backend_sched_get_n_backends (sched); ++i) {
197196 ggml_backend_t backend = ggml_backend_sched_get_backend (sched, i);
198197 ggml_backend_dev_t dev = ggml_backend_get_device (backend);
@@ -204,8 +203,12 @@ static bool ggml_graph_compute_helper(
204203 }
205204 }
206205
207- bool t = ggml_backend_sched_graph_compute (sched, graph) == GGML_STATUS_SUCCESS;
208- ggml_backend_sched_reset (sched);
206+ const bool t = (ggml_backend_sched_graph_compute (sched, graph) == GGML_STATUS_SUCCESS);
207+
208+ if (!t || sched_reset) {
209+ ggml_backend_sched_reset (sched);
210+ }
211+
209212 return t;
210213}
211214
@@ -4421,6 +4424,10 @@ struct whisper_vad_state {
44214424 struct ggml_tensor * h_state;
44224425 struct ggml_tensor * c_state;
44234426
4427+ ggml_backend_buffer_t buffer = nullptr ;
4428+
4429+ std::vector<uint8_t > ctx_buf;
4430+
44244431 whisper_sched sched;
44254432};
44264433
@@ -4443,9 +4450,7 @@ struct whisper_vad_context {
44434450struct whisper_vad_context_params whisper_vad_default_context_params (void ) {
44444451 whisper_vad_context_params result = {
44454452 /* .n_thread = */ 4 ,
4446- // TODO(danbev) Default to true when CUDA GPU support is working:
4447- // https://github.com/ggml-org/whisper.cpp/pull/3065#issuecomment-2858583911
4448- /* .use_gpu = */ false ,
4453+ /* .use_gpu = */ true ,
44494454 /* .gpu_device = */ 0 ,
44504455 };
44514456 return result;
@@ -4601,6 +4606,9 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
46014606
46024607 // Create add operation to get preactivations for all gates.
46034608 struct ggml_tensor * out_gate = ggml_add (ctx0, inp_gate, hid_gate);
4609+
4610+ ggml_build_forward_expand (gf, out_gate);
4611+
46044612 const size_t hdim_size = ggml_row_size (out_gate->type , hdim);
46054613
46064614 // Create sigmoid for input gate (using the first 128 bytes from the preactivations).
@@ -4623,12 +4631,13 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
46234631
46244632 // Update hidden state
46254633 struct ggml_tensor * out = ggml_mul (ctx0, o_t , ggml_tanh (ctx0, c_out));
4626- ggml_build_forward_expand (gf, ggml_cpy (ctx0, out, vctx.state ->h_state ));
4634+ ggml_build_forward_expand (gf, ggml_cpy (ctx0, out, vctx.state ->h_state ));
4635+
46274636 return out;
46284637}
46294638
46304639static struct ggml_cgraph * whisper_vad_build_graph (whisper_vad_context & vctx) {
4631- const auto & model = vctx.model ;
4640+ const auto & model = vctx.model ;
46324641
46334642 struct ggml_init_params params = {
46344643 /* .mem_size =*/ vctx.state ->sched .meta .size (),
@@ -4677,22 +4686,28 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
46774686 auto whisper_context_params = whisper_context_default_params ();
46784687 whisper_context_params.use_gpu = vctx->params .use_gpu ;
46794688 whisper_context_params.gpu_device = vctx->params .gpu_device ;
4689+
46804690 state->backends = whisper_backend_init (whisper_context_params);
46814691 if (state->backends .empty ()) {
46824692 WHISPER_LOG_ERROR (" %s: whisper_backend_init() failed\n " , __func__);
46834693 whisper_vad_free_state (state);
46844694 return nullptr ;
46854695 }
46864696
4687- int32_t lstm_hidden_size = vctx->model .hparams .lstm_hidden_size ;
4697+ const int32_t lstm_hidden_size = vctx->model .hparams .lstm_hidden_size ;
4698+
4699+ state->ctx_buf .resize (2u *ggml_tensor_overhead ());
4700+
46884701 struct ggml_init_params params = {
4689- /* .mem_size =*/ size_t ( 2u *lstm_hidden_size* ggml_tensor_overhead () ),
4690- /* .mem_buffer =*/ NULL ,
4702+ /* .mem_size =*/ state-> ctx_buf . size ( ),
4703+ /* .mem_buffer =*/ state-> ctx_buf . data () ,
46914704 /* .no_alloc =*/ true ,
46924705 };
4706+
46934707 ggml_context * ctx = ggml_init (params);
46944708 if (!ctx) {
46954709 WHISPER_LOG_ERROR (" %s: failed to init LSTM state ggml context\n " , __func__);
4710+ whisper_vad_free_state (state);
46964711 return nullptr ;
46974712 }
46984713
@@ -4704,6 +4719,13 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
47044719 state->c_state = ggml_new_tensor_1d (ctx, GGML_TYPE_F32, lstm_hidden_size);
47054720 ggml_set_name (state->c_state , " c_state" );
47064721
4722+ state->buffer = ggml_backend_alloc_ctx_tensors (ctx, state->backends [0 ]);
4723+ if (!state->buffer ) {
4724+ WHISPER_LOG_ERROR (" %s: failed to allocate memory for the VAD state\n " , __func__);
4725+ whisper_vad_free_state (state);
4726+ return nullptr ;
4727+ }
4728+
47074729 {
47084730 bool ok = whisper_sched_graph_init (state->sched , state->backends ,
47094731 [&]() {
@@ -5106,11 +5128,20 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51065128 if (n_samples % vctx->n_window != 0 ) {
51075129 n_chunks += 1 ; // Add one more chunk for remaining samples.
51085130 }
5109- auto & sched = vctx->state ->sched .sched ;
5110-
51115131 WHISPER_LOG_INFO (" %s: detecting speech in %d samples\n " , __func__, n_samples);
51125132 WHISPER_LOG_INFO (" %s: n_chunks: %d\n " , __func__, n_chunks);
51135133
5134+ // Reset LSTM hidden/cell states
5135+ ggml_backend_buffer_clear (vctx->state ->buffer , 0 );
5136+
5137+ // TODO: move to vad state and change to std::vector<float>
5138+ float * probs = new float [n_chunks];
5139+ WHISPER_LOG_INFO (" %s: props size: %u\n " , __func__, n_chunks);
5140+
5141+ std::vector<float > window (vctx->n_window , 0 .0f );
5142+
5143+ auto & sched = vctx->state ->sched .sched ;
5144+
51145145 ggml_cgraph * gf = whisper_vad_build_graph (*vctx);
51155146
51165147 if (!ggml_backend_sched_alloc_graph (sched, gf)) {
@@ -5120,19 +5151,13 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51205151
51215152 struct ggml_tensor * frame = ggml_graph_get_tensor (gf, " frame" );
51225153 struct ggml_tensor * prob = ggml_graph_get_tensor (gf, " prob" );
5123- ggml_set_zero (prob);
51245154
5125- // Reset LSTM hidden/cell states
5126- ggml_set_zero (vctx->state ->h_state );
5127- ggml_set_zero (vctx->state ->c_state );
5128-
5129- float * probs= new float [n_chunks];
5130- WHISPER_LOG_INFO (" %s: props size: %u\n " , __func__, n_chunks);
5131-
5132- std::vector<float > window (vctx->n_window , 0 .0f );
5155+ // we are going to reuse the graph multiple times for each chunk
5156+ // TODO: measure time and print timing information for this step
51335157 for (int i = 0 ; i < n_chunks; i++) {
51345158 int start_idx = i * vctx->n_window ;
51355159 int end_idx = std::min (start_idx + vctx->n_window , n_samples);
5160+
51365161 int chunk_len = end_idx - start_idx;
51375162
51385163 if (chunk_len < vctx->n_window ) {
@@ -5150,28 +5175,33 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51505175 } else {
51515176 // Copy current frame samples to the window.
51525177 int samples_to_copy = std::min (end_idx - start_idx, vctx->n_window );
5153- std::copy (pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
5154- window.begin ());
5178+ std::copy (pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy, window.begin ());
51555179 }
51565180
51575181 // Set the frame tensor data with the samples.
51585182 ggml_backend_tensor_set (frame, window.data (), 0 , ggml_nelements (frame) * sizeof (float ));
51595183
5160- if (!ggml_graph_compute_helper (sched, gf, vctx->n_threads )) {
5184+ // do not reset the scheduler - we will reuse the graph in the next chunk
5185+ if (!ggml_graph_compute_helper (sched, gf, vctx->n_threads , false )) {
51615186 WHISPER_LOG_ERROR (" %s: failed to compute VAD graph\n " , __func__);
51625187 break ;
51635188 }
51645189
51655190 // Get the probability for this chunk.
51665191 ggml_backend_tensor_get (prob, &probs[i], 0 , sizeof (float ));
51675192
5193+ // WHISPER_LOG_DEBUG("chunk %d: p = %7.3f\n", i, probs[i]);
51685194 }
5195+
5196+ ggml_backend_sched_reset (sched);
5197+
51695198 WHISPER_LOG_INFO (" %s: finished processing %d samples\n " , __func__, n_samples);
51705199
51715200 struct whisper_vad_speech speech = {
51725201 /* n_probs = */ n_chunks,
51735202 /* probs = */ probs,
51745203 };
5204+
51755205 return speech;
51765206}
51775207
0 commit comments