@@ -170,7 +170,6 @@ static bool ggml_graph_compute_helper(
170170 int n_threads,
171171 ggml_abort_callback abort_callback,
172172 void * abort_callback_data) {
173-
174173 ggml_backend_ptr backend { ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr ) };
175174
176175 auto * reg = ggml_backend_dev_backend_reg (ggml_backend_get_device (backend.get ()));
@@ -191,8 +190,8 @@ static bool ggml_graph_compute_helper(
191190static bool ggml_graph_compute_helper (
192191 ggml_backend_sched_t sched,
193192 struct ggml_cgraph * graph,
194- int n_threads) {
195-
193+ int n_threads,
194+ bool sched_reset = true ) {
196195 for (int i = 0 ; i < ggml_backend_sched_get_n_backends (sched); ++i) {
197196 ggml_backend_t backend = ggml_backend_sched_get_backend (sched, i);
198197 ggml_backend_dev_t dev = ggml_backend_get_device (backend);
@@ -204,8 +203,12 @@ static bool ggml_graph_compute_helper(
204203 }
205204 }
206205
207- bool t = ggml_backend_sched_graph_compute (sched, graph) == GGML_STATUS_SUCCESS;
208- ggml_backend_sched_reset (sched);
206+ const bool t = (ggml_backend_sched_graph_compute (sched, graph) == GGML_STATUS_SUCCESS);
207+
208+ if (!t || sched_reset) {
209+ ggml_backend_sched_reset (sched);
210+ }
211+
209212 return t;
210213}
211214
@@ -4421,6 +4424,10 @@ struct whisper_vad_state {
44214424 struct ggml_tensor * h_state;
44224425 struct ggml_tensor * c_state;
44234426
4427+ ggml_backend_buffer_t buffer = nullptr ;
4428+
4429+ std::vector<uint8_t > ctx_buf;
4430+
44244431 whisper_sched sched;
44254432};
44264433
@@ -4443,9 +4450,7 @@ struct whisper_vad_context {
44434450struct whisper_vad_context_params whisper_vad_default_context_params (void ) {
44444451 whisper_vad_context_params result = {
44454452 /* .n_thread = */ 4 ,
4446- // TODO(danbev) Default to true when CUDA GPU support is working:
4447- // https://github.com/ggml-org/whisper.cpp/pull/3065#issuecomment-2858583911
4448- /* .use_gpu = */ false ,
4453+ /* .use_gpu = */ true ,
44494454 /* .gpu_device = */ 0 ,
44504455 };
44514456 return result;
@@ -4601,6 +4606,7 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
46014606
46024607 // Create add operation to get preactivations for all gates.
46034608 struct ggml_tensor * out_gate = ggml_add (ctx0, inp_gate, hid_gate);
4609+
46044610 const size_t hdim_size = ggml_row_size (out_gate->type , hdim);
46054611
46064612 // Create sigmoid for input gate (using the first 128 bytes from the preactivations).
@@ -4623,12 +4629,13 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
46234629
46244630 // Update hidden state
46254631 struct ggml_tensor * out = ggml_mul (ctx0, o_t , ggml_tanh (ctx0, c_out));
4626- ggml_build_forward_expand (gf, ggml_cpy (ctx0, out, vctx.state ->h_state ));
4632+ ggml_build_forward_expand (gf, ggml_cpy (ctx0, out, vctx.state ->h_state ));
4633+
46274634 return out;
46284635}
46294636
46304637static struct ggml_cgraph * whisper_vad_build_graph (whisper_vad_context & vctx) {
4631- const auto & model = vctx.model ;
4638+ const auto & model = vctx.model ;
46324639
46334640 struct ggml_init_params params = {
46344641 /* .mem_size =*/ vctx.state ->sched .meta .size (),
@@ -4677,22 +4684,28 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
46774684 auto whisper_context_params = whisper_context_default_params ();
46784685 whisper_context_params.use_gpu = vctx->params .use_gpu ;
46794686 whisper_context_params.gpu_device = vctx->params .gpu_device ;
4687+
46804688 state->backends = whisper_backend_init (whisper_context_params);
46814689 if (state->backends .empty ()) {
46824690 WHISPER_LOG_ERROR (" %s: whisper_backend_init() failed\n " , __func__);
46834691 whisper_vad_free_state (state);
46844692 return nullptr ;
46854693 }
46864694
4687- int32_t lstm_hidden_size = vctx->model .hparams .lstm_hidden_size ;
4695+ const int32_t lstm_hidden_size = vctx->model .hparams .lstm_hidden_size ;
4696+
4697+ state->ctx_buf .resize (2u *ggml_tensor_overhead ());
4698+
46884699 struct ggml_init_params params = {
4689- /* .mem_size =*/ size_t ( 2u *lstm_hidden_size* ggml_tensor_overhead () ),
4690- /* .mem_buffer =*/ NULL ,
4700+ /* .mem_size =*/ state-> ctx_buf . size ( ),
4701+ /* .mem_buffer =*/ state-> ctx_buf . data () ,
46914702 /* .no_alloc =*/ true ,
46924703 };
4704+
46934705 ggml_context * ctx = ggml_init (params);
46944706 if (!ctx) {
46954707 WHISPER_LOG_ERROR (" %s: failed to init LSTM state ggml context\n " , __func__);
4708+ whisper_vad_free_state (state);
46964709 return nullptr ;
46974710 }
46984711
@@ -4704,6 +4717,13 @@ struct whisper_vad_state * whisper_vad_init_state(whisper_vad_context * vctx) {
47044717 state->c_state = ggml_new_tensor_1d (ctx, GGML_TYPE_F32, lstm_hidden_size);
47054718 ggml_set_name (state->c_state , " c_state" );
47064719
4720+ state->buffer = ggml_backend_alloc_ctx_tensors (ctx, state->backends [0 ]);
4721+ if (!state->buffer ) {
4722+ WHISPER_LOG_ERROR (" %s: failed to allocate memory for the VAD state\n " , __func__);
4723+ whisper_vad_free_state (state);
4724+ return nullptr ;
4725+ }
4726+
47074727 {
47084728 bool ok = whisper_sched_graph_init (state->sched , state->backends ,
47094729 [&]() {
@@ -5106,11 +5126,20 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51065126 if (n_samples % vctx->n_window != 0 ) {
51075127 n_chunks += 1 ; // Add one more chunk for remaining samples.
51085128 }
5109- auto & sched = vctx->state ->sched .sched ;
5110-
51115129 WHISPER_LOG_INFO (" %s: detecting speech in %d samples\n " , __func__, n_samples);
51125130 WHISPER_LOG_INFO (" %s: n_chunks: %d\n " , __func__, n_chunks);
51135131
5132+ // Reset LSTM hidden/cell states
5133+ ggml_backend_buffer_clear (vctx->state ->buffer , 0 );
5134+
5135+ // TODO: move to vad state and change to std::vector<float>
5136+ float * probs = new float [n_chunks];
5137+ WHISPER_LOG_INFO (" %s: props size: %u\n " , __func__, n_chunks);
5138+
5139+ std::vector<float > window (vctx->n_window , 0 .0f );
5140+
5141+ auto & sched = vctx->state ->sched .sched ;
5142+
51145143 ggml_cgraph * gf = whisper_vad_build_graph (*vctx);
51155144
51165145 if (!ggml_backend_sched_alloc_graph (sched, gf)) {
@@ -5120,19 +5149,13 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51205149
51215150 struct ggml_tensor * frame = ggml_graph_get_tensor (gf, " frame" );
51225151 struct ggml_tensor * prob = ggml_graph_get_tensor (gf, " prob" );
5123- ggml_set_zero (prob);
51245152
5125- // Reset LSTM hidden/cell states
5126- ggml_set_zero (vctx->state ->h_state );
5127- ggml_set_zero (vctx->state ->c_state );
5128-
5129- float * probs= new float [n_chunks];
5130- WHISPER_LOG_INFO (" %s: props size: %u\n " , __func__, n_chunks);
5131-
5132- std::vector<float > window (vctx->n_window , 0 .0f );
5153+ // we are going to reuse the graph multiple times for each chunk
5154+ // TODO: measure time and print timing information for this step
51335155 for (int i = 0 ; i < n_chunks; i++) {
51345156 int start_idx = i * vctx->n_window ;
51355157 int end_idx = std::min (start_idx + vctx->n_window , n_samples);
5158+
51365159 int chunk_len = end_idx - start_idx;
51375160
51385161 if (chunk_len < vctx->n_window ) {
@@ -5150,28 +5173,33 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51505173 } else {
51515174 // Copy current frame samples to the window.
51525175 int samples_to_copy = std::min (end_idx - start_idx, vctx->n_window );
5153- std::copy (pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
5154- window.begin ());
5176+ std::copy (pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy, window.begin ());
51555177 }
51565178
51575179 // Set the frame tensor data with the samples.
51585180 ggml_backend_tensor_set (frame, window.data (), 0 , ggml_nelements (frame) * sizeof (float ));
51595181
5160- if (!ggml_graph_compute_helper (sched, gf, vctx->n_threads )) {
5182+ // do not reset the scheduler - we will reuse the graph in the next chunk
5183+ if (!ggml_graph_compute_helper (sched, gf, vctx->n_threads , false )) {
51615184 WHISPER_LOG_ERROR (" %s: failed to compute VAD graph\n " , __func__);
51625185 break ;
51635186 }
51645187
51655188 // Get the probability for this chunk.
51665189 ggml_backend_tensor_get (prob, &probs[i], 0 , sizeof (float ));
51675190
5191+ // WHISPER_LOG_DEBUG("chunk %d: p = %7.3f\n", i, probs[i]);
51685192 }
5193+
5194+ ggml_backend_sched_reset (sched);
5195+
51695196 WHISPER_LOG_INFO (" %s: finished processing %d samples\n " , __func__, n_samples);
51705197
51715198 struct whisper_vad_speech speech = {
51725199 /* n_probs = */ n_chunks,
51735200 /* probs = */ probs,
51745201 };
5202+
51755203 return speech;
51765204}
51775205
0 commit comments