@@ -956,6 +956,15 @@ struct whisper_state {
956956
957957 // [EXPERIMENTAL] speed-up techniques
958958 int32_t exp_n_audio_ctx = 0 ; // 0 - use default
959+
// Bookkeeping for one speech segment kept by VAD filtering. Each entry pairs
// the segment's position in the original audio with its position in the
// filtered (speech-only) sample buffer, so timestamps can be mapped back.
struct vad_segment_info {
    // Position in the original audio, copied from the VAD timestamps
    // (segments[i].start / segments[i].end).
    // NOTE(review): the mapping code multiplies these by 100 to get result
    // timestamps, so they are presumably seconds - confirm against the VAD API.
    float orig_start;
    float orig_end;

    // Position in the filtered buffer: sample offset divided by
    // WHISPER_SAMPLE_RATE at the start and end of the copied segment.
    float vad_start;
    float vad_end;
};
966+ std::vector<vad_segment_info> vad_segments;
967+ bool has_vad_segments = false ;
959968};
960969
961970struct whisper_context {
@@ -6703,6 +6712,10 @@ int whisper_full_with_state(
67036712 struct whisper_vad_timestamps timestamps = whisper_vad_detect_speech_timestamps (vctx, vad_params, samples, n_samples);
67046713
67056714 if (timestamps.n_segments > 0 ) {
6715+ state->has_vad_segments = true ;
6716+ ctx->state ->vad_segments .clear ();
6717+ ctx->state ->vad_segments .reserve (timestamps.n_segments );
6718+
67066719 WHISPER_LOG_INFO (" %s: detected %d speech segments\n " , __func__, timestamps.n_segments );
67076720 float overlap_seconds = params.vad_samples_overlap ;
67086721 int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
@@ -6752,6 +6765,19 @@ int whisper_full_with_state(
67526765 int segment_length = segment_end_samples - segment_start_samples;
67536766
67546767 if (segment_length > 0 ) {
6768+ whisper_state::vad_segment_info segment;
6769+
6770+ segment.orig_start = timestamps.segments [i].start ;
6771+ segment.orig_end = timestamps.segments [i].end ;
6772+
6773+ segment.vad_start = offset / (float )WHISPER_SAMPLE_RATE;
6774+ segment.vad_end = (offset + segment_length) / (float )WHISPER_SAMPLE_RATE;
6775+
6776+
6777+ WHISPER_LOG_INFO (" %s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n " ,
6778+ __func__, segment.orig_start , segment.orig_end , segment.vad_start , segment.vad_end );
6779+ ctx->state ->vad_segments .push_back (segment);
6780+
67556781 // Copy this speech segment
67566782 memcpy (filtered_samples + offset, samples + segment_start_samples, segment_length * sizeof (float ));
67576783 offset += segment_length;
@@ -7826,19 +7852,132 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
78267852}
78277853
78287854int64_t whisper_full_get_segment_t0_from_state (struct whisper_state * state, int i_segment) {
7829- return state->result_all [i_segment].t0 ;
7855+ // If VAD wasn't used, return the original timestamp
7856+ if (!state->has_vad_segments || state->vad_segments .empty ()) {
7857+ return state->result_all [i_segment].t0 ;
7858+ }
7859+
7860+ // For the first segment, always start at 0
7861+ if (i_segment == 0 ) {
7862+ return 0 ;
7863+ }
7864+
7865+ // Get the start timestamp produced by whisper_full. whisper_full processes
7866+ // only the speech segments in this case so we need to map these timestamps
7867+ // back to the original audio.
7868+ float t0 = state->result_all [i_segment].t0 / 100 .0f ;
7869+
7870+ // Find which VAD segment this timestamp belongs.
7871+ for (size_t i = 0 ; i < state->vad_segments .size (); i++) {
7872+ const auto & segment = state->vad_segments [i];
7873+
7874+ // Check if the timestamp falls within this segment.
7875+ if (t0 >= segment.vad_start && t0 <= segment.vad_end ) {
7876+ float proportion = 0 .0f ;
7877+ if (segment.vad_end > segment.vad_start ) {
7878+ proportion = (t0 - segment.vad_start ) / (segment.vad_end - segment.vad_start );
7879+ }
7880+ float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start );
7881+ return (int64_t )(orig_t0 * 100 );
7882+ }
7883+ }
7884+
7885+ // Check if the timestamp falls between two segments.
7886+ for (size_t i = 0 ; i < state->vad_segments .size () - 1 ; i++) {
7887+ const auto & curr = state->vad_segments [i];
7888+ const auto & next = state->vad_segments [i + 1 ];
7889+
7890+ if (t0 > curr.vad_end && t0 < next.vad_start ) {
7891+ // Calculate how far we are through the gap as a proportion
7892+ float gap_proportion = 0 .0f ;
7893+ if (next.vad_start > curr.vad_end ) {
7894+ gap_proportion = (t0 - curr.vad_end ) / (next.vad_start - curr.vad_end );
7895+ }
7896+ float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end );
7897+ return (int64_t )(orig_t0 * 100 );
7898+ }
7899+ }
7900+
7901+ // Handle the case where the timestamp is after the last segment.
7902+ if (t0 > state->vad_segments .back ().vad_end ) {
7903+ // For timestamps after the last segment, add the extra time to the end of the last segment
7904+ const auto & last = state->vad_segments .back ();
7905+ // Calculate how far beyond the last segment
7906+ float extra_time = t0 - last.vad_end ;
7907+ // Add this extra time to the original end time
7908+ float orig_t0 = last.orig_end + extra_time;
7909+ return (int64_t )(orig_t0 * 100 );
7910+ }
7911+
7912+ WHISPER_LOG_WARN (" %s: Could not map t0 = %f to a VAD segment\n " , __func__, t0);
7913+ return t0;
78307914}
78317915
78327916int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment) {
7833- return ctx->state -> result_all [ i_segment]. t0 ;
7917+ return whisper_full_get_segment_t0_from_state ( ctx->state , i_segment) ;
78347918}
78357919
78367920int64_t whisper_full_get_segment_t1_from_state (struct whisper_state * state, int i_segment) {
7837- return state->result_all [i_segment].t1 ;
7921+ // If VAD wasn't used, return the original timestamp
7922+ if (!state->has_vad_segments || state->vad_segments .empty ()) {
7923+ return state->result_all [i_segment].t1 ;
7924+ }
7925+
7926+ // Get the end timestamp produced by whisper_full. whisper_full processes
7927+ // only the speech segments in this case so we need to map these timestamps
7928+ // back to the original audio.
7929+ float t1 = state->result_all [i_segment].t1 / 100 .0f ;
7930+
7931+ // Find which VAD segment this timestamp belongs.
7932+ for (size_t i = 0 ; i < state->vad_segments .size (); i++) {
7933+ const auto & segment = state->vad_segments [i];
7934+
7935+ // Check if the timestamp falls within this segment.
7936+ if (t1 >= segment.vad_start && t1 <= segment.vad_end ) {
7937+ // Calculate the proportion through the filtered segment.
7938+ float proportion = 0 .0f ;
7939+ if (segment.vad_end > segment.vad_start ) {
7940+ proportion = (t1 - segment.vad_start ) / (segment.vad_end - segment.vad_start );
7941+ }
7942+ float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start );
7943+ return (int64_t )(orig_t1 * 100 );
7944+ }
7945+ }
7946+
7947+ // Check if the timestamp falls between two segments.
7948+ for (size_t i = 0 ; i < state->vad_segments .size () - 1 ; i++) {
7949+ const auto & curr = state->vad_segments [i];
7950+ const auto & next = state->vad_segments [i + 1 ];
7951+
7952+ if (t1 > curr.vad_end && t1 < next.vad_start ) {
7953+ // Calculate how far we are through the gap as a proportion
7954+ float gap_proportion = 0 .0f ;
7955+ if (next.vad_start > curr.vad_end ) {
7956+ gap_proportion = (t1 - curr.vad_end ) / (next.vad_start - curr.vad_end );
7957+ }
7958+ // Map to the corresponding position in the original gap
7959+ float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end );
7960+ return (int64_t )(orig_t1 * 100 );
7961+ }
7962+ }
7963+
7964+ // Handle the case where the timestamp is after the last segment
7965+ if (t1 > state->vad_segments .back ().vad_end ) {
7966+ // For the last segment, use the end of the last VAD segment
7967+ const auto & last = state->vad_segments .back ();
7968+ // Calculate how far beyond the last segment
7969+ float extra_time = t1 - last.vad_end ;
7970+ // Add this extra time to the original end time
7971+ float orig_t1 = last.orig_end + extra_time;
7972+ return (int64_t )(orig_t1 * 100 );
7973+ }
7974+
7975+ WHISPER_LOG_WARN (" %s: Could not map t1 = %f to a VAD segment\n " , __func__, t1);
7976+ return t1;
78387977}
78397978
78407979int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment) {
7841- return ctx->state -> result_all [ i_segment]. t1 ;
7980+ return whisper_full_get_segment_t1_from_state ( ctx->state , i_segment) ;
78427981}
78437982
78447983bool whisper_full_get_segment_speaker_turn_next_from_state (struct whisper_state * state, int i_segment) {
0 commit comments