Skip to content

Commit c5e33f4

Browse files
committed
whisper-cli : align token timestamps with VAD ts
This commit aligns the token timestamps with the VAD timestamps when VAD is enabled. The motivation for this is that the token timestamps currently reported in the full JSON output are the timestamps that whisper sees after the VAD has processed the audio. This means that whisper only sees the possibly filtered audio, and the token timestamps are relative to the filtered audio, not the original audio. For the segment timestamps we map/align them with the original timestamps, but this is not currently done for the token timestamps, which is what this commit aims to address. Resolves: #3174
1 parent 0083335 commit c5e33f4

File tree

3 files changed

+13
-1
lines changed

3 files changed

+13
-1
lines changed

examples/cli/cli.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,13 @@ static void output_json(
727727
value_s("text", whisper_token_to_str(ctx, token.id), false);
728728
if(token.t0 > -1 && token.t1 > -1) {
729729
// If we have per-token timestamps, write them out
730-
times_o(token.t0, token.t1, false);
730+
if (params.vad) {
731+
times_o(vad_ts_to_original_ts(token.t0, ctx),
732+
vad_ts_to_original_ts(token.t1, ctx),
733+
false);
734+
} else {
735+
times_o(token.t0, token.t1, false);
736+
}
731737
}
732738
value_i("id", token.id, false);
733739
value_f("p", token.p, false);

include/whisper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,8 @@ extern "C" {
712712
WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment);
713713
WHISPER_API float whisper_vad_segments_get_segment_t1(struct whisper_vad_segments * segments, int i_segment);
714714

715+
WHISPER_API int64_t vad_ts_to_original_ts(int64_t vad_ts, struct whisper_context * ctx);
716+
715717
WHISPER_API void whisper_vad_free_segments(struct whisper_vad_segments * segments);
716718
WHISPER_API void whisper_vad_free (struct whisper_vad_context * ctx);
717719

src/whisper.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7968,6 +7968,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
79687968
return whisper_full_get_segment_t1_from_state(ctx->state, i_segment);
79697969
}
79707970

7971+
int64_t vad_ts_to_original_ts(int64_t vad_ts, struct whisper_context * ctx) {
7972+
return map_processed_to_original_time(vad_ts, ctx->state->vad_mapping_table);
7973+
}
7974+
79717975
bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment) {
79727976
return state->result_all[i_segment].speaker_turn_next;
79737977
}

0 commit comments

Comments
 (0)