From 43ee3be2605c94d807f85942fdfb7186d6abdf92 Mon Sep 17 00:00:00 2001
From: fengjiao_amdeng
Date: Thu, 9 Oct 2025 15:05:07 +0800
Subject: [PATCH] token: Add interfaces for getting the start and end time of a
 specified token, similar to segments

---
 include/whisper.h |  9 +++++++++
 src/whisper.cpp   | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/include/whisper.h b/include/whisper.h
index fcd756a9fe2..adb7cca476e 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -670,6 +670,15 @@ extern "C" {
     WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
     WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
 
+    // Get the start and end time of the specified token.
+    // When the state has VAD segments and a non-empty VAD mapping table, the time is passed through that table.
+    // The *_from_state variants take a pointer to the token data rather than segment/token indices.
+    WHISPER_API int64_t whisper_full_get_token_t0           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, struct whisper_token_data * token);
+
+    WHISPER_API int64_t whisper_full_get_token_t1           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, struct whisper_token_data * token);
+
     //
     // Voice Activity Detection (VAD)
     //
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 39c53ba233a..4bddecefbe7 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -8001,6 +8001,54 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
     return ctx->state->result_all[i_segment].tokens[i_token].p;
 }
 
+// Start time of a token. When the state has VAD segments and a non-empty
+// mapping table, token->t0 is passed through map_processed_to_original_time();
+// otherwise token->t0 is returned unchanged.
+int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, struct whisper_token_data * token) {
+    // No VAD mapping to apply: return the timestamp as stored in the token
+    if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
+        return token->t0;
+    }
+
+    return map_processed_to_original_time(token->t0, state->vad_mapping_table);
+}
+
+// End time of a token, mapped the same way as the start time. When the mapping
+// is applied, the result is kept at least min_duration after the mapped start.
+int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, struct whisper_token_data * token) {
+    // No VAD mapping to apply: return the timestamp as stored in the token
+    if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
+        return token->t1;
+    }
+
+    int64_t orig_t1 = map_processed_to_original_time(token->t1, state->vad_mapping_table);
+
+    // Mapped start time of the same token, used as the lower bound for t1
+    const int64_t orig_t0 = whisper_full_get_token_t0_from_state(state, token);
+
+    // Ensure minimum duration to prevent zero-length token
+    // NOTE(review): the value is in the same unit as t0/t1; confirm that this
+    // unit really makes it the "10ms minimum" the original comment stated
+    const int64_t min_duration = 10;
+    if (orig_t1 - orig_t0 < min_duration) {
+        orig_t1 = orig_t0 + min_duration;
+    }
+
+    return orig_t1;
+}
+
+// Convenience wrappers: fetch the token via whisper_full_get_token_data() and
+// forward a pointer to that copy, together with ctx->state, to the *_from_state variants.
+int64_t whisper_full_get_token_t0(struct whisper_context * ctx, int i_segment, int i_token) {
+    whisper_token_data token = whisper_full_get_token_data(ctx, i_segment, i_token);
+    return whisper_full_get_token_t0_from_state(ctx->state, &token);
+}
+
+int64_t whisper_full_get_token_t1(struct whisper_context * ctx, int i_segment, int i_token) {
+    whisper_token_data token = whisper_full_get_token_data(ctx, i_segment, i_token);
+    return whisper_full_get_token_t1_from_state(ctx->state, &token);
+}
+
 float whisper_full_get_segment_no_speech_prob(struct whisper_context * ctx, int i_segment) {
     return ctx->state->result_all[i_segment].no_speech_prob;
 }