Skip to content

Commit 705db0f

Browse files
authored
whisper : fix VAD processing for skipped audio segments (#3230)
This commit addresses an issue with token timestamps when audio segments are skipped. The issue is in `whisper_exp_compute_token_level_timestamps` and is related to the VAD processing and the energy levels.

The motivation for this is that the token timestamps exceed the energy array bounds due to segment timing misalignment:

```console
          (skipped introduction)
                    ↓
Audio segment:     [2600ms → 5600ms]  (3 seconds of actual audio)
Energy array:      [0 → 480652]       (samples for 3 seconds)
Token timestamps:  [3266ms → 3408ms]  (absolute timestamps)
```

So both `s0` and `s1` get clamped to the maximum sample index (480652), which causes the start/end timestamps to be the same for all the tokens after a certain point.

This is addressed by using segment-relative timestamps in the `timestamp_to_sample` and `sample_to_timestamp` functions.
1 parent 0a4d85c commit 705db0f

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

src/whisper.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8325,10 +8325,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
83258325
// token-level timestamps
83268326
//
83278327

8328-
static int timestamp_to_sample(int64_t t, int n_samples) {
8329-
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
8330-
}
8331-
83328328
static int64_t sample_to_timestamp(int i_sample) {
83338329
return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
83348330
}
@@ -8378,6 +8374,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
83788374
return result;
83798375
}
83808376

8377+
static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
8378+
// Convert absolute timestamp to segment-relative timestamp
8379+
int64_t relative_t = t - segment_t0;
8380+
int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100);
8381+
return std::max(0, std::min(n_samples - 1, sample));
8382+
}
8383+
8384+
static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
8385+
int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE;
8386+
return relative_timestamp + segment_t0;
8387+
}
8388+
83818389
static void whisper_exp_compute_token_level_timestamps(
83828390
struct whisper_context & ctx,
83838391
struct whisper_state & state,
@@ -8518,8 +8526,8 @@ static void whisper_exp_compute_token_level_timestamps(
85188526
continue;
85198527
}
85208528

8521-
int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
8522-
int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
8529+
int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
8530+
int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
85238531

85248532
const int ss0 = std::max(s0 - hw, 0);
85258533
const int ss1 = std::min(s1 + hw, n_samples);
@@ -8540,7 +8548,7 @@ static void whisper_exp_compute_token_level_timestamps(
85408548
while (k > 0 && state.energy[k] > thold) {
85418549
k--;
85428550
}
8543-
tokens[j].t0 = sample_to_timestamp(k);
8551+
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
85448552
if (tokens[j].t0 < tokens[j - 1].t1) {
85458553
tokens[j].t0 = tokens[j - 1].t1;
85468554
} else {
@@ -8551,7 +8559,7 @@ static void whisper_exp_compute_token_level_timestamps(
85518559
k++;
85528560
}
85538561
s0 = k;
8554-
tokens[j].t0 = sample_to_timestamp(k);
8562+
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
85558563
}
85568564
}
85578565

@@ -8561,7 +8569,7 @@ static void whisper_exp_compute_token_level_timestamps(
85618569
while (k < n_samples - 1 && state.energy[k] > thold) {
85628570
k++;
85638571
}
8564-
tokens[j].t1 = sample_to_timestamp(k);
8572+
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
85658573
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
85668574
tokens[j].t1 = tokens[j + 1].t0;
85678575
} else {
@@ -8572,7 +8580,7 @@ static void whisper_exp_compute_token_level_timestamps(
85728580
k--;
85738581
}
85748582
s1 = k;
8575-
tokens[j].t1 = sample_to_timestamp(k);
8583+
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
85768584
}
85778585
}
85788586
}

0 commit comments

Comments
 (0)