Skip to content

Commit df7a3d8

Browse files
Fix whisper timestamp extraction for tokenizers with added tokens (#804)
* Support user-defined tokens by bounding the timestamp token if statement
* Update src/tokenizers.js
  Co-authored-by: Joshua Lochner <[email protected]>
* Calculate timestamp_end instead of hardcoding
* Update tokenizers.js
* Merge conflict resolution
---------
Co-authored-by: Joshua Lochner <[email protected]>
1 parent 7dffb9a commit df7a3d8

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

src/tokenizers.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3583,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
35833583
let chunk = new_chunk();
35843584
let time_offset = 0.0;
35853585
const timestamp_begin = this.timestamp_begin;
3586+
// Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
3587+
// We can calculate the last time stamp token as timestamp_begin plus the number of tokens
3588+
// from 0.00 to 30.00, which is 1500.
3589+
const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
3590+
const timestamp_end = timestamp_begin + total_timestamp_tokens;
35863591

35873592
let previous_tokens = [];
35883593
let previous_token_timestamps = [];
@@ -3670,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
36703675
} else {
36713676
// 2/ This is a regular special token, ignoring it
36723677
}
3673-
} else if (token >= timestamp_begin) {
3678+
} else if (token >= timestamp_begin && token <= timestamp_end) {
36743679
// 3/ Timestamp token
36753680
const time = (token - timestamp_begin) * time_precision + time_offset;
36763681
const rounded_time = round(time, 2);

0 commit comments

Comments
 (0)