Skip to content

Commit f116b49

Browse files
committed
Fix Whisper seek behavior
1 parent 9820718 commit f116b49

File tree

1 file changed

+34
-3
lines changed

1 file changed

+34
-3
lines changed

whisper/mlx_whisper/transcribe.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,22 @@ def next_words_segment(segments: List[dict]) -> Optional[dict]:
380380
last_slice = current_slice
381381

382382
if single_timestamp_ending:
383-
# single timestamp at the end means no speech after the last timestamp.
384-
seek += segment_size
383+
# When single_timestamp_ending is set and there's remaining audio,
384+
# advance to the timestamp position instead of a full segment to avoid
385+
# skipping content in short audio clips.
386+
last_timestamp_token = tokens[-1].item()
387+
if last_timestamp_token != tokenizer.timestamp_begin:
388+
last_timestamp_pos = (
389+
last_timestamp_token - tokenizer.timestamp_begin
390+
)
391+
timestamp_seek = last_timestamp_pos * input_stride
392+
# Only use timestamp-based seek if there's remaining audio
393+
if seek + timestamp_seek < content_frames:
394+
seek += timestamp_seek
395+
else:
396+
seek += segment_size
397+
else:
398+
seek += segment_size
385399
else:
386400
# otherwise, ignore the unfinished segment and seek to the last timestamp
387401
last_timestamp_pos = (
@@ -409,7 +423,24 @@ def next_words_segment(segments: List[dict]) -> Optional[dict]:
409423
result=result,
410424
)
411425
)
412-
seek += segment_size
426+
# When single_timestamp_ending is set and there's remaining audio,
427+
# advance to the timestamp position instead of a full segment to avoid
428+
# skipping content in short audio clips.
429+
if (
430+
single_timestamp_ending
431+
and len(timestamps) > 0
432+
and timestamps[-1].item() != tokenizer.timestamp_begin
433+
):
434+
last_timestamp_pos = (
435+
timestamps[-1].item() - tokenizer.timestamp_begin
436+
)
437+
timestamp_seek = last_timestamp_pos * input_stride
438+
if seek + timestamp_seek < content_frames:
439+
seek += timestamp_seek
440+
else:
441+
seek += segment_size
442+
else:
443+
seek += segment_size
413444

414445
if word_timestamps:
415446
add_word_timestamps(

0 commit comments

Comments
 (0)