@@ -380,8 +380,22 @@ def next_words_segment(segments: List[dict]) -> Optional[dict]:
380380 last_slice = current_slice
381381
382382 if single_timestamp_ending :
383- # single timestamp at the end means no speech after the last timestamp.
384- seek += segment_size
383+ # When single_timestamp_ending and there's remaining audio,
384+ # advance to the timestamp position instead of full segment to avoid
385+ # skipping content in short audio clips.
386+ last_timestamp_token = tokens [- 1 ].item ()
387+ if last_timestamp_token != tokenizer .timestamp_begin :
388+ last_timestamp_pos = (
389+ last_timestamp_token - tokenizer .timestamp_begin
390+ )
391+ timestamp_seek = last_timestamp_pos * input_stride
392+ # Only use timestamp-based seek if there's remaining audio
393+ if seek + timestamp_seek < content_frames :
394+ seek += timestamp_seek
395+ else :
396+ seek += segment_size
397+ else :
398+ seek += segment_size
385399 else :
386400 # otherwise, ignore the unfinished segment and seek to the last timestamp
387401 last_timestamp_pos = (
@@ -409,7 +423,24 @@ def next_words_segment(segments: List[dict]) -> Optional[dict]:
409423 result = result ,
410424 )
411425 )
412- seek += segment_size
426+ # When single_timestamp_ending and there's remaining audio,
427+ # advance to the timestamp position instead of full segment to avoid
428+ # skipping content in short audio clips.
429+ if (
430+ single_timestamp_ending
431+ and len (timestamps ) > 0
432+ and timestamps [- 1 ].item () != tokenizer .timestamp_begin
433+ ):
434+ last_timestamp_pos = (
435+ timestamps [- 1 ].item () - tokenizer .timestamp_begin
436+ )
437+ timestamp_seek = last_timestamp_pos * input_stride
438+ if seek + timestamp_seek < content_frames :
439+ seek += timestamp_seek
440+ else :
441+ seek += segment_size
442+ else :
443+ seek += segment_size
413444
414445 if word_timestamps :
415446 add_word_timestamps (
0 commit comments