@@ -1001,7 +1001,7 @@ def _join_char_level_timestamps(self, merged_hypothesis, hypotheses, chunk_offse
10011001 # Track time adjustments for filtered tokens within this chunk
10021002 time_adjustment_frames = 0
10031003 for i , h in enumerate (hypotheses ):
1004-
1004+
10051005 cumulative_offset += chunk_offsets [i ] # self.chunk_offsets starts with 0,
10061006
10071007 # update frame numbers
@@ -1011,7 +1011,7 @@ def _join_char_level_timestamps(self, merged_hypothesis, hypotheses, chunk_offse
10111011 for char in h .timestamp ['char' ]:
10121012 if not char :
10131013 continue
1014-
1014+
10151015 # If merged_tokens is provided, filter based on token matching
10161016 if merged_tokens is not None :
10171017 # Skip if we've processed all merged tokens or token doesn't match
@@ -1021,12 +1021,12 @@ def _join_char_level_timestamps(self, merged_hypothesis, hypotheses, chunk_offse
10211021 token_duration_frames = char ['end_offset' ] - char ['start_offset' ]
10221022 time_adjustment_frames += token_duration_frames
10231023 continue
1024-
1024+
10251025 start = char ['start_offset' ]
10261026 end = char ['end_offset' ]
10271027
10281028 updated_char = dict (char ) # copy all existing keys
1029-
1029+
10301030 # Apply both the chunk offset and the time adjustment from filtered tokens
10311031 if start != - 1 :
10321032 updated_char ['start_offset' ] = start + offset - time_adjustment_frames
@@ -1143,7 +1143,9 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo
11431143 for i in range (1 , len (hypotheses )):
11441144 merged_tokens = lcs_alignment_merge_buffer (
11451145 buffer = merged_tokens ,
1146- data = hypotheses [i ].y_sequence .tolist ()[:int (delay * 0.6 )], # only approximately 60% of the tokens are non blank
1146+ data = hypotheses [i ].y_sequence .tolist ()[
1147+ : int (delay * 0.6 )
1148+ ], # only approximately 60% of the tokens are non blank
11471149 delay = delay ,
11481150 model = self ,
11491151 max_steps_per_timestep = 1 ,
@@ -1179,7 +1181,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo
11791181 },
11801182 )
11811183 chunk_offsets = [0 ] + [x * self .encoder .subsampling_factor for x in encoded_len .tolist ()]
1182-
1184+
11831185 merged_hypotheses = self ._join_y_sequence (merged_hypotheses , hypotheses )
11841186 merged_hypotheses .text = final_text
11851187 merged_hypotheses = self ._join_timestamp_and_add_word_and_segment_level_timestamps (
0 commit comments