class AudioTimelineEventType(Enum):
    """Kinds of segments that can be recorded on an audio timeline."""

    # Audio that is sent to the provider and is real user audio.
    USER_AUDIO = 0
    # Audio that is sent to the provider but is not real user audio.
    SILENCE_AUDIO = 1
    # Audio that is dropped and not sent to provider.
    DROPPED_AUDIO = 2
1314
1415
1516class AudioTimeline :
@@ -18,6 +19,7 @@ def __init__(self, error_cb: Optional[Callable[[str], None]] = None):
1819 self .timeline : list [tuple [AudioTimelineEventType , int ]] = []
1920 self .total_user_audio_duration = 0
2021 self .total_silence_audio_duration = 0
22+ self .total_dropped_audio_duration = 0
2123 self .error_cb = error_cb
2224
2325 def add_user_audio (self , duration_ms : int ):
@@ -63,26 +65,59 @@ def add_silence_audio(self, duration_ms: int):
6365
6466 self .total_silence_audio_duration += duration_ms
6567
def add_dropped_audio(self, duration_ms: int) -> None:
    """Add dropped audio (audio not sent to provider).

    Non-positive durations are ignored. When the most recent timeline
    entry is already a DROPPED_AUDIO event, the new duration is merged
    into it instead of appending a separate entry. The running
    ``total_dropped_audio_duration`` counter is increased by the same
    amount.

    Args:
        duration_ms: Dropped audio duration in milliseconds.
    """
    if duration_ms <= 0:
        return

    dropped = AudioTimelineEventType.DROPPED_AUDIO
    if self.timeline and self.timeline[-1][0] == dropped:
        # Merge adjacent dropped audio events. Entries are tuples, so the
        # last entry is replaced rather than mutated in place.
        self.timeline[-1] = (dropped, self.timeline[-1][1] + duration_ms)
    else:
        self.timeline.append((dropped, duration_ms))

    self.total_dropped_audio_duration += duration_ms
90+
6691 def get_audio_duration_before_time (self , time_ms : int ) -> int :
6792 """
68- Calculate the total duration of user audio before a specified timestamp.
69- If time_ms exceeds the total timeline duration, an error callback will be invoked.
93+ Calculate the real audio timestamp from provider's timestamp.
94+
95+ This method converts provider's timestamp to real audio timeline position by:
96+ - Adding dropped audio (exists in real world but not sent to provider)
97+ - Subtracting silence audio (sent to provider but not real user audio)
98+
99+ Real audio timestamp = provider timestamp + dropped audio - silence
70100
71101 Timeline diagram:
72- Timeline: [USER_AUDIO:100ms] [SILENCE:50ms] [USER_AUDIO:200ms] [SILENCE:100ms]
73- Time: 0 100 150 350 450
102+ Timeline: [DROPPED:3000ms] [USER:1000ms] [SILENCE:500ms] [USER:2000ms]
103+ Provider Time: 0 1000 1500 3500
104+ Real Audio Time: 0 3000 4000 (4000) 6000
105+
106+ When provider returns 1500ms (after silence):
107+ - Real audio time = 3000 (dropped) + 1000 (first user) = 4000ms
74108
75109 Examples:
76- - get_audio_duration_before_time(80) -> 80ms (within first USER_AUDIO segment)
77- - get_audio_duration_before_time(120) -> 100ms (first USER_AUDIO + partial SILENCE)
78- - get_audio_duration_before_time(200) -> 150ms (first 100ms + second 50ms)
79- - get_audio_duration_before_time(500) -> 300ms (all USER_AUDIO, but error reported)
110+ - get_audio_duration_before_time(0) -> 3000ms (dropped audio before provider's first audio)
111+ - get_audio_duration_before_time(500) -> 3500ms (dropped + 500ms user audio)
112+ - get_audio_duration_before_time(1000) -> 4000ms (dropped + 1000ms user audio)
113+ - get_audio_duration_before_time(1500) -> 4000ms (silence excluded)
114+ - get_audio_duration_before_time(2000) -> 4500ms (dropped + 1000 + 500 from second user)
80115
81116 Args:
82- time_ms: The specified timestamp in milliseconds
117+ time_ms: The timestamp from provider in milliseconds
83118
84119 Returns:
85- Total duration of user audio before the specified timestamp in milliseconds
120+ The real audio timeline position in milliseconds (only counting real audio)
86121 """
87122 if time_ms < 0 :
88123 if self .error_cb is not None :
@@ -94,7 +129,7 @@ def get_audio_duration_before_time(self, time_ms: int) -> int:
94129 # When requested time is less than 0, return 0
95130 return 0
96131
97- # Calculate total timeline duration
132+ # Calculate total timeline duration (excluding dropped audio)
98133 total_timeline_duration = (
99134 self .total_user_audio_duration + self .total_silence_audio_duration
100135 )
@@ -109,40 +144,65 @@ def get_audio_duration_before_time(self, time_ms: int) -> int:
109144 except Exception :
110145 # Silently ignore callback errors to keep returning result normally
111146 pass
112- # When exceeding range, return total user audio duration in timeline
113- return self .total_user_audio_duration
114-
115- total_user_audio_duration = 0
116- current_time = 0
147+ # When exceeding range, return total real audio duration (user + dropped)
148+ return self .total_user_audio_duration + self .total_dropped_audio_duration
117149
118- # Iterate through timeline, accumulating user audio before specified time
150+ real_audio_time = 0 # Real audio timeline (user audio + dropped audio)
151+ provider_time = 0 # Provider timeline (user audio + silence)
152+
153+ # Iterate through timeline to calculate real audio timestamp
119154 for event_type , duration in self .timeline :
120- # Stop if current time has reached or exceeded target time
121- if current_time >= time_ms :
122- break
123-
124- if event_type == AudioTimelineEventType .USER_AUDIO :
125- # If entire audio segment is before target time
126- if current_time + duration <= time_ms :
127- total_user_audio_duration += duration
128- else :
129- # If audio segment crosses target time, only count portion before target
130- partial_duration = time_ms - current_time
131- total_user_audio_duration += max (0 , partial_duration )
155+ if event_type == AudioTimelineEventType .DROPPED_AUDIO :
156+ # Dropped audio: exists in real world, adds to real audio time
157+ # but not counted in provider time
158+ real_audio_time += duration
159+ elif event_type == AudioTimelineEventType .USER_AUDIO :
160+ # User audio: sent to provider and is real audio
161+ # Check if this segment crosses the target time
162+ if provider_time + duration > time_ms :
163+ # Only add the partial duration
164+ partial_duration = time_ms - provider_time
165+ real_audio_time += partial_duration
166+ break
167+
168+ # Full segment is before target time
169+ provider_time += duration
170+ real_audio_time += duration
171+
172+ # Check if we've exactly reached the target
173+ if provider_time >= time_ms :
174+ break
175+ elif event_type == AudioTimelineEventType .SILENCE_AUDIO :
176+ # Silence: sent to provider but NOT real audio
177+ # Only advances provider time, not real audio time
178+ if provider_time + duration > time_ms :
179+ # Target time is within this silence segment
180+ # Don't add any audio duration
132181 break
133182
134- current_time += duration
183+ # Full silence segment is before target time
184+ provider_time += duration
185+ # real_audio_time stays the same (silence excluded)
135186
136- return total_user_audio_duration
187+ if provider_time >= time_ms :
188+ break
189+
190+ return real_audio_time
137191
def get_total_user_audio_duration(self) -> int:
    """Get total duration of all user audio received from the user.

    This includes both audio sent to provider (USER_AUDIO) and audio
    dropped (DROPPED_AUDIO). The value is read from the running counters
    rather than recomputed from the timeline entries.

    This is typically used before reset to record the total user audio
    received in this session.

    Returns:
        Total duration of user audio in milliseconds
        (USER_AUDIO + DROPPED_AUDIO).
    """
    return self.total_user_audio_duration + self.total_dropped_audio_duration
144203
def reset(self) -> None:
    """Clear the timeline and zero every accumulated duration counter.

    A fresh list is assigned to ``timeline`` (the previous list object is
    left untouched), and the user, silence and dropped audio totals are
    all set back to 0.
    """
    self.timeline = []
    self.total_user_audio_duration = 0
    self.total_silence_audio_duration = 0
    self.total_dropped_audio_duration = 0
0 commit comments