Skip to content

Commit 4dfc594

Browse files
authored
Merge pull request #80 from TEN-framework/fix/discard
fix: timestamp calculation in asr discard mode
2 parents bccbe82 + 39fb5f2 commit 4dfc594

File tree

4 files changed

+1376
-37
lines changed

4 files changed

+1376
-37
lines changed

interface/ten_ai_base/asr.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,10 @@ async def _handle_audio_frame(
433433
self.buffered_frames_size -= len(discard_frame.get_buf())
434434
self.buffered_frames.put_nowait(audio_frame)
435435
self.buffered_frames_size += len(frame_buf)
436-
# return anyway if not connected
436+
else:
437+
# Discard mode
438+
self.audio_timeline.add_dropped_audio(len(frame_buf))
439+
437440
return
438441

439442
metadata, _ = audio_frame.get_property_to_json(PROPERTY_KEY_METADATA)

interface/ten_ai_base/timeline.py

Lines changed: 96 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
class AudioTimelineEventType(Enum):
1111
USER_AUDIO = 0
1212
SILENCE_AUDIO = 1
13+
DROPPED_AUDIO = 2 # Audio that is dropped and not sent to provider
1314

1415

1516
class AudioTimeline:
@@ -18,6 +19,7 @@ def __init__(self, error_cb: Optional[Callable[[str], None]] = None):
1819
self.timeline: list[tuple[AudioTimelineEventType, int]] = []
1920
self.total_user_audio_duration = 0
2021
self.total_silence_audio_duration = 0
22+
self.total_dropped_audio_duration = 0
2123
self.error_cb = error_cb
2224

2325
def add_user_audio(self, duration_ms: int):
@@ -63,26 +65,59 @@ def add_silence_audio(self, duration_ms: int):
6365

6466
self.total_silence_audio_duration += duration_ms
6567

68+
def add_dropped_audio(self, duration_ms: int):
69+
"""Add dropped audio (audio not sent to provider)
70+
71+
Args:
72+
duration_ms: Dropped audio duration in milliseconds
73+
"""
74+
if duration_ms <= 0:
75+
return
76+
77+
if (
78+
self.timeline
79+
and self.timeline[-1][0] == AudioTimelineEventType.DROPPED_AUDIO
80+
):
81+
# Merge adjacent dropped audio events
82+
self.timeline[-1] = (
83+
AudioTimelineEventType.DROPPED_AUDIO,
84+
self.timeline[-1][1] + duration_ms,
85+
)
86+
else:
87+
self.timeline.append((AudioTimelineEventType.DROPPED_AUDIO, duration_ms))
88+
89+
self.total_dropped_audio_duration += duration_ms
90+
6691
def get_audio_duration_before_time(self, time_ms: int) -> int:
6792
"""
68-
Calculate the total duration of user audio before a specified timestamp.
69-
If time_ms exceeds the total timeline duration, an error callback will be invoked.
93+
Calculate the real audio timestamp from provider's timestamp.
94+
95+
This method converts provider's timestamp to real audio timeline position by:
96+
- Adding dropped audio (exists in real world but not sent to provider)
97+
- Subtracting silence audio (sent to provider but not real user audio)
98+
99+
Real audio timestamp = provider timestamp + dropped audio - silence
70100
71101
Timeline diagram:
72-
Timeline: [USER_AUDIO:100ms] [SILENCE:50ms] [USER_AUDIO:200ms] [SILENCE:100ms]
73-
Time: 0 100 150 350 450
102+
Timeline: [DROPPED:3000ms] [USER:1000ms] [SILENCE:500ms] [USER:2000ms]
103+
Provider Time: 0 1000 1500 3500
104+
Real Audio Time: 0 3000 4000 (4000) 6000
105+
106+
When provider returns 1500ms (after silence):
107+
- Real audio time = 3000 (dropped) + 1000 (first user) = 4000ms
74108
75109
Examples:
76-
- get_audio_duration_before_time(80) -> 80ms (within first USER_AUDIO segment)
77-
- get_audio_duration_before_time(120) -> 100ms (first USER_AUDIO + partial SILENCE)
78-
- get_audio_duration_before_time(200) -> 150ms (first 100ms + second 50ms)
79-
- get_audio_duration_before_time(500) -> 300ms (all USER_AUDIO, but error reported)
110+
- get_audio_duration_before_time(0) -> 3000ms (dropped audio before provider's first audio)
111+
- get_audio_duration_before_time(500) -> 3500ms (dropped + 500ms user audio)
112+
- get_audio_duration_before_time(1000) -> 4000ms (dropped + 1000ms user audio)
113+
- get_audio_duration_before_time(1500) -> 4000ms (silence excluded)
114+
- get_audio_duration_before_time(2000) -> 4500ms (dropped + 1000 + 500 from second user)
80115
81116
Args:
82-
time_ms: The specified timestamp in milliseconds
117+
time_ms: The timestamp from provider in milliseconds
83118
84119
Returns:
85-
Total duration of user audio before the specified timestamp in milliseconds
120+
The real audio timeline position in milliseconds (only counting real audio)
86121
"""
87122
if time_ms < 0:
88123
if self.error_cb is not None:
@@ -94,7 +129,7 @@ def get_audio_duration_before_time(self, time_ms: int) -> int:
94129
# When requested time is less than 0, return 0
95130
return 0
96131

97-
# Calculate total timeline duration
132+
# Calculate total timeline duration (excluding dropped audio)
98133
total_timeline_duration = (
99134
self.total_user_audio_duration + self.total_silence_audio_duration
100135
)
@@ -109,40 +144,65 @@ def get_audio_duration_before_time(self, time_ms: int) -> int:
109144
except Exception:
110145
# Silently ignore callback errors to keep returning result normally
111146
pass
112-
# When exceeding range, return total user audio duration in timeline
113-
return self.total_user_audio_duration
114-
115-
total_user_audio_duration = 0
116-
current_time = 0
147+
# When exceeding range, return total real audio duration (user + dropped)
148+
return self.total_user_audio_duration + self.total_dropped_audio_duration
117149

118-
# Iterate through timeline, accumulating user audio before specified time
150+
real_audio_time = 0 # Real audio timeline (user audio + dropped audio)
151+
provider_time = 0 # Provider timeline (user audio + silence)
152+
153+
# Iterate through timeline to calculate real audio timestamp
119154
for event_type, duration in self.timeline:
120-
# Stop if current time has reached or exceeded target time
121-
if current_time >= time_ms:
122-
break
123-
124-
if event_type == AudioTimelineEventType.USER_AUDIO:
125-
# If entire audio segment is before target time
126-
if current_time + duration <= time_ms:
127-
total_user_audio_duration += duration
128-
else:
129-
# If audio segment crosses target time, only count portion before target
130-
partial_duration = time_ms - current_time
131-
total_user_audio_duration += max(0, partial_duration)
155+
if event_type == AudioTimelineEventType.DROPPED_AUDIO:
156+
# Dropped audio: exists in real world, adds to real audio time
157+
# but not counted in provider time
158+
real_audio_time += duration
159+
elif event_type == AudioTimelineEventType.USER_AUDIO:
160+
# User audio: sent to provider and is real audio
161+
# Check if this segment crosses the target time
162+
if provider_time + duration > time_ms:
163+
# Only add the partial duration
164+
partial_duration = time_ms - provider_time
165+
real_audio_time += partial_duration
166+
break
167+
168+
# Full segment is before target time
169+
provider_time += duration
170+
real_audio_time += duration
171+
172+
# Check if we've exactly reached the target
173+
if provider_time >= time_ms:
174+
break
175+
elif event_type == AudioTimelineEventType.SILENCE_AUDIO:
176+
# Silence: sent to provider but NOT real audio
177+
# Only advances provider time, not real audio time
178+
if provider_time + duration > time_ms:
179+
# Target time is within this silence segment
180+
# Don't add any audio duration
132181
break
133182

134-
current_time += duration
183+
# Full silence segment is before target time
184+
provider_time += duration
185+
# real_audio_time stays the same (silence excluded)
135186

136-
return total_user_audio_duration
187+
if provider_time >= time_ms:
188+
break
189+
190+
return real_audio_time
137191

138192
def get_total_user_audio_duration(self) -> int:
139-
return sum(
140-
duration
141-
for event, duration in self.timeline
142-
if event == AudioTimelineEventType.USER_AUDIO
143-
)
193+
"""
194+
Get total duration of all user audio received from the user.
195+
This includes both audio sent to provider (USER_AUDIO) and audio dropped (DROPPED_AUDIO).
196+
197+
This is typically used before reset to record the total user audio received in this session.
198+
199+
Returns:
200+
Total duration of user audio in milliseconds (USER_AUDIO + DROPPED_AUDIO)
201+
"""
202+
return self.total_user_audio_duration + self.total_dropped_audio_duration
144203

145204
def reset(self):
146205
self.timeline = []
147206
self.total_user_audio_duration = 0
148207
self.total_silence_audio_duration = 0
208+
self.total_dropped_audio_duration = 0

0 commit comments

Comments
 (0)