Skip to content

Commit b487db1

Browse files
authored
fix: clamp and floor audio_end_ms to integer in interrupts (#315) (#324)
1 parent c42a0a9 commit b487db1

File tree

4 files changed

+152
-12
lines changed

4 files changed

+152
-12
lines changed

.changeset/bright-fox-hill.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
'@openai/agents-extensions': patch
3+
'@openai/agents-realtime': patch
4+
---
5+
6+
Fix: clamp `audio_end_ms` to the audio length and floor it to an integer in interrupts, preventing Realtime API errors when a fractional speed is used (#315)

packages/agents-extensions/test/index.test.ts

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,37 @@ describe('TwilioRealtimeTransportLayer', () => {
6969
transport._audioLengthMs = 500;
7070
transport._interrupt(0, true);
7171

72-
const call = sendEventSpy.mock.calls.find(
73-
(c) => c[0]?.type === 'conversation.item.truncate',
74-
);
72+
const call = sendEventSpy.mock.calls
73+
.filter((c) => c[0]?.type === 'conversation.item.truncate')
74+
.at(-1);
7575
expect(call?.[0].audio_end_ms).toBe(50);
7676
});
77+
78+
test('interrupt clamps overshoot and emits integer audio_end_ms', async () => {
79+
const twilio = new FakeTwilioWebSocket();
80+
const transport = new TwilioRealtimeTransportLayer({
81+
twilioWebSocket: twilio as any,
82+
});
83+
84+
const sendEventSpy = vi.spyOn(
85+
transport as TwilioRealtimeTransportLayer,
86+
'sendEvent',
87+
);
88+
89+
await transport.connect({
90+
apiKey: 'ek_test',
91+
initialSessionConfig: { speed: 1.1 },
92+
});
93+
sendEventSpy.mockClear();
94+
95+
// @ts-expect-error - we're testing protected fields.
96+
transport._audioLengthMs = 20;
97+
transport._interrupt(0, true);
98+
99+
const call = sendEventSpy.mock.calls
100+
.filter((c) => c[0]?.type === 'conversation.item.truncate')
101+
.at(-1);
102+
expect(call?.[0].audio_end_ms).toBe(20);
103+
expect(Number.isInteger(call?.[0].audio_end_ms)).toBe(true);
104+
});
77105
});

packages/agents-realtime/src/openaiRealtimeWebsocket.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ export class OpenAIRealtimeWebSocket
360360
* @param elapsedTime - The elapsed time since the response started.
361361
*/
362362
_interrupt(elapsedTime: number, cancelOngoingResponse: boolean = true) {
363-
if (elapsedTime < 0 || elapsedTime > this._audioLengthMs) {
363+
if (elapsedTime < 0) {
364364
return;
365365
}
366366

@@ -369,12 +369,15 @@ export class OpenAIRealtimeWebSocket
369369
this._cancelResponse();
370370
}
371371

372+
const length = this._audioLengthMs ?? Number.POSITIVE_INFINITY;
373+
const audio_end_ms = Math.max(0, Math.min(Math.floor(elapsedTime), length));
374+
372375
this.emit('audio_interrupted');
373376
this.sendEvent({
374377
type: 'conversation.item.truncate',
375378
item_id: this.#currentItemId,
376379
content_index: this.#currentAudioContentIndex,
377-
audio_end_ms: elapsedTime,
380+
audio_end_ms,
378381
});
379382
}
380383

packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts

Lines changed: 110 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ describe('OpenAIRealtimeWebSocket', () => {
7070
const ws = new OpenAIRealtimeWebSocket();
7171
const audioSpy = vi.fn();
7272
ws.on('audio', audioSpy);
73-
const sendSpy = vi.spyOn(ws, 'sendEvent');
73+
const sendSpy = vi
74+
.spyOn(ws as any, 'sendEvent')
75+
.mockImplementation(() => {});
7476
const interruptSpy = vi.spyOn(ws, 'interrupt');
7577
const p = ws.connect({ apiKey: 'ek', model: 'm' });
7678
await vi.runAllTimersAsync();
@@ -109,11 +111,13 @@ describe('OpenAIRealtimeWebSocket', () => {
109111
});
110112
expect(interruptSpy).toHaveBeenCalled();
111113
expect(
112-
sendSpy.mock.calls.some((c) => c[0].type === 'response.cancel'),
114+
sendSpy.mock.calls.some(
115+
(c: unknown[]) => (c[0] as any).type === 'response.cancel',
116+
),
113117
).toBe(true);
114118
expect(
115119
sendSpy.mock.calls.some(
116-
(c) => c[0].type === 'conversation.item.truncate',
120+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
117121
),
118122
).toBe(true);
119123

@@ -147,7 +151,9 @@ describe('OpenAIRealtimeWebSocket', () => {
147151
}),
148152
});
149153
expect(
150-
sendSpy.mock.calls.every((c) => c[0].type !== 'response.cancel'),
154+
sendSpy.mock.calls.every(
155+
(c: unknown[]) => (c[0] as any).type !== 'response.cancel',
156+
),
151157
).toBe(true);
152158
});
153159

@@ -158,7 +164,9 @@ describe('OpenAIRealtimeWebSocket', () => {
158164

159165
it('close resets state so interrupt does nothing', async () => {
160166
const ws = new OpenAIRealtimeWebSocket();
161-
const sendSpy = vi.spyOn(ws, 'sendEvent');
167+
const sendSpy = vi
168+
.spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent')
169+
.mockImplementation(() => {});
162170
const p = ws.connect({ apiKey: 'ek', model: 'm' });
163171
await vi.runAllTimersAsync();
164172
await p;
@@ -197,6 +205,99 @@ describe('OpenAIRealtimeWebSocket', () => {
197205
expect(baseSpy).toHaveBeenCalled();
198206
});
199207

208+
it('_interrupt quantizes and clamps elapsedTime', () => {
209+
const ws = new OpenAIRealtimeWebSocket();
210+
const sendSpy = vi
211+
.spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent')
212+
.mockImplementation(() => {});
213+
// @ts-expect-error - testing protected field.
214+
ws._audioLengthMs = 100;
215+
ws._interrupt(110.9, false);
216+
let call = sendSpy.mock.calls.find(
217+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
218+
);
219+
expect((call?.[0] as any).audio_end_ms).toBe(100);
220+
sendSpy.mockClear();
221+
// @ts-expect-error - testing protected field.
222+
ws._audioLengthMs = 200;
223+
ws._interrupt(123.7, false);
224+
call = sendSpy.mock.calls.find(
225+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
226+
);
227+
expect((call?.[0] as any).audio_end_ms).toBe(123);
228+
sendSpy.mockRestore();
229+
});
230+
231+
it('_interrupt floors sub-millisecond elapsedTime', () => {
232+
const ws = new OpenAIRealtimeWebSocket();
233+
const sendSpy = vi
234+
.spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent')
235+
.mockImplementation(() => {});
236+
// @ts-expect-error - testing protected field.
237+
ws._audioLengthMs = 100;
238+
ws._interrupt(0.9, false);
239+
const call = sendSpy.mock.calls.find(
240+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
241+
);
242+
expect((call?.[0] as any).audio_end_ms).toBe(0);
243+
expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true);
244+
sendSpy.mockRestore();
245+
});
246+
247+
it('_interrupt clamps overshoot elapsedTime', () => {
248+
const ws = new OpenAIRealtimeWebSocket();
249+
const sendSpy = vi
250+
.spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent')
251+
.mockImplementation(() => {});
252+
// @ts-expect-error - testing protected field.
253+
ws._audioLengthMs = 42;
254+
ws._interrupt(42.6, false);
255+
const call = sendSpy.mock.calls.find(
256+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
257+
);
258+
expect((call?.[0] as any).audio_end_ms).toBe(42);
259+
expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true);
260+
sendSpy.mockRestore();
261+
});
262+
263+
it('interrupt payload is integer with fractional speed', async () => {
264+
const ws = new OpenAIRealtimeWebSocket();
265+
const sendSpy = vi
266+
.spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent')
267+
.mockImplementation(() => {});
268+
const p = ws.connect({
269+
apiKey: 'ek',
270+
model: 'm',
271+
initialSessionConfig: { speed: 1.1 },
272+
} as any);
273+
await vi.runAllTimersAsync();
274+
await p;
275+
// @ts-expect-error - testing protected field.
276+
ws._audioLengthMs = 200;
277+
ws._interrupt(123.4, false);
278+
const call = sendSpy.mock.calls.find(
279+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
280+
);
281+
expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true);
282+
sendSpy.mockRestore();
283+
});
284+
285+
it('interrupt payload is integer with speed 1', () => {
286+
const ws = new OpenAIRealtimeWebSocket();
287+
const sendSpy = vi
288+
.spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent')
289+
.mockImplementation(() => {});
290+
// @ts-expect-error - testing protected field.
291+
ws._audioLengthMs = 200;
292+
ws._interrupt(123.4, false);
293+
const call = sendSpy.mock.calls.find(
294+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
295+
);
296+
expect((call?.[0] as any).audio_end_ms).toBe(123);
297+
expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true);
298+
sendSpy.mockRestore();
299+
});
300+
200301
it('full interrupt/_interrupt flow', async () => {
201302
const ws = new OpenAIRealtimeWebSocket();
202303
const sendSpy = vi.spyOn(ws, 'sendEvent');
@@ -230,11 +331,13 @@ describe('OpenAIRealtimeWebSocket', () => {
230331
}),
231332
});
232333
expect(
233-
sendSpy.mock.calls.some((c) => c[0].type === 'response.cancel'),
334+
sendSpy.mock.calls.some(
335+
(c: unknown[]) => (c[0] as any).type === 'response.cancel',
336+
),
234337
).toBe(true);
235338
expect(
236339
sendSpy.mock.calls.some(
237-
(c) => c[0].type === 'conversation.item.truncate',
340+
(c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate',
238341
),
239342
).toBe(true);
240343
sendSpy.mockClear();

0 commit comments

Comments
 (0)