diff --git a/.changeset/bright-fox-hill.md b/.changeset/bright-fox-hill.md new file mode 100644 index 00000000..00bd3072 --- /dev/null +++ b/.changeset/bright-fox-hill.md @@ -0,0 +1,6 @@ +--- +'@openai/agents-extensions': patch +'@openai/agents-realtime': patch +--- + +Fix: clamp and floor `audio_end_ms` in interrupts to prevent Realtime API error with fractional speeds (#315) diff --git a/packages/agents-extensions/test/index.test.ts b/packages/agents-extensions/test/index.test.ts index 39add901..24076323 100644 --- a/packages/agents-extensions/test/index.test.ts +++ b/packages/agents-extensions/test/index.test.ts @@ -69,9 +69,37 @@ describe('TwilioRealtimeTransportLayer', () => { transport._audioLengthMs = 500; transport._interrupt(0, true); - const call = sendEventSpy.mock.calls.find( - (c) => c[0]?.type === 'conversation.item.truncate', - ); + const call = sendEventSpy.mock.calls + .filter((c) => c[0]?.type === 'conversation.item.truncate') + .at(-1); expect(call?.[0].audio_end_ms).toBe(50); }); + + test('interrupt clamps overshoot and emits integer audio_end_ms', async () => { + const twilio = new FakeTwilioWebSocket(); + const transport = new TwilioRealtimeTransportLayer({ + twilioWebSocket: twilio as any, + }); + + const sendEventSpy = vi.spyOn( + transport as TwilioRealtimeTransportLayer, + 'sendEvent', + ); + + await transport.connect({ + apiKey: 'ek_test', + initialSessionConfig: { speed: 1.1 }, + }); + sendEventSpy.mockClear(); + + // @ts-expect-error - we're testing protected fields. 
+ transport._audioLengthMs = 20; + transport._interrupt(0, true); + + const call = sendEventSpy.mock.calls + .filter((c) => c[0]?.type === 'conversation.item.truncate') + .at(-1); + expect(call?.[0].audio_end_ms).toBe(20); + expect(Number.isInteger(call?.[0].audio_end_ms)).toBe(true); + }); }); diff --git a/packages/agents-realtime/src/openaiRealtimeWebsocket.ts b/packages/agents-realtime/src/openaiRealtimeWebsocket.ts index d548b361..f9bcb2b6 100644 --- a/packages/agents-realtime/src/openaiRealtimeWebsocket.ts +++ b/packages/agents-realtime/src/openaiRealtimeWebsocket.ts @@ -360,7 +360,7 @@ export class OpenAIRealtimeWebSocket * @param elapsedTime - The elapsed time since the response started. */ _interrupt(elapsedTime: number, cancelOngoingResponse: boolean = true) { - if (elapsedTime < 0 || elapsedTime > this._audioLengthMs) { + if (elapsedTime < 0) { return; } @@ -369,12 +369,15 @@ export class OpenAIRealtimeWebSocket this._cancelResponse(); } + const length = this._audioLengthMs ?? 
Number.POSITIVE_INFINITY; + const audio_end_ms = Math.max(0, Math.floor(Math.min(elapsedTime, length))); /* floor after clamping so a non-integer length cannot yield a fractional audio_end_ms */ + this.emit('audio_interrupted'); this.sendEvent({ type: 'conversation.item.truncate', item_id: this.#currentItemId, content_index: this.#currentAudioContentIndex, - audio_end_ms: elapsedTime, + audio_end_ms, }); } diff --git a/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts b/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts index 91572a49..9d144bb8 100644 --- a/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts +++ b/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts @@ -70,7 +70,9 @@ describe('OpenAIRealtimeWebSocket', () => { const ws = new OpenAIRealtimeWebSocket(); const audioSpy = vi.fn(); ws.on('audio', audioSpy); - const sendSpy = vi.spyOn(ws, 'sendEvent'); + const sendSpy = vi + .spyOn(ws as any, 'sendEvent') + .mockImplementation(() => {}); const interruptSpy = vi.spyOn(ws, 'interrupt'); const p = ws.connect({ apiKey: 'ek', model: 'm' }); await vi.runAllTimersAsync(); @@ -109,11 +111,13 @@ describe('OpenAIRealtimeWebSocket', () => { }); expect(interruptSpy).toHaveBeenCalled(); expect( - sendSpy.mock.calls.some((c) => c[0].type === 'response.cancel'), + sendSpy.mock.calls.some( + (c: unknown[]) => (c[0] as any).type === 'response.cancel', + ), ).toBe(true); expect( sendSpy.mock.calls.some( - (c) => c[0].type === 'conversation.item.truncate', + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', ), ).toBe(true); @@ -147,7 +151,9 @@ describe('OpenAIRealtimeWebSocket', () => { }), }); expect( - sendSpy.mock.calls.every((c) => c[0].type !== 'response.cancel'), + sendSpy.mock.calls.every( + (c: unknown[]) => (c[0] as any).type !== 'response.cancel', + ), ).toBe(true); }); @@ -158,7 +164,9 @@ describe('OpenAIRealtimeWebSocket', () => { it('close resets state so interrupt does nothing', async () => { const ws = new OpenAIRealtimeWebSocket(); - const sendSpy = vi.spyOn(ws, 'sendEvent'); + const 
sendSpy = vi + .spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent') + .mockImplementation(() => {}); const p = ws.connect({ apiKey: 'ek', model: 'm' }); await vi.runAllTimersAsync(); await p; @@ -197,6 +205,99 @@ describe('OpenAIRealtimeWebSocket', () => { expect(baseSpy).toHaveBeenCalled(); }); + it('_interrupt quantizes and clamps elapsedTime', () => { + const ws = new OpenAIRealtimeWebSocket(); + const sendSpy = vi + .spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent') + .mockImplementation(() => {}); + // @ts-expect-error - testing protected field. + ws._audioLengthMs = 100; + ws._interrupt(110.9, false); + let call = sendSpy.mock.calls.find( + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', + ); + expect((call?.[0] as any).audio_end_ms).toBe(100); + sendSpy.mockClear(); + // @ts-expect-error - testing protected field. + ws._audioLengthMs = 200; + ws._interrupt(123.7, false); + call = sendSpy.mock.calls.find( + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', + ); + expect((call?.[0] as any).audio_end_ms).toBe(123); + sendSpy.mockRestore(); + }); + + it('_interrupt floors sub-millisecond elapsedTime', () => { + const ws = new OpenAIRealtimeWebSocket(); + const sendSpy = vi + .spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent') + .mockImplementation(() => {}); + // @ts-expect-error - testing protected field. 
+ ws._audioLengthMs = 100; + ws._interrupt(0.9, false); + const call = sendSpy.mock.calls.find( + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', + ); + expect((call?.[0] as any).audio_end_ms).toBe(0); + expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true); + sendSpy.mockRestore(); + }); + + it('_interrupt clamps overshoot elapsedTime', () => { + const ws = new OpenAIRealtimeWebSocket(); + const sendSpy = vi + .spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent') + .mockImplementation(() => {}); + // @ts-expect-error - testing protected field. + ws._audioLengthMs = 42; + ws._interrupt(42.6, false); + const call = sendSpy.mock.calls.find( + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', + ); + expect((call?.[0] as any).audio_end_ms).toBe(42); + expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true); + sendSpy.mockRestore(); + }); + + it('interrupt payload is integer with fractional speed', async () => { + const ws = new OpenAIRealtimeWebSocket(); + const sendSpy = vi + .spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent') + .mockImplementation(() => {}); + const p = ws.connect({ + apiKey: 'ek', + model: 'm', + initialSessionConfig: { speed: 1.1 }, + } as any); + await vi.runAllTimersAsync(); + await p; + // @ts-expect-error - testing protected field. + ws._audioLengthMs = 200; + ws._interrupt(123.4, false); + const call = sendSpy.mock.calls.find( + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', + ); + expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true); + sendSpy.mockRestore(); + }); + + it('interrupt payload is integer with speed 1', () => { + const ws = new OpenAIRealtimeWebSocket(); + const sendSpy = vi + .spyOn(OpenAIRealtimeWebSocket.prototype as any, 'sendEvent') + .mockImplementation(() => {}); + // @ts-expect-error - testing protected field. 
+ ws._audioLengthMs = 200; + ws._interrupt(123.4, false); + const call = sendSpy.mock.calls.find( + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', + ); + expect((call?.[0] as any).audio_end_ms).toBe(123); + expect(Number.isInteger((call?.[0] as any).audio_end_ms)).toBe(true); + sendSpy.mockRestore(); + }); + it('full interrupt/_interrupt flow', async () => { const ws = new OpenAIRealtimeWebSocket(); const sendSpy = vi.spyOn(ws, 'sendEvent'); @@ -230,11 +331,13 @@ describe('OpenAIRealtimeWebSocket', () => { }), }); expect( - sendSpy.mock.calls.some((c) => c[0].type === 'response.cancel'), + sendSpy.mock.calls.some( + (c: unknown[]) => (c[0] as any).type === 'response.cancel', + ), ).toBe(true); expect( sendSpy.mock.calls.some( - (c) => c[0].type === 'conversation.item.truncate', + (c: unknown[]) => (c[0] as any).type === 'conversation.item.truncate', ), ).toBe(true); sendSpy.mockClear();