Skip to content

Commit f6e68f4

Browse files
fix(realtime-ws): stop accidental cancellation error (#220)
1 parent fe5fb97 commit f6e68f4

File tree

6 files changed

+47
-12
lines changed

6 files changed

+47
-12
lines changed

.changeset/strong-lobsters-repair.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
"@openai/agents-extensions": patch
3+
"@openai/agents-realtime": patch
4+
---
5+
6+
fix(realtime-ws): stop accidental cancellation error

packages/agents-extensions/src/TwilioRealtimeTransport.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ export class TwilioRealtimeTransportLayer extends OpenAIRealtimeWebSocket {
181181
super.updateSessionConfig(newConfig);
182182
}
183183

184-
_interrupt(_elapsedTime: number) {
184+
_interrupt(_elapsedTime: number, cancelOngoingResponse: boolean = true) {
185185
const elapsedTime = this.#lastPlayedChunkCount + 50; /* 50ms buffer */
186186
this.#logger.debug(
187187
`Interruption detected, clearing Twilio audio and truncating OpenAI audio after ${elapsedTime}ms`,
@@ -192,7 +192,7 @@ export class TwilioRealtimeTransportLayer extends OpenAIRealtimeWebSocket {
192192
streamSid: this.#streamSid,
193193
}),
194194
);
195-
super._interrupt(elapsedTime);
195+
super._interrupt(elapsedTime, cancelOngoingResponse);
196196
}
197197

198198
protected _onAudio(audioEvent: TransportLayerAudio) {

packages/agents-extensions/test/TwilioRealtimeTransport.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ describe('TwilioRealtimeTransportLayer', () => {
9191
toString: () => JSON.stringify({ event: 'mark', mark: { name: 'u:5' } }),
9292
});
9393
transport._interrupt(0);
94-
expect(interruptSpy).toHaveBeenCalledWith(55);
94+
expect(interruptSpy).toHaveBeenCalledWith(55, true);
9595
expect(twilio.send).toHaveBeenCalledWith(
9696
JSON.stringify({ event: 'clear', streamSid: 'sid' }),
9797
);

packages/agents-extensions/test/index.test.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ describe('TwilioRealtimeTransportLayer', () => {
6464
const payload = { event: 'mark', mark: { name: 'badmark' } };
6565
twilio.emit('message', { toString: () => JSON.stringify(payload) });
6666

67-
transport._interrupt(0);
67+
transport._interrupt(0, false);
68+
// @ts-expect-error - we're testing protected fields
69+
transport._audioLengthMs = 500;
70+
transport._interrupt(0, true);
6871

6972
const call = sendEventSpy.mock.calls.find(
7073
(c) => c[0]?.type === 'conversation.item.truncate',

packages/agents-realtime/src/openaiRealtimeBase.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ export abstract class OpenAIRealtimeBase
104104
#model: string;
105105
#apiKey: ApiKey | undefined;
106106
#tracingConfig: RealtimeTracingConfig | null = null;
107+
#rawSessionConfig: Record<string, any> | null = null;
107108

108109
protected eventEmitter: RuntimeEventEmitter<OpenAIRealtimeEventTypes> =
109110
new RuntimeEventEmitter<OpenAIRealtimeEventTypes>();
@@ -149,6 +150,10 @@ export abstract class OpenAIRealtimeBase
149150

150151
abstract readonly muted: boolean | null;
151152

153+
protected get _rawSessionConfig(): Record<string, any> | null {
154+
return this.#rawSessionConfig ?? null;
155+
}
156+
152157
protected async _getApiKey(options: RealtimeTransportLayerConnectOptions) {
153158
const apiKey = options.apiKey ?? this.#apiKey;
154159

@@ -186,6 +191,10 @@ export abstract class OpenAIRealtimeBase
186191
return;
187192
}
188193

194+
if (parsed.type === 'session.updated') {
195+
this.#rawSessionConfig = parsed.session;
196+
}
197+
189198
if (parsed.type === 'response.done') {
190199
const response = responseDoneEventSchema.safeParse(parsed);
191200
if (!response.success) {

packages/agents-realtime/src/openaiRealtimeWebsocket.ts

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,15 @@ export class OpenAIRealtimeWebSocket
215215

216216
const buff = base64ToArrayBuffer(parsed.delta);
217217
// calculate the audio length in milliseconds based on the session's output audio format (defaults to 24kHz pcm16le when unknown)
218-
this._audioLengthMs += buff.byteLength / 24 / 2; // 24kHz * 2 bytes per sample
218+
const audioFormat =
219+
this._rawSessionConfig?.output_audio_format ?? 'pcm16';
220+
if (audioFormat.startsWith('g711_')) {
221+
// 8kHz * 1 byte per sample
222+
this._audioLengthMs += buff.byteLength / 8;
223+
} else {
224+
// 24kHz * 2 bytes per sample
225+
this._audioLengthMs += buff.byteLength / 24 / 2;
226+
}
219227

220228
const audioEvent: TransportLayerAudio = {
221229
type: 'audio',
@@ -224,7 +232,9 @@ export class OpenAIRealtimeWebSocket
224232
};
225233
this._onAudio(audioEvent);
226234
} else if (parsed.type === 'input_audio_buffer.speech_started') {
227-
this.interrupt();
235+
const automaticResponseCancellationEnabled =
236+
this._rawSessionConfig?.turn_detection?.interrupt_response ?? false;
237+
this.interrupt(!automaticResponseCancellationEnabled);
228238
} else if (parsed.type === 'response.created') {
229239
this.#ongoingResponse = true;
230240
} else if (parsed.type === 'response.done') {
@@ -343,8 +353,16 @@ export class OpenAIRealtimeWebSocket
343353
*
344354
* @param elapsedTime - The elapsed time since the response started.
345355
*/
346-
_interrupt(elapsedTime: number) {
356+
_interrupt(elapsedTime: number, cancelOngoingResponse: boolean = true) {
357+
if (elapsedTime < 0 || elapsedTime > this._audioLengthMs) {
358+
return;
359+
}
360+
347361
// optionally cancel the ongoing response, then immediately emit the interruption event so the client can stop playing audio
362+
if (cancelOngoingResponse) {
363+
this._cancelResponse();
364+
}
365+
348366
this.emit('audio_interrupted');
349367
this.sendEvent({
350368
type: 'conversation.item.truncate',
@@ -362,16 +380,15 @@ export class OpenAIRealtimeWebSocket
362380
* You can also call this method directly if you want to interrupt the conversation for example
363381
* based on an event in the client.
364382
*/
365-
interrupt() {
383+
interrupt(cancelOngoingResponse: boolean = true) {
366384
if (!this.#currentItemId || typeof this._firstAudioTimestamp !== 'number') {
367385
return;
368386
}
369387

370-
this._cancelResponse();
371-
372388
const elapsedTime = Date.now() - this._firstAudioTimestamp;
373-
if (elapsedTime >= 0 && elapsedTime < this._audioLengthMs) {
374-
this._interrupt(elapsedTime);
389+
390+
if (elapsedTime >= 0) {
391+
this._interrupt(elapsedTime, cancelOngoingResponse);
375392
}
376393

377394
this.#currentItemId = undefined;

0 commit comments

Comments
 (0)