
Commit d148422

implement missing text events
1 parent 48f126a commit d148422

3 files changed: +75 −45 lines changed

agents/src/llm/realtime.ts

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ export interface MessageGeneration {
   messageId: string;
   textStream: ReadableStream<string>;
   audioStream: ReadableStream<AudioFrame>;
+  modalities?: ['text'] | ['text', 'audio'];
 }

 export interface GenerationCreatedEvent {
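
The new optional `modalities` field lets a consumer of `MessageGeneration` detect text-only generations before deciding which stream to drain. A minimal sketch, assuming a generation object received from a `GenerationCreatedEvent` (the `drainText` helper is illustrative, not part of this commit):

```ts
// Sketch: branch on the generation's modalities before draining its streams.
// An absent field preserves the old behavior (audio plus transcript).
async function drainText(gen: {
  textStream: ReadableStream<string>;
  modalities?: ['text'] | ['text', 'audio'];
}): Promise<string> {
  const modalities = gen.modalities ?? ['text', 'audio'];
  let text = '';
  if (modalities.length === 1 && modalities[0] === 'text') {
    // Text-only: the session closes the audio channel, so read text directly.
    const reader = gen.textStream.getReader();
    for (let r = await reader.read(); !r.done; r = await reader.read()) {
      text += r.value;
    }
  }
  return text;
}
```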

plugins/openai/src/realtime/api_proto.ts

Lines changed: 3 additions & 1 deletion
@@ -267,7 +267,7 @@ export interface SessionUpdateEvent extends BaseClientEvent {
   type: 'session.update';
   session: Partial<{
     model: Model;
-    output_modalities: ['text'] | ['audio'];
+    modalities: ['text'] | ['audio', 'text'];
     instructions: string;
     voice: Voice;
     input_audio_format: AudioFormat;
@@ -511,6 +511,7 @@ export interface ResponseContentPartDoneEvent extends BaseServerEvent {
 export interface ResponseTextDeltaEvent extends BaseServerEvent {
   type: 'response.text.delta';
   response_id: string;
+  item_id: string;
   output_index: number;
   content_index: number;
   delta: string;
@@ -519,6 +520,7 @@ export interface ResponseTextDeltaEvent extends BaseServerEvent {
 export interface ResponseTextDoneEvent extends BaseServerEvent {
   type: 'response.text.done';
   response_id: string;
+  item_id: string;
   output_index: number;
   content_index: number;
   text: string;
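
With `item_id` added, a text delta can be routed to the specific output item it belongs to, the same way audio transcript deltas already are. A hypothetical wire payload matching the updated interface (IDs invented; assumes `BaseServerEvent` contributes `event_id`):

```ts
// Hypothetical response.text.delta server event, per the updated interface.
const delta = {
  event_id: 'event_42', // assumed to come from BaseServerEvent
  type: 'response.text.delta',
  response_id: 'resp_001',
  item_id: 'item_001', // new: identifies the output item receiving this delta
  output_index: 0,
  content_index: 0,
  delta: 'Hello',
} as const;
```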

plugins/openai/src/realtime/realtime_model.ts

Lines changed: 71 additions & 44 deletions
@@ -38,7 +38,7 @@ interface RealtimeOptions {
   model: api_proto.Model;
   voice: api_proto.Voice;
   temperature: number;
-  modalities: api_proto.Modality[];
+  modalities: ['text'] | ['audio', 'text'];
   toolChoice?: llm.ToolChoice;
   inputAudioTranscription?: api_proto.InputAudioTranscription | null;
   // TODO(shubhra): add inputAudioNoiseReduction
@@ -62,6 +62,7 @@ interface MessageGeneration {
   textChannel: stream.StreamChannel<string>;
   audioChannel: stream.StreamChannel<AudioFrame>;
   audioTranscript: string;
+  modalities?: ['text'] | ['text', 'audio'];
 }

 interface ResponseGeneration {
@@ -122,7 +123,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: 'gpt-realtime',
   voice: 'marin',
   temperature: DEFAULT_TEMPERATURE,
-  modalities: ['audio'] as api_proto.Modality[],
+  modalities: ['audio', 'text'] as ['audio', 'text'],
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
@@ -144,7 +145,7 @@ export class RealtimeModel extends llm.RealtimeModel {
   model?: string;
   voice?: string;
   temperature?: number;
-  modalities?: api_proto.Modality[];
+  modalities?: ['text'] | ['audio', 'text'];
   toolChoice?: llm.ToolChoice;
   baseURL?: string;
   inputAudioTranscription?: api_proto.InputAudioTranscription | null;
@@ -165,7 +166,7 @@ export class RealtimeModel extends llm.RealtimeModel {
       turnDetection: options.turnDetection !== null,
       userTranscription: options.inputAudioTranscription !== null,
       autoToolReplyGeneration: false,
-      audioOutput: options.modalities ? options.modalities.includes('audio') : true,
+      audioOutput: options.modalities ? (options.modalities as string[]).includes('audio') : true,
     });

     const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
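
Since `audioOutput` now falls directly out of the configured tuple, a text-only model can be requested at construction time. A usage sketch, assuming the plugin's usual import style (only the options relevant here are shown):

```ts
import * as openai from '@livekit/agents-plugin-openai';

// capabilities.audioOutput === false: the session will close each
// generation's audio channel and mark it as text-only.
const textOnlyModel = new openai.realtime.RealtimeModel({
  modalities: ['text'],
});

// Omitting modalities keeps the default ['audio', 'text'] (audio + transcript).
const voiceModel = new openai.realtime.RealtimeModel({});
```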
@@ -247,7 +248,7 @@ export class RealtimeModel extends llm.RealtimeModel {
   entraToken?: string;
   baseURL?: string;
   voice?: string;
-  modalities?: api_proto.Modality[];
+  modalities?: ['text'] | ['audio', 'text'];
   inputAudioTranscription?: api_proto.InputAudioTranscription;
   // TODO(shubhra): add inputAudioNoiseReduction
   turnDetection?: api_proto.TurnDetectionType;
@@ -399,25 +400,14 @@ export class RealtimeSession extends llm.RealtimeSession {
   }

   private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
-    // OpenAI doesn't support both modalities simultaneously.
-    // If audio is in modalities, prefer audio; otherwise use text.
-
-    // from the docs (https://platform.openai.com/docs/api-reference/realtime-client-events/session)
-    // output_modalities [array]
-    //
-    // The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time.
-    const outputModality = this.oaiRealtimeModel._options.modalities.includes('audio')
-      ? 'audio'
-      : 'text';
-
     return {
       type: 'session.update',
       session: {
         model: this.oaiRealtimeModel._options.model,
         voice: this.oaiRealtimeModel._options.voice,
         input_audio_format: 'pcm16',
         output_audio_format: 'pcm16',
-        output_modalities: [outputModality],
+        modalities: this.oaiRealtimeModel._options.modalities, // supported combinations: ['text'] and ['audio', 'text']
         turn_detection: this.oaiRealtimeModel._options.turnDetection,
         input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
         // TODO(shubhra): add inputAudioNoiseReduction
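
The session update no longer collapses the tuple to a single modality; it forwards whatever was configured. An illustrative payload (`event_id` and values invented for the example):

```ts
// Illustrative session.update client event after this change.
const update = {
  event_id: 'event_7', // assumed optional on BaseClientEvent
  type: 'session.update',
  session: {
    model: 'gpt-realtime',
    voice: 'marin',
    input_audio_format: 'pcm16',
    output_audio_format: 'pcm16',
    modalities: ['audio', 'text'], // forwarded as-is, no longer reduced to one entry
  },
} as const;
```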
@@ -928,6 +918,12 @@ export class RealtimeSession extends llm.RealtimeSession {
       case 'response.content_part.done':
         this.handleResponseContentPartDone(event);
         break;
+      case 'response.text.delta' as any:
+        this.handleResponseTextDelta(event as any);
+        break;
+      case 'response.text.done':
+        this.handleResponseTextDone(event as any);
+        break;
       case 'response.audio_transcript.delta':
         this.handleResponseAudioTranscriptDelta(event);
         break;
@@ -1148,35 +1144,40 @@ export class RealtimeSession extends llm.RealtimeSession {
     const itemType = event.part.type;
     const responseId = event.response_id;

-    if (itemType === 'audio') {
-      this.resolveGeneration(responseId);
-      if (this.textModeRecoveryRetries > 0) {
-        this.#logger.info(
-          { retries: this.textModeRecoveryRetries },
-          'recovered from text-only response',
-        );
-        this.textModeRecoveryRetries = 0;
-      }
+    this.resolveGeneration(responseId);
+    if (this.textModeRecoveryRetries > 0) {
+      this.#logger.info(
+        { retries: this.textModeRecoveryRetries },
+        'recovered from text-only response',
+      );
+      this.textModeRecoveryRetries = 0;
+    }

-      const itemGeneration: MessageGeneration = {
-        messageId: itemId,
-        textChannel: stream.createStreamChannel<string>(),
-        audioChannel: stream.createStreamChannel<AudioFrame>(),
-        audioTranscript: '',
-      };
-
-      this.currentGeneration.messageChannel.write({
-        messageId: itemId,
-        textStream: itemGeneration.textChannel.stream(),
-        audioStream: itemGeneration.audioChannel.stream(),
-      });
+    const itemGeneration: MessageGeneration = {
+      messageId: itemId,
+      textChannel: stream.createStreamChannel<string>(),
+      audioChannel: stream.createStreamChannel<AudioFrame>(),
+      audioTranscript: '',
+    };

-      this.currentGeneration.messages.set(itemId, itemGeneration);
-      this.currentGeneration._firstTokenTimestamp = Date.now();
-      return;
-    } else {
-      this.interrupt();
-      if (this.textModeRecoveryRetries === 0) {
+    if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+      itemGeneration.audioChannel.close();
+      itemGeneration.modalities = ['text'];
+    }
+
+    this.currentGeneration.messageChannel.write({
+      messageId: itemId,
+      textStream: itemGeneration.textChannel.stream(),
+      audioStream: itemGeneration.audioChannel.stream(),
+      modalities: itemGeneration.modalities || ['text', 'audio'],
+    });
+
+    this.currentGeneration.messages.set(itemId, itemGeneration);
+    this.currentGeneration._firstTokenTimestamp = Date.now();
+
+    if (itemType === 'text') {
+      // Only warn if we expected audio but received text
+      if (this.textModeRecoveryRetries === 0 && this.oaiRealtimeModel.capabilities.audioOutput) {
         this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
       }
     }
@@ -1194,6 +1195,32 @@ export class RealtimeSession extends llm.RealtimeSession {
     // TODO(shubhra): handle text mode recovery
   }

+  private handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
+
+    const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+    if (!itemGeneration) {
+      throw new Error('itemGeneration is not set');
+    }
+
+    // Set first token timestamp if in text-only mode
+    if (itemGeneration.modalities?.[0] === 'text' && !this.currentGeneration._firstTokenTimestamp) {
+      this.currentGeneration._firstTokenTimestamp = Date.now();
+    }
+
+    itemGeneration.textChannel.write(event.delta);
+    itemGeneration.audioTranscript += event.delta;
+  }
+
+  private handleResponseTextDone(_event: api_proto.ResponseTextDoneEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
+    // No additional processing needed - just assert generation exists
+  }
+
   private handleResponseAudioTranscriptDelta(
     event: api_proto.ResponseAudioTranscriptDeltaEvent,
   ): void {
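
A small sketch of why `item_id` matters to these handlers: deltas for different output items must accumulate independently, mirroring the `currentGeneration.messages.get(event.item_id)` lookup above (the harness is hypothetical):

```ts
// Hypothetical harness: accumulate text deltas per output item, as
// handleResponseTextDelta does for both textChannel and audioTranscript.
const transcripts = new Map<string, string>();

function onTextDelta(e: { item_id: string; delta: string }): void {
  transcripts.set(e.item_id, (transcripts.get(e.item_id) ?? '') + e.delta);
}

onTextDelta({ item_id: 'item_001', delta: 'Hel' });
onTextDelta({ item_id: 'item_001', delta: 'lo' });
console.assert(transcripts.get('item_001') === 'Hello');
```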
