Commit 48f126a

fix modalities:
output_modalities (array): The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time.
1 parent a5f005b · commit 48f126a

2 files changed: +16 −8 lines changed

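For orientation, a minimal sketch (not part of this commit) of the wire-level payload a client can now express; the field names follow the api_proto definitions changed below:

// Hypothetical session.update requesting text-only output. ['audio'] (the default,
// meaning audio plus a transcript) is the only other valid value; ['text', 'audio']
// can no longer be requested.
const update = {
  type: 'session.update',
  session: {
    output_modalities: ['text'],
  },
};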

plugins/openai/src/realtime/api_proto.ts

Lines changed: 3 additions & 3 deletions
@@ -190,7 +190,7 @@ export interface SessionResource {
   id: string;
   object: 'realtime.session';
   model: string;
-  output_modalities: ['text', 'audio'] | ['text'] | ['audio']; // default: ["text", "audio"]
+  output_modalities: ['text'] | ['audio'];
   instructions: string;
   voice: Voice; // default: "alloy"
   input_audio_format: AudioFormat; // default: "pcm16"
@@ -267,7 +267,7 @@ export interface SessionUpdateEvent extends BaseClientEvent {
   type: 'session.update';
   session: Partial<{
     model: Model;
-    output_modalities: ['text', 'audio'] | ['text'] | ['audio'];
+    output_modalities: ['text'] | ['audio'];
     instructions: string;
     voice: Voice;
     input_audio_format: AudioFormat;
@@ -350,7 +350,7 @@ export interface ConversationItemDeleteEvent extends BaseClientEvent {
 export interface ResponseCreateEvent extends BaseClientEvent {
   type: 'response.create';
   response?: Partial<{
-    output_modalities: ['text', 'audio'] | ['text'] | ['audio'];
+    output_modalities: ['text'] | ['audio'];
     instructions: string;
     voice: Voice;
     output_audio_format: AudioFormat;
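As a standalone illustration of what the narrowed union means for consumers of these interfaces (the alias below merely mirrors the type above and is not part of the commit):

type OutputModalities = ['text'] | ['audio'];

const textOnly: OutputModalities = ['text'];   // OK
const audioOnly: OutputModalities = ['audio']; // OK
// const both: OutputModalities = ['text', 'audio']; // no longer type-checks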

plugins/openai/src/realtime/realtime_model.ts

Lines changed: 13 additions & 5 deletions
@@ -122,7 +122,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: 'gpt-realtime',
   voice: 'marin',
   temperature: DEFAULT_TEMPERATURE,
-  modalities: ['text', 'audio'] as api_proto.Modality[],
+  modalities: ['audio'] as api_proto.Modality[],
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
@@ -399,17 +399,25 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
+    // OpenAI doesn't support both modalities simultaneously.
+    // If audio is in modalities, prefer audio; otherwise use text.
+
+    // from the docs (https://platform.openai.com/docs/api-reference/realtime-client-events/session)
+    // output_modalities [array]
+    //
+    // The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time.
+    const outputModality = this.oaiRealtimeModel._options.modalities.includes('audio')
+      ? 'audio'
+      : 'text';
+
     return {
       type: 'session.update',
       session: {
         model: this.oaiRealtimeModel._options.model,
         voice: this.oaiRealtimeModel._options.voice,
         input_audio_format: 'pcm16',
         output_audio_format: 'pcm16',
-        output_modalities: this.oaiRealtimeModel._options.modalities as
-          | ['text', 'audio']
-          | ['text']
-          | ['audio'],
+        output_modalities: [outputModality],
         turn_detection: this.oaiRealtimeModel._options.turnDetection,
         input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
         // TODO(shubhra): add inputAudioNoiseReduction
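The selection logic added above can be read as a small pure mapping; a hypothetical helper (not in the commit) showing how a caller-supplied modality list collapses to the single output modality sent to OpenAI:

// Audio wins when present; otherwise fall back to text-only output.
function selectOutputModality(modalities: ('text' | 'audio')[]): 'audio' | 'text' {
  return modalities.includes('audio') ? 'audio' : 'text';
}

// selectOutputModality(['text', 'audio']) -> 'audio'  => output_modalities: ['audio']
// selectOutputModality(['text'])          -> 'text'   => output_modalities: ['text']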
