Commit 48f126a

fix modalities:
output_modalities (array): The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time.
1 parent a5f005b · commit 48f126a

2 files changed: +16 −8 lines changed

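For orientation, a minimal sketch (not part of this commit) of the wire-level payload a client can now express; the field names follow the api_proto definitions changed below:

// Hypothetical session.update requesting text-only output. ['audio'] (the default,
// meaning audio plus a transcript) is the only other valid value; ['text', 'audio']
// can no longer be requested.
const update = {
  type: 'session.update',
  session: {
    output_modalities: ['text'],
  },
};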

plugins/openai/src/realtime/api_proto.ts

Lines changed: 3 additions & 3 deletions
@@ -190,7 +190,7 @@ export interface SessionResource {
   id: string;
   object: 'realtime.session';
   model: string;
-  output_modalities: ['text', 'audio'] | ['text'] | ['audio']; // default: ["text", "audio"]
+  output_modalities: ['text'] | ['audio'];
   instructions: string;
   voice: Voice; // default: "alloy"
   input_audio_format: AudioFormat; // default: "pcm16"
@@ -267,7 +267,7 @@ export interface SessionUpdateEvent extends BaseClientEvent {
   type: 'session.update';
   session: Partial<{
     model: Model;
-    output_modalities: ['text', 'audio'] | ['text'] | ['audio'];
+    output_modalities: ['text'] | ['audio'];
     instructions: string;
     voice: Voice;
     input_audio_format: AudioFormat;
@@ -350,7 +350,7 @@ export interface ConversationItemDeleteEvent extends BaseClientEvent {
 export interface ResponseCreateEvent extends BaseClientEvent {
   type: 'response.create';
   response?: Partial<{
-    output_modalities: ['text', 'audio'] | ['text'] | ['audio'];
+    output_modalities: ['text'] | ['audio'];
     instructions: string;
     voice: Voice;
     output_audio_format: AudioFormat;
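As a standalone illustration of what the narrowed union means for consumers of these interfaces (the alias below merely mirrors the type above and is not part of the commit):

type OutputModalities = ['text'] | ['audio'];

const textOnly: OutputModalities = ['text'];   // OK
const audioOnly: OutputModalities = ['audio']; // OK
// const both: OutputModalities = ['text', 'audio']; // no longer type-checks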

plugins/openai/src/realtime/realtime_model.ts

Lines changed: 13 additions & 5 deletions
@@ -122,7 +122,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: 'gpt-realtime',
   voice: 'marin',
   temperature: DEFAULT_TEMPERATURE,
-  modalities: ['text', 'audio'] as api_proto.Modality[],
+  modalities: ['audio'] as api_proto.Modality[],
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
@@ -399,17 +399,25 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
+    // OpenAI doesn't support both modalities simultaneously.
+    // If audio is in modalities, prefer audio; otherwise use text.
+
+    // from the docs (https://platform.openai.com/docs/api-reference/realtime-client-events/session)
+    // output_modalities [array]
+    //
+    // The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time.
+    const outputModality = this.oaiRealtimeModel._options.modalities.includes('audio')
+      ? 'audio'
+      : 'text';
+
     return {
       type: 'session.update',
       session: {
         model: this.oaiRealtimeModel._options.model,
         voice: this.oaiRealtimeModel._options.voice,
         input_audio_format: 'pcm16',
         output_audio_format: 'pcm16',
-        output_modalities: this.oaiRealtimeModel._options.modalities as
-          | ['text', 'audio']
-          | ['text']
-          | ['audio'],
+        output_modalities: [outputModality],
         turn_detection: this.oaiRealtimeModel._options.turnDetection,
         input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
         // TODO(shubhra): add inputAudioNoiseReduction
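The selection logic added above can be read as a small pure mapping; a hypothetical helper (not in the commit) showing how a caller-supplied modality list collapses to the single output modality sent to OpenAI:

// Audio wins when present; otherwise fall back to text-only output.
function selectOutputModality(modalities: ('text' | 'audio')[]): 'audio' | 'text' {
  return modalities.includes('audio') ? 'audio' : 'text';
}

// selectOutputModality(['text', 'audio']) -> 'audio'  => output_modalities: ['audio']
// selectOutputModality(['text'])          -> 'text'   => output_modalities: ['text']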
