Skip to content

Commit 4361aa6

Browse files
committed
fix modalities parameter for openai
1 parent 7fc7808 commit 4361aa6

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

agents/src/llm/realtime.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ export interface RealtimeCapabilities {
4040
turnDetection: boolean;
4141
userTranscription: boolean;
4242
autoToolReplyGeneration: boolean;
43+
audioOutput: boolean;
4344
}
4445

4546
export interface InputTranscriptionCompleted {

plugins/google/src/beta/realtime/realtime_api.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -290,6 +290,7 @@ export class RealtimeModel extends llm.RealtimeModel {
290290
turnDetection: serverTurnDetection,
291291
userTranscription: inputAudioTranscription !== null,
292292
autoToolReplyGeneration: true,
293+
audioOutput: (options.modalities || [Modality.AUDIO]).includes(Modality.AUDIO),
293294
});
294295

295296
// Environment variable fallbacks

plugins/openai/src/realtime/realtime_model.ts

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ interface RealtimeOptions {
3838
model: api_proto.Model;
3939
voice: api_proto.Voice;
4040
temperature: number;
41+
modalities: api_proto.Modality[];
4142
toolChoice?: llm.ToolChoice;
4243
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
4344
// TODO(shubhra): add inputAudioNoiseReduction
@@ -121,6 +122,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
121122
model: 'gpt-realtime',
122123
voice: 'marin',
123124
temperature: DEFAULT_TEMPERATURE,
125+
modalities: ['text', 'audio'] as api_proto.Modality[],
124126
inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
125127
turnDetection: DEFAULT_TURN_DETECTION,
126128
toolChoice: DEFAULT_TOOL_CHOICE,
@@ -142,6 +144,7 @@ export class RealtimeModel extends llm.RealtimeModel {
142144
model?: string;
143145
voice?: string;
144146
temperature?: number;
147+
modalities?: api_proto.Modality[];
145148
toolChoice?: llm.ToolChoice;
146149
baseURL?: string;
147150
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
@@ -162,6 +165,7 @@ export class RealtimeModel extends llm.RealtimeModel {
162165
turnDetection: options.turnDetection !== null,
163166
userTranscription: options.inputAudioTranscription !== null,
164167
autoToolReplyGeneration: false,
168+
audioOutput: options.modalities ? options.modalities.includes('audio') : true,
165169
});
166170

167171
const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
@@ -197,6 +201,7 @@ export class RealtimeModel extends llm.RealtimeModel {
197201
apiKey,
198202
isAzure,
199203
model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
204+
modalities: options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities,
200205
};
201206
}
202207

@@ -229,6 +234,7 @@ export class RealtimeModel extends llm.RealtimeModel {
229234
entraToken,
230235
baseURL,
231236
voice = 'alloy',
237+
modalities,
232238
inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
233239
turnDetection = AZURE_DEFAULT_TURN_DETECTION,
234240
temperature = 0.8,
@@ -241,6 +247,7 @@ export class RealtimeModel extends llm.RealtimeModel {
241247
entraToken?: string;
242248
baseURL?: string;
243249
voice?: string;
250+
modalities?: api_proto.Modality[];
244251
inputAudioTranscription?: api_proto.InputAudioTranscription;
245252
// TODO(shubhra): add inputAudioNoiseReduction
246253
turnDetection?: api_proto.TurnDetectionType;
@@ -273,6 +280,7 @@ export class RealtimeModel extends llm.RealtimeModel {
273280

274281
return new RealtimeModel({
275282
voice,
283+
modalities,
276284
inputAudioTranscription,
277285
turnDetection,
278286
temperature,
@@ -398,7 +406,7 @@ export class RealtimeSession extends llm.RealtimeSession {
398406
voice: this.oaiRealtimeModel._options.voice,
399407
input_audio_format: 'pcm16',
400408
output_audio_format: 'pcm16',
401-
modalities: ['text', 'audio'],
409+
modalities: this.oaiRealtimeModel._options.modalities as ['text', 'audio'] | ['text'],
402410
turn_detection: this.oaiRealtimeModel._options.turnDetection,
403411
input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
404412
// TODO(shubhra): add inputAudioNoiseReduction

0 commit comments

Comments
 (0)