@@ -38,6 +38,7 @@ interface RealtimeOptions {
3838 model : api_proto . Model ;
3939 voice : api_proto . Voice ;
4040 temperature : number ;
41+ modalities : api_proto . Modality [ ] ;
4142 toolChoice ?: llm . ToolChoice ;
4243 inputAudioTranscription ?: api_proto . InputAudioTranscription | null ;
4344 // TODO(shubhra): add inputAudioNoiseReduction
@@ -121,6 +122,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
121122 model : 'gpt-realtime' ,
122123 voice : 'marin' ,
123124 temperature : DEFAULT_TEMPERATURE ,
125+ modalities : [ 'text' , 'audio' ] as api_proto . Modality [ ] ,
124126 inputAudioTranscription : DEFAULT_INPUT_AUDIO_TRANSCRIPTION ,
125127 turnDetection : DEFAULT_TURN_DETECTION ,
126128 toolChoice : DEFAULT_TOOL_CHOICE ,
@@ -142,6 +144,7 @@ export class RealtimeModel extends llm.RealtimeModel {
142144 model ?: string ;
143145 voice ?: string ;
144146 temperature ?: number ;
147+ modalities ?: api_proto . Modality [ ] ;
145148 toolChoice ?: llm . ToolChoice ;
146149 baseURL ?: string ;
147150 inputAudioTranscription ?: api_proto . InputAudioTranscription | null ;
@@ -162,6 +165,7 @@ export class RealtimeModel extends llm.RealtimeModel {
162165 turnDetection : options . turnDetection !== null ,
163166 userTranscription : options . inputAudioTranscription !== null ,
164167 autoToolReplyGeneration : false ,
168+ audioOutput : options . modalities ? options . modalities . includes ( 'audio' ) : true ,
165169 } ) ;
166170
167171 const isAzure = ! ! ( options . apiVersion || options . entraToken || options . azureDeployment ) ;
@@ -197,6 +201,7 @@ export class RealtimeModel extends llm.RealtimeModel {
197201 apiKey,
198202 isAzure,
199203 model : options . model || DEFAULT_REALTIME_MODEL_OPTIONS . model ,
204+ modalities : options . modalities || DEFAULT_REALTIME_MODEL_OPTIONS . modalities ,
200205 } ;
201206 }
202207
@@ -229,6 +234,7 @@ export class RealtimeModel extends llm.RealtimeModel {
229234 entraToken,
230235 baseURL,
231236 voice = 'alloy' ,
237+ modalities,
232238 inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION ,
233239 turnDetection = AZURE_DEFAULT_TURN_DETECTION ,
234240 temperature = 0.8 ,
@@ -241,6 +247,7 @@ export class RealtimeModel extends llm.RealtimeModel {
241247 entraToken ?: string ;
242248 baseURL ?: string ;
243249 voice ?: string ;
250+ modalities ?: api_proto . Modality [ ] ;
244251 inputAudioTranscription ?: api_proto . InputAudioTranscription ;
245252 // TODO(shubhra): add inputAudioNoiseReduction
246253 turnDetection ?: api_proto . TurnDetectionType ;
@@ -273,6 +280,7 @@ export class RealtimeModel extends llm.RealtimeModel {
273280
274281 return new RealtimeModel ( {
275282 voice,
283+ modalities,
276284 inputAudioTranscription,
277285 turnDetection,
278286 temperature,
@@ -398,7 +406,7 @@ export class RealtimeSession extends llm.RealtimeSession {
398406 voice : this . oaiRealtimeModel . _options . voice ,
399407 input_audio_format : 'pcm16' ,
400408 output_audio_format : 'pcm16' ,
401- modalities : [ 'text' , 'audio' ] ,
409+ modalities : this . oaiRealtimeModel . _options . modalities as [ 'text' , 'audio' ] | [ 'text '] ,
402410 turn_detection : this . oaiRealtimeModel . _options . turnDetection ,
403411 input_audio_transcription : this . oaiRealtimeModel . _options . inputAudioTranscription ,
404412 // TODO(shubhra): add inputAudioNoiseReduction
0 commit comments