@@ -38,7 +38,7 @@ interface RealtimeOptions {
3838 model : api_proto . Model ;
3939 voice : api_proto . Voice ;
4040 temperature : number ;
41- modalities : api_proto . Modality [ ] ;
41+ modalities : [ 'text' ] | [ 'audio' , 'text' ] ;
4242 toolChoice ?: llm . ToolChoice ;
4343 inputAudioTranscription ?: api_proto . InputAudioTranscription | null ;
4444 // TODO(shubhra): add inputAudioNoiseReduction
@@ -62,6 +62,7 @@ interface MessageGeneration {
6262 textChannel : stream . StreamChannel < string > ;
6363 audioChannel : stream . StreamChannel < AudioFrame > ;
6464 audioTranscript : string ;
65+ modalities ?: [ 'text' ] | [ 'text' , 'audio' ] ;
6566}
6667
6768interface ResponseGeneration {
@@ -122,7 +123,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
122123 model : 'gpt-realtime' ,
123124 voice : 'marin' ,
124125 temperature : DEFAULT_TEMPERATURE ,
125- modalities : [ 'audio' ] as api_proto . Modality [ ] ,
126+ modalities : [ 'audio' , 'text' ] as [ 'audio' , 'text' ] ,
126127 inputAudioTranscription : DEFAULT_INPUT_AUDIO_TRANSCRIPTION ,
127128 turnDetection : DEFAULT_TURN_DETECTION ,
128129 toolChoice : DEFAULT_TOOL_CHOICE ,
@@ -144,7 +145,7 @@ export class RealtimeModel extends llm.RealtimeModel {
144145 model ?: string ;
145146 voice ?: string ;
146147 temperature ?: number ;
147- modalities ?: api_proto . Modality [ ] ;
148+ modalities ?: [ 'text' ] | [ 'audio' , 'text' ] ;
148149 toolChoice ?: llm . ToolChoice ;
149150 baseURL ?: string ;
150151 inputAudioTranscription ?: api_proto . InputAudioTranscription | null ;
@@ -165,7 +166,7 @@ export class RealtimeModel extends llm.RealtimeModel {
165166 turnDetection : options . turnDetection !== null ,
166167 userTranscription : options . inputAudioTranscription !== null ,
167168 autoToolReplyGeneration : false ,
168- audioOutput : options . modalities ? options . modalities . includes ( 'audio' ) : true ,
169+ audioOutput : options . modalities ? ( options . modalities as string [ ] ) . includes ( 'audio' ) : true ,
169170 } ) ;
170171
171172 const isAzure = ! ! ( options . apiVersion || options . entraToken || options . azureDeployment ) ;
@@ -247,7 +248,7 @@ export class RealtimeModel extends llm.RealtimeModel {
247248 entraToken ?: string ;
248249 baseURL ?: string ;
249250 voice ?: string ;
250- modalities ?: api_proto . Modality [ ] ;
251+ modalities ?: [ 'text' ] | [ 'audio' , 'text' ] ;
251252 inputAudioTranscription ?: api_proto . InputAudioTranscription ;
252253 // TODO(shubhra): add inputAudioNoiseReduction
253254 turnDetection ?: api_proto . TurnDetectionType ;
@@ -399,25 +400,14 @@ export class RealtimeSession extends llm.RealtimeSession {
399400 }
400401
401402 private createSessionUpdateEvent ( ) : api_proto . SessionUpdateEvent {
402- // OpenAI doesn't support both modalities simultaneously.
403- // If audio is in modalities, prefer audio; otherwise use text.
404-
405- // from the docs (https://platform.openai.com/docs/api-reference/realtime-client-events/session)
406- // output_modalities [array]
407- //
408- // The set of modalities the model can respond with. It defaults to ["audio"], indicating that the model will respond with audio plus a transcript. ["text"] can be used to make the model respond with text only. It is not possible to request both text and audio at the same time.
409- const outputModality = this . oaiRealtimeModel . _options . modalities . includes ( 'audio' )
410- ? 'audio'
411- : 'text' ;
412-
413403 return {
414404 type : 'session.update' ,
415405 session : {
416406 model : this . oaiRealtimeModel . _options . model ,
417407 voice : this . oaiRealtimeModel . _options . voice ,
418408 input_audio_format : 'pcm16' ,
419409 output_audio_format : 'pcm16' ,
420- output_modalities : [ outputModality ] ,
410+ modalities : this . oaiRealtimeModel . _options . modalities , // Supported combinations are: ['text'] and ['audio', 'text'].
421411 turn_detection : this . oaiRealtimeModel . _options . turnDetection ,
422412 input_audio_transcription : this . oaiRealtimeModel . _options . inputAudioTranscription ,
423413 // TODO(shubhra): add inputAudioNoiseReduction
@@ -928,6 +918,12 @@ export class RealtimeSession extends llm.RealtimeSession {
928918 case 'response.content_part.done' :
929919 this . handleResponseContentPartDone ( event ) ;
930920 break ;
921+ case 'response.text.delta' as any :
922+ this . handleResponseTextDelta ( event as any ) ;
923+ break ;
924+ case 'response.text.done' :
925+ this . handleResponseTextDone ( event as any ) ;
926+ break ;
931927 case 'response.audio_transcript.delta' :
932928 this . handleResponseAudioTranscriptDelta ( event ) ;
933929 break ;
@@ -1148,35 +1144,40 @@ export class RealtimeSession extends llm.RealtimeSession {
11481144 const itemType = event . part . type ;
11491145 const responseId = event . response_id ;
11501146
1151- if ( itemType === 'audio' ) {
1152- this . resolveGeneration ( responseId ) ;
1153- if ( this . textModeRecoveryRetries > 0 ) {
1154- this . #logger. info (
1155- { retries : this . textModeRecoveryRetries } ,
1156- 'recovered from text-only response' ,
1157- ) ;
1158- this . textModeRecoveryRetries = 0 ;
1159- }
1147+ this . resolveGeneration ( responseId ) ;
1148+ if ( this . textModeRecoveryRetries > 0 ) {
1149+ this . #logger. info (
1150+ { retries : this . textModeRecoveryRetries } ,
1151+ 'recovered from text-only response' ,
1152+ ) ;
1153+ this . textModeRecoveryRetries = 0 ;
1154+ }
11601155
1161- const itemGeneration : MessageGeneration = {
1162- messageId : itemId ,
1163- textChannel : stream . createStreamChannel < string > ( ) ,
1164- audioChannel : stream . createStreamChannel < AudioFrame > ( ) ,
1165- audioTranscript : '' ,
1166- } ;
1167-
1168- this . currentGeneration . messageChannel . write ( {
1169- messageId : itemId ,
1170- textStream : itemGeneration . textChannel . stream ( ) ,
1171- audioStream : itemGeneration . audioChannel . stream ( ) ,
1172- } ) ;
1156+ const itemGeneration : MessageGeneration = {
1157+ messageId : itemId ,
1158+ textChannel : stream . createStreamChannel < string > ( ) ,
1159+ audioChannel : stream . createStreamChannel < AudioFrame > ( ) ,
1160+ audioTranscript : '' ,
1161+ } ;
11731162
1174- this . currentGeneration . messages . set ( itemId , itemGeneration ) ;
1175- this . currentGeneration . _firstTokenTimestamp = Date . now ( ) ;
1176- return ;
1177- } else {
1178- this . interrupt ( ) ;
1179- if ( this . textModeRecoveryRetries === 0 ) {
1163+ if ( ! this . oaiRealtimeModel . capabilities . audioOutput ) {
1164+ itemGeneration . audioChannel . close ( ) ;
1165+ itemGeneration . modalities = [ 'text' ] ;
1166+ }
1167+
1168+ this . currentGeneration . messageChannel . write ( {
1169+ messageId : itemId ,
1170+ textStream : itemGeneration . textChannel . stream ( ) ,
1171+ audioStream : itemGeneration . audioChannel . stream ( ) ,
1172+ modalities : itemGeneration . modalities || [ 'text' , 'audio' ] ,
1173+ } ) ;
1174+
1175+ this . currentGeneration . messages . set ( itemId , itemGeneration ) ;
1176+ this . currentGeneration . _firstTokenTimestamp = Date . now ( ) ;
1177+
1178+ if ( itemType === 'text' ) {
1179+ // Only warn if we expected audio but received text
1180+ if ( this . textModeRecoveryRetries === 0 && this . oaiRealtimeModel . capabilities . audioOutput ) {
11801181 this . #logger. warn ( { responseId } , 'received text-only response from OpenAI Realtime API' ) ;
11811182 }
11821183 }
@@ -1194,6 +1195,32 @@ export class RealtimeSession extends llm.RealtimeSession {
11941195 // TODO(shubhra): handle text mode recovery
11951196 }
11961197
/**
 * Handles a `response.text.delta` event by forwarding the streamed text
 * fragment to the message generation it belongs to.
 *
 * The fragment is written to the item's text channel and appended to the
 * item's accumulated `audioTranscript`.
 *
 * @param event - Text delta event; `item_id` selects the message generation
 *   and `delta` is the text fragment to append.
 * @throws Error if no generation is currently active, or if no message
 *   generation is registered under `event.item_id`.
 */
private handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
  const generation = this.currentGeneration;
  if (!generation) {
    throw new Error('currentGeneration is not set');
  }

  const itemGeneration = generation.messages.get(event.item_id);
  if (!itemGeneration) {
    throw new Error('itemGeneration is not set');
  }

  // An item is text-only when its modalities are exactly ['text']. Looking at
  // the first element alone is not enough: a ['text', 'audio'] tuple also
  // starts with 'text' and would be misclassified as text-only.
  const modalities = itemGeneration.modalities;
  const isTextOnly = modalities?.length === 1 && modalities[0] === 'text';

  // Record the first-token time only if nothing has recorded it yet.
  if (isTextOnly && !generation._firstTokenTimestamp) {
    generation._firstTokenTimestamp = Date.now();
  }

  itemGeneration.textChannel.write(event.delta);
  itemGeneration.audioTranscript += event.delta;
}
1216+
/**
 * Handles a `response.text.done` event.
 *
 * The event payload is not used; the handler only verifies that a
 * generation is currently active.
 *
 * @param _event - Text done event (ignored).
 * @throws Error if no generation is currently active.
 */
private handleResponseTextDone(_event: api_proto.ResponseTextDoneEvent): void {
  const activeGeneration = this.currentGeneration;
  if (activeGeneration) {
    // Nothing further to do once the generation is known to exist.
    return;
  }
  throw new Error('currentGeneration is not set');
}
1223+
11971224 private handleResponseAudioTranscriptDelta (
11981225 event : api_proto . ResponseAudioTranscriptDeltaEvent ,
11991226 ) : void {
0 commit comments