
import { APIResource } from '../../../resource';
import * as RealtimeAPI from './realtime';
+import * as Shared from '../../shared';
import * as SessionsAPI from './sessions';
import {
  Session as SessionsAPISession,
@@ -741,9 +742,38 @@ export interface RealtimeResponse {
  id?: string;

  /**
-   * Developer-provided string key-value pairs associated with this response.
+   * Which conversation the response is added to, determined by the `conversation`
+   * field in the `response.create` event. If `auto`, the response will be added to
+   * the default conversation and the value of `conversation_id` will be an id like
+   * `conv_1234`. If `none`, the response will not be added to any conversation and
+   * the value of `conversation_id` will be `null`. If responses are being triggered
+   * by server VAD, the response will be added to the default conversation, thus the
+   * `conversation_id` will be an id like `conv_1234`.
   */
-  metadata?: unknown | null;
+  conversation_id?: string;
+
+  /**
+   * Maximum number of output tokens for a single assistant response, inclusive of
+   * tool calls, that was used in this response.
+   */
+  max_output_tokens?: number | 'inf';
+
+  /**
+   * Set of 16 key-value pairs that can be attached to an object. This can be useful
+   * for storing additional information about the object in a structured format, and
+   * querying for objects via API or the dashboard.
+   *
+   * Keys are strings with a maximum length of 64 characters. Values are strings with
+   * a maximum length of 512 characters.
+   */
+  metadata?: Shared.Metadata | null;
+
+  /**
+   * The set of modalities the model used to respond. If there are multiple
+   * modalities, the model will pick one, for example if `modalities` is
+   * `["text", "audio"]`, the model could be responding in either text or audio.
+   */
+  modalities?: Array<'text' | 'audio'>;

  /**
   * The object type, must be `realtime.response`.
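Taken together, the fields this hunk adds are all readable off a completed `RealtimeResponse`. A minimal sketch of a consumer, assuming the type is imported from this module (the `summarizeResponse` helper is illustrative, not part of this diff):

```ts
import type { RealtimeResponse } from './realtime';

// Hypothetical consumer of a finished response, e.g. from a `response.done` event.
function summarizeResponse(response: RealtimeResponse): string {
  // `conversation_id` is an id like `conv_1234` unless `conversation` was `none`.
  const conversation = response.conversation_id ?? 'no conversation';
  // `max_output_tokens` is either a number or the literal string 'inf'.
  const cap = response.max_output_tokens === 'inf' ? 'unlimited' : response.max_output_tokens;
  // `metadata` is now a typed record (`Shared.Metadata`) instead of `unknown`.
  const keys = Object.keys(response.metadata ?? {}).length;
  return `${conversation}; max tokens: ${cap}; ${keys} metadata keys`;
}
```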
@@ -755,6 +785,11 @@ export interface RealtimeResponse {
   */
  output?: Array<ConversationItem>;

+  /**
+   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
  /**
   * The final status of the response (`completed`, `cancelled`, `failed`, or
   * `incomplete`).
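The `output_audio_format` value maps directly onto how a client should decode the audio it receives. A small sketch of that mapping (the helper is hypothetical; the byte widths are standard for these codecs):

```ts
// Hypothetical helper: both G.711 variants are 8-bit codecs, PCM16 is 16-bit linear.
type OutputAudioFormat = 'pcm16' | 'g711_ulaw' | 'g711_alaw';

function bytesPerSample(format: OutputAudioFormat): 1 | 2 {
  return format === 'pcm16' ? 2 : 1;
}
```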
@@ -766,13 +801,24 @@ export interface RealtimeResponse {
   */
  status_details?: RealtimeResponseStatus;

+  /**
+   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+   */
+  temperature?: number;
+
  /**
   * Usage statistics for the Response, this will correspond to billing. A Realtime
   * API session will maintain a conversation context and append new Items to the
   * Conversation, thus output from previous turns (text and audio tokens) will
   * become the input for later turns.
   */
  usage?: RealtimeResponseUsage;
+
+  /**
+   * The voice the model used to respond. Current voice options are `alloy`, `ash`,
+   * `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`.
+   */
+  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}

/**
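Since `usage` is what billing follows, and output from earlier turns feeds later turns' input, a client may want to watch it as responses complete. A rough sketch, assuming the usual `total_tokens` counter on `RealtimeResponseUsage` (the budget and helper are examples, not part of this diff):

```ts
import type { RealtimeResponse } from './realtime';

const TOKEN_BUDGET = 100_000; // arbitrary example threshold

function checkUsage(response: RealtimeResponse): void {
  // Output from earlier turns becomes input to later ones, so totals climb.
  const total = response.usage?.total_tokens ?? 0;
  if (total > TOKEN_BUDGET) {
    console.warn(`response ${response.id} used ${total} tokens (voice: ${response.voice})`);
  }
}
```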
@@ -1320,11 +1366,13 @@ export namespace ResponseCreateEvent {

    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be useful
-     * for storing additional information about the object in a structured format. Keys
-     * can be a maximum of 64 characters long and values can be a maximum of 512
-     * characters long.
+     * for storing additional information about the object in a structured format, and
+     * querying for objects via API or the dashboard.
+     *
+     * Keys are strings with a maximum length of 64 characters. Values are strings with
+     * a maximum length of 512 characters.
     */
-    metadata?: unknown | null;
+    metadata?: Shared.Metadata | null;

    /**
     * The set of modalities the model can respond with. To disable audio, set this to
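On the write side, the same `Shared.Metadata` shape now types what a client attaches in `response.create`. A sketch of such an event; the websocket transport (`ws`) is assumed and not part of this module:

```ts
const responseCreate = {
  type: 'response.create' as const,
  response: {
    // Up to 16 pairs; keys are capped at 64 characters, values at 512.
    metadata: { customer_id: 'cus_123', intent: 'support' },
    modalities: ['text', 'audio'],
  },
};

// ws.send(JSON.stringify(responseCreate));
```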
@@ -1716,8 +1764,11 @@ export namespace SessionUpdateEvent {
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
-     * asynchronously through Whisper and should be treated as rough guidance rather
-     * than the representation understood by the model.
+     * asynchronously through
+     * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+     * and should be treated as rough guidance rather than the representation
+     * understood by the model. The client can optionally set the language and prompt
+     * for transcription; these fields will be passed to the Whisper API.
     */
    input_audio_transcription?: Session.InputAudioTranscription;

@@ -1801,15 +1852,33 @@ export namespace SessionUpdateEvent {
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
-     * asynchronously through Whisper and should be treated as rough guidance rather
-     * than the representation understood by the model.
+     * asynchronously through
+     * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+     * and should be treated as rough guidance rather than the representation
+     * understood by the model. The client can optionally set the language and prompt
+     * for transcription; these fields will be passed to the Whisper API.
     */
    export interface InputAudioTranscription {
+      /**
+       * The language of the input audio. Supplying the input language in
+       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+       * format will improve accuracy and latency.
+       */
+      language?: string;
+
      /**
       * The model to use for transcription, `whisper-1` is the only currently supported
       * model.
       */
      model?: string;
+
+      /**
+       * An optional text to guide the model's style or continue a previous audio
+       * segment. The
+       * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+       * should match the audio language.
+       */
+      prompt?: string;
    }

    export interface Tool {
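With `language` and `prompt` added, a client can steer Whisper when enabling transcription. A sketch of the corresponding `session.update` event (the `ws` transport and the field values are examples, not part of this diff):

```ts
const sessionUpdate = {
  type: 'session.update' as const,
  session: {
    input_audio_transcription: {
      model: 'whisper-1',
      language: 'en', // ISO-639-1; improves accuracy and latency
      prompt: 'Vocabulary: Realtime API, VAD, PCM16.', // should match the audio language
    },
  },
};

// ws.send(JSON.stringify(sessionUpdate));
```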