diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 48f0d34ba..dec479608 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "5.19.1" + ".": "5.20.0" } diff --git a/.stats.yml b/.stats.yml index c41be6ee5..36a3c7f58 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-51afd6abbcb18c3086f62993f9379c18443b9e516cbc0548ddfb932e835657f8.yml -openapi_spec_hash: dae6afeaefa15cb8700c7a870531e06f -config_hash: b854932c0ea24b400bdd64e4376936bd +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-7807ec6037efcee1af7decbfd3974a42b761fb6c6a71b4050fe43484d7fcbac4.yml +openapi_spec_hash: da6851e3891ad2659a50ed6a736fd32a +config_hash: 74d955cdc2377213f5268ea309090f6c diff --git a/CHANGELOG.md b/CHANGELOG.md index d9b8e7d29..02919cc52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## 5.20.0 (2025-09-08) + +Full Changelog: [v5.19.1...v5.20.0](https://github.com/openai/openai-node/compare/v5.19.1...v5.20.0) + +### Features + +* **api:** ship the RealtimeGA API shape ([4286ddd](https://github.com/openai/openai-node/commit/4286ddd4f990dd26e15e510039457f17d787820d)) + + +### Chores + +* ci build action ([c8ce143](https://github.com/openai/openai-node/commit/c8ce143196fdbc7ee1c7832bce2417b6e3d25885)) + ## 5.19.1 (2025-09-03) Full Changelog: [v5.19.0...v5.19.1](https://github.com/openai/openai-node/compare/v5.19.0...v5.19.1) diff --git a/api.md b/api.md index e8a4c861d..0e1134d94 100644 --- a/api.md +++ b/api.md @@ -776,6 +776,7 @@ Methods: Types: +- AudioTranscription - ConversationCreatedEvent - ConversationItem - ConversationItemAdded @@ -804,11 +805,16 @@ Types: - McpListToolsCompleted - McpListToolsFailed - McpListToolsInProgress +- Models +- NoiseReductionType - OutputAudioBufferClearEvent - RateLimitsUpdatedEvent - RealtimeAudioConfig +- RealtimeAudioConfigInput +- RealtimeAudioConfigOutput +- RealtimeAudioFormats +- RealtimeAudioInputTurnDetection - RealtimeClientEvent -- RealtimeClientSecretConfig - RealtimeConversationItemAssistantMessage - RealtimeConversationItemFunctionCall - RealtimeConversationItemFunctionCallOutput @@ -824,6 +830,9 @@ Types: - RealtimeMcpToolExecutionError - RealtimeMcphttpError - RealtimeResponse +- RealtimeResponseCreateAudioOutput +- RealtimeResponseCreateMcpTool +- RealtimeResponseCreateParams - RealtimeResponseStatus - RealtimeResponseUsage - RealtimeResponseUsageInputTokenDetails @@ -835,8 +844,12 @@ Types: - RealtimeToolsConfig - RealtimeToolsConfigUnion - RealtimeTracingConfig +- RealtimeTranscriptionSessionAudio +- RealtimeTranscriptionSessionAudioInput +- RealtimeTranscriptionSessionAudioInputTurnDetection - RealtimeTranscriptionSessionCreateRequest - RealtimeTruncation +- RealtimeTruncationRetentionRatio - ResponseAudioDeltaEvent - ResponseAudioDoneEvent - ResponseAudioTranscriptDeltaEvent @@ -869,7 +882,12 @@ Types: Types: +- RealtimeSessionClientSecret - RealtimeSessionCreateResponse +- RealtimeTranscriptionSessionClientSecret +- RealtimeTranscriptionSessionCreateResponse +- RealtimeTranscriptionSessionInputAudioTranscription +- RealtimeTranscriptionSessionTurnDetection - ClientSecretCreateResponse Methods: diff --git a/jsr.json b/jsr.json index d6d3f55a9..43571736b 100644 --- a/jsr.json +++ b/jsr.json @@ -1,6 +1,6 @@ { "name": "@openai/openai", - "version": "5.19.1", + "version": "5.20.0", 
"exports": { ".": "./index.ts", "./helpers/zod": "./helpers/zod.ts", diff --git a/package.json b/package.json index a8fd383fc..340fba521 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "openai", - "version": "5.19.1", + "version": "5.20.0", "description": "The official TypeScript library for the OpenAI API", "author": "OpenAI ", "types": "dist/index.d.ts", diff --git a/scripts/utils/upload-artifact.sh b/scripts/utils/upload-artifact.sh index a157309c4..1ef9d0dfd 100755 --- a/scripts/utils/upload-artifact.sh +++ b/scripts/utils/upload-artifact.sh @@ -12,7 +12,7 @@ if [[ "$SIGNED_URL" == "null" ]]; then exit 1 fi -UPLOAD_RESPONSE=$(tar -cz "${BUILD_PATH:-dist}" | curl -v -X PUT \ +UPLOAD_RESPONSE=$(tar "${BASE_PATH:+-C$BASE_PATH}" -cz "${ARTIFACT_PATH:-dist}" | curl -v -X PUT \ -H "Content-Type: application/gzip" \ --data-binary @- "$SIGNED_URL" 2>&1) diff --git a/src/resources/realtime/client-secrets.ts b/src/resources/realtime/client-secrets.ts index c48fe8243..6539260ac 100644 --- a/src/resources/realtime/client-secrets.ts +++ b/src/resources/realtime/client-secrets.ts @@ -2,13 +2,13 @@ import { APIResource } from '../../core/resource'; import * as RealtimeAPI from './realtime'; +import * as ResponsesAPI from '../responses/responses'; import { APIPromise } from '../../core/api-promise'; import { RequestOptions } from '../../internal/request-options'; export class ClientSecrets extends APIResource { /** - * Create a Realtime session and client secret for either realtime or - * transcription. + * Create a Realtime client secret with an associated session configuration. */ create(body: ClientSecretCreateParams, options?: RequestOptions): APIPromise { return this._client.post('/realtime/client_secrets', { body, ...options }); @@ -16,29 +16,43 @@ export class ClientSecrets extends APIResource { } /** - * A Realtime session configuration object. + * Ephemeral key returned by the API. */ -export interface RealtimeSessionCreateResponse { +export interface RealtimeSessionClientSecret { /** - * Unique identifier for the session that looks like `sess_1234567890abcdef`. + * Timestamp for when the token expires. Currently, all tokens expire after one + * minute. */ - id?: string; + expires_at: number; /** - * Configuration for input and output audio for the session. + * Ephemeral key usable in client environments to authenticate connections to the + * Realtime API. Use this in client-side environments rather than a standard API + * token, which should only be used server-side. + */ + value: string; +} + +/** + * A new Realtime session configuration, with an ephemeral key. Default TTL for + * keys is one minute. + */ +export interface RealtimeSessionCreateResponse { + /** + * Configuration for input and output audio. */ audio?: RealtimeSessionCreateResponse.Audio; /** - * Expiration timestamp for the session, in seconds since epoch. + * Ephemeral key returned by the API. */ - expires_at?: number; + client_secret?: RealtimeSessionClientSecret; /** * Additional fields to include in server outputs. * - * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - * transcription. + * `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. */ include?: Array<'item.input_audio_transcription.logprobs'>; @@ -67,50 +81,67 @@ export interface RealtimeSessionCreateResponse { /** * The Realtime model used for this session. 
*/ - model?: string; + model?: + | (string & {}) + | 'gpt-realtime' + | 'gpt-realtime-2025-08-28' + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-realtime-preview-2025-06-03' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; /** - * The object type. Always `realtime.session`. + * The set of modalities the model can respond with. It defaults to `["audio"]`, + * indicating that the model will respond with audio plus a transcript. `["text"]` + * can be used to make the model respond with text only. It is not possible to + * request both `text` and `audio` at the same time. */ - object?: string; + output_modalities?: Array<'text' | 'audio'>; /** - * The set of modalities the model can respond with. To disable audio, set this to - * ["text"]. + * Reference to a prompt template and its variables. + * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). */ - output_modalities?: Array<'text' | 'audio'>; + prompt?: ResponsesAPI.ResponsePrompt | null; /** - * How the model chooses tools. Options are `auto`, `none`, `required`, or specify - * a function. + * How the model chooses tools. Provide one of the string modes or force a specific + * function/MCP tool. */ - tool_choice?: string; + tool_choice?: ResponsesAPI.ToolChoiceOptions | ResponsesAPI.ToolChoiceFunction | ResponsesAPI.ToolChoiceMcp; /** - * Tools (functions) available to the model. + * Tools available to the model. */ - tools?: Array; + tools?: Array; /** - * Configuration options for tracing. Set to null to disable tracing. Once tracing - * is enabled for a session, the configuration cannot be modified. + * Realtime API can write session traces to the + * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once + * tracing is enabled for a session, the configuration cannot be modified. * * `auto` will create a trace for the session with default values for the workflow * name, group id, and metadata. */ - tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration; + tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration | null; /** - * Configuration for turn detection. Can be set to `null` to turn off. Server VAD - * means that the model will detect the start and end of speech based on audio - * volume and respond at the end of user speech. + * Controls how the realtime conversation is truncated prior to model inference. + * The default is `auto`. */ - turn_detection?: RealtimeSessionCreateResponse.TurnDetection; + truncation?: RealtimeAPI.RealtimeTruncation; + + /** + * The type of session to create. Always `realtime` for the Realtime API. + */ + type?: 'realtime'; } export namespace RealtimeSessionCreateResponse { /** - * Configuration for input and output audio for the session. + * Configuration for input and output audio. */ export interface Audio { input?: Audio.Input; @@ -121,79 +152,153 @@ export namespace RealtimeSessionCreateResponse { export namespace Audio { export interface Input { /** - * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + * The format of the input audio. */ - format?: string; + format?: RealtimeAPI.RealtimeAudioFormats; /** - * Configuration for input audio noise reduction. + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. 
Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. */ noise_reduction?: Input.NoiseReduction; /** - * Configuration for input audio transcription. + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. */ - transcription?: Input.Transcription; + transcription?: RealtimeAPI.AudioTranscription; /** - * Configuration for turn detection. + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. */ turn_detection?: Input.TurnDetection; } export namespace Input { /** - * Configuration for input audio noise reduction. + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. */ export interface NoiseReduction { - type?: 'near_field' | 'far_field'; + /** + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. + */ + type?: RealtimeAPI.NoiseReductionType; } /** - * Configuration for input audio transcription. + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. 
*/ - export interface Transcription { + export interface TurnDetection { /** - * The language of the input audio. + * Whether or not to automatically generate a response when a VAD stop event + * occurs. */ - language?: string; + create_response?: boolean; /** - * The model to use for transcription. + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + * and `high` have max timeouts of 8s, 4s, and 2s respectively. */ - model?: string; + eagerness?: 'low' | 'medium' | 'high' | 'auto'; /** - * Optional text to guide the model's style or continue a previous audio segment. + * Optional idle timeout after which turn detection will auto-timeout when no + * additional audio is received. */ - prompt?: string; - } + idle_timeout_ms?: number | null; - /** - * Configuration for turn detection. - */ - export interface TurnDetection { + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; + + /** + * Used only for `server_vad` mode. Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ prefix_padding_ms?: number; + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ silence_duration_ms?: number; + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ threshold?: number; /** - * Type of turn detection, only `server_vad` is currently supported. + * Type of turn detection. */ - type?: string; + type?: 'server_vad' | 'semantic_vad'; } } export interface Output { /** - * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + * The format of the output audio. */ - format?: string; + format?: RealtimeAPI.RealtimeAudioFormats; + /** + * The speed of the model's spoken response as a multiple of the original speed. + * 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + * This value can only be changed in between model turns, not while a response is + * in progress. + * + * This parameter is a post-processing adjustment to the audio after it is + * generated, it's also possible to prompt the model to speak faster or slower. + */ speed?: number; + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, + * and `cedar`. We recommend `marin` and `cedar` for best quality. + */ voice?: | (string & {}) | 'alloy' @@ -209,229 +314,324 @@ export namespace RealtimeSessionCreateResponse { } } - export interface Tool { + /** + * Give the model access to additional tools via remote Model Context Protocol + * (MCP) servers. + * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp). 
+ */ + export interface McpTool { /** - * The description of the function, including guidance on when and how to call it, - * and guidance about what to tell the user when calling (if anything). + * A label for this MCP server, used to identify it in tool calls. */ - description?: string; + server_label: string; /** - * The name of the function. + * The type of the MCP tool. Always `mcp`. */ - name?: string; + type: 'mcp'; /** - * Parameters of the function in JSON Schema. + * List of allowed tool names or a filter object. */ - parameters?: unknown; + allowed_tools?: Array | McpTool.McpToolFilter | null; /** - * The type of the tool, i.e. `function`. + * An OAuth access token that can be used with a remote MCP server, either with a + * custom MCP server URL or a service connector. Your application must handle the + * OAuth authorization flow and provide the token here. */ - type?: 'function'; - } + authorization?: string; - /** - * Granular configuration for tracing. - */ - export interface TracingConfiguration { /** - * The group id to attach to this trace to enable filtering and grouping in the - * traces dashboard. + * Identifier for service connectors, like those available in ChatGPT. One of + * `server_url` or `connector_id` must be provided. Learn more about service + * connectors + * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors). + * + * Currently supported `connector_id` values are: + * + * - Dropbox: `connector_dropbox` + * - Gmail: `connector_gmail` + * - Google Calendar: `connector_googlecalendar` + * - Google Drive: `connector_googledrive` + * - Microsoft Teams: `connector_microsoftteams` + * - Outlook Calendar: `connector_outlookcalendar` + * - Outlook Email: `connector_outlookemail` + * - SharePoint: `connector_sharepoint` */ - group_id?: string; + connector_id?: + | 'connector_dropbox' + | 'connector_gmail' + | 'connector_googlecalendar' + | 'connector_googledrive' + | 'connector_microsoftteams' + | 'connector_outlookcalendar' + | 'connector_outlookemail' + | 'connector_sharepoint'; /** - * The arbitrary metadata to attach to this trace to enable filtering in the traces - * dashboard. + * Optional HTTP headers to send to the MCP server. Use for authentication or other + * purposes. */ - metadata?: unknown; + headers?: { [key: string]: string } | null; /** - * The name of the workflow to attach to this trace. This is used to name the trace - * in the traces dashboard. + * Specify which of the MCP server's tools require approval. */ - workflow_name?: string; + require_approval?: McpTool.McpToolApprovalFilter | 'always' | 'never' | null; + + /** + * Optional description of the MCP server, used to provide more context. + */ + server_description?: string; + + /** + * The URL for the MCP server. One of `server_url` or `connector_id` must be + * provided. + */ + server_url?: string; } - /** - * Configuration for turn detection. Can be set to `null` to turn off. Server VAD - * means that the model will detect the start and end of speech based on audio - * volume and respond at the end of user speech. - */ - export interface TurnDetection { + export namespace McpTool { /** - * Amount of audio to include before the VAD detected speech (in milliseconds). - * Defaults to 300ms. + * A filter object to specify which tools are allowed. */ - prefix_padding_ms?: number; + export interface McpToolFilter { + /** + * Indicates whether or not a tool modifies data or is read-only. 
If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + + /** + * Specify which of the MCP server's tools require approval. Can be `always`, + * `never`, or a filter object associated with tools that require approval. + */ + export interface McpToolApprovalFilter { + /** + * A filter object to specify which tools are allowed. + */ + always?: McpToolApprovalFilter.Always; + + /** + * A filter object to specify which tools are allowed. + */ + never?: McpToolApprovalFilter.Never; + } + + export namespace McpToolApprovalFilter { + /** + * A filter object to specify which tools are allowed. + */ + export interface Always { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + + /** + * A filter object to specify which tools are allowed. + */ + export interface Never { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + } + } + /** + * Granular configuration for tracing. + */ + export interface TracingConfiguration { /** - * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. - * With shorter values the model will respond more quickly, but may jump in on - * short pauses from the user. + * The group id to attach to this trace to enable filtering and grouping in the + * Traces Dashboard. */ - silence_duration_ms?: number; + group_id?: string; /** - * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher - * threshold will require louder audio to activate the model, and thus might - * perform better in noisy environments. + * The arbitrary metadata to attach to this trace to enable filtering in the Traces + * Dashboard. */ - threshold?: number; + metadata?: unknown; /** - * Type of turn detection, only `server_vad` is currently supported. + * The name of the workflow to attach to this trace. This is used to name the trace + * in the Traces Dashboard. */ - type?: string; + workflow_name?: string; } } /** - * Response from creating a session and client secret for the Realtime API. + * Ephemeral key returned by the API. Only present when the session is created on + * the server via REST API. */ -export interface ClientSecretCreateResponse { +export interface RealtimeTranscriptionSessionClientSecret { /** - * Expiration timestamp for the client secret, in seconds since epoch. + * Timestamp for when the token expires. Currently, all tokens expire after one + * minute. */ expires_at: number; /** - * The session configuration for either a realtime or transcription session. - */ - session: - | RealtimeSessionCreateResponse - | ClientSecretCreateResponse.RealtimeTranscriptionSessionCreateResponse; - - /** - * The generated client secret value. 
+ * Ephemeral key usable in client environments to authenticate connections to the + * Realtime API. Use this in client-side environments rather than a standard API + * token, which should only be used server-side. */ value: string; } -export namespace ClientSecretCreateResponse { +/** + * A new Realtime transcription session configuration. + * + * When a session is created on the server via REST API, the session object also + * contains an ephemeral key. Default TTL for keys is 10 minutes. This property is + * not present when a session is updated via the WebSocket API. + */ +export interface RealtimeTranscriptionSessionCreateResponse { /** - * A Realtime transcription session configuration object. + * Ephemeral key returned by the API. Only present when the session is created on + * the server via REST API. */ - export interface RealtimeTranscriptionSessionCreateResponse { - /** - * Unique identifier for the session that looks like `sess_1234567890abcdef`. - */ - id?: string; - - /** - * Configuration for input audio for the session. - */ - audio?: RealtimeTranscriptionSessionCreateResponse.Audio; + client_secret: RealtimeTranscriptionSessionClientSecret; - /** - * Expiration timestamp for the session, in seconds since epoch. - */ - expires_at?: number; + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + input_audio_format?: string; - /** - * Additional fields to include in server outputs. - * - * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - * transcription. - */ - include?: Array<'item.input_audio_transcription.logprobs'>; + /** + * Configuration of the transcription model. + */ + input_audio_transcription?: RealtimeTranscriptionSessionInputAudioTranscription; - /** - * The object type. Always `realtime.transcription_session`. - */ - object?: string; - } + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; - export namespace RealtimeTranscriptionSessionCreateResponse { - /** - * Configuration for input audio for the session. - */ - export interface Audio { - input?: Audio.Input; - } + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + turn_detection?: RealtimeTranscriptionSessionTurnDetection; +} - export namespace Audio { - export interface Input { - /** - * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - */ - format?: string; +/** + * Configuration of the transcription model. + */ +export interface RealtimeTranscriptionSessionInputAudioTranscription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; - /** - * Configuration for input audio noise reduction. - */ - noise_reduction?: Input.NoiseReduction; + /** + * The model to use for transcription. Current options are `whisper-1`, + * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + */ + model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe'; - /** - * Configuration of the transcription model. 
- */ - transcription?: Input.Transcription; + /** + * An optional text to guide the model's style or continue a previous audio + * segment. For `whisper-1`, the + * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + * For `gpt-4o-transcribe` models, the prompt is a free text string, for example + * "expect words related to technology". + */ + prompt?: string; +} - /** - * Configuration for turn detection. - */ - turn_detection?: Input.TurnDetection; - } +/** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ +export interface RealtimeTranscriptionSessionTurnDetection { + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; - export namespace Input { - /** - * Configuration for input audio noise reduction. - */ - export interface NoiseReduction { - type?: 'near_field' | 'far_field'; - } + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; - /** - * Configuration of the transcription model. - */ - export interface Transcription { - /** - * The language of the input audio. Supplying the input language in - * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - * format will improve accuracy and latency. - */ - language?: string; - - /** - * The model to use for transcription. Can be `gpt-4o-transcribe`, - * `gpt-4o-mini-transcribe`, or `whisper-1`. - */ - model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; - - /** - * An optional text to guide the model's style or continue a previous audio - * segment. The - * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - * should match the audio language. - */ - prompt?: string; - } + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; - /** - * Configuration for turn detection. - */ - export interface TurnDetection { - prefix_padding_ms?: number; + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; +} - silence_duration_ms?: number; +/** + * Response from creating a session and client secret for the Realtime API. + */ +export interface ClientSecretCreateResponse { + /** + * Expiration timestamp for the client secret, in seconds since epoch. + */ + expires_at: number; - threshold?: number; + /** + * The session configuration for either a realtime or transcription session. + */ + session: RealtimeSessionCreateResponse | RealtimeTranscriptionSessionCreateResponse; - /** - * Type of turn detection, only `server_vad` is currently supported. - */ - type?: string; - } - } - } - } + /** + * The generated client secret value. + */ + value: string; } export interface ClientSecretCreateParams { /** - * Configuration for the ephemeral token expiration. + * Configuration for the client secret expiration. Expiration refers to the time + * after which a client secret will no longer be valid for creating sessions. The + * session itself may continue after that time once started. 
A secret can be used + * to create multiple sessions until it expires. */ expires_after?: ClientSecretCreateParams.ExpiresAfter; @@ -444,18 +644,23 @@ export interface ClientSecretCreateParams { export namespace ClientSecretCreateParams { /** - * Configuration for the ephemeral token expiration. + * Configuration for the client secret expiration. Expiration refers to the time + * after which a client secret will no longer be valid for creating sessions. The + * session itself may continue after that time once started. A secret can be used + * to create multiple sessions until it expires. */ export interface ExpiresAfter { /** - * The anchor point for the ephemeral token expiration. Only `created_at` is - * currently supported. + * The anchor point for the client secret expiration, meaning that `seconds` will + * be added to the `created_at` time of the client secret to produce an expiration + * timestamp. Only `created_at` is currently supported. */ anchor?: 'created_at'; /** * The number of seconds from the anchor point to the expiration. Select a value - * between `10` and `7200`. + * between `10` and `7200` (2 hours). This default to 600 seconds (10 minutes) if + * not specified. */ seconds?: number; } @@ -463,7 +668,12 @@ export namespace ClientSecretCreateParams { export declare namespace ClientSecrets { export { + type RealtimeSessionClientSecret as RealtimeSessionClientSecret, type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse, + type RealtimeTranscriptionSessionClientSecret as RealtimeTranscriptionSessionClientSecret, + type RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse, + type RealtimeTranscriptionSessionInputAudioTranscription as RealtimeTranscriptionSessionInputAudioTranscription, + type RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection, type ClientSecretCreateResponse as ClientSecretCreateResponse, type ClientSecretCreateParams as ClientSecretCreateParams, }; diff --git a/src/resources/realtime/index.ts b/src/resources/realtime/index.ts index a6c5db35e..550532500 100644 --- a/src/resources/realtime/index.ts +++ b/src/resources/realtime/index.ts @@ -2,7 +2,12 @@ export { ClientSecrets, + type RealtimeSessionClientSecret, type RealtimeSessionCreateResponse, + type RealtimeTranscriptionSessionClientSecret, + type RealtimeTranscriptionSessionCreateResponse, + type RealtimeTranscriptionSessionInputAudioTranscription, + type RealtimeTranscriptionSessionTurnDetection, type ClientSecretCreateResponse, type ClientSecretCreateParams, } from './client-secrets'; diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts index e05f4fb6d..9dee11e11 100644 --- a/src/resources/realtime/realtime.ts +++ b/src/resources/realtime/realtime.ts @@ -8,7 +8,12 @@ import { ClientSecretCreateParams, ClientSecretCreateResponse, ClientSecrets, + RealtimeSessionClientSecret, RealtimeSessionCreateResponse, + RealtimeTranscriptionSessionClientSecret, + RealtimeTranscriptionSessionCreateResponse, + RealtimeTranscriptionSessionInputAudioTranscription, + RealtimeTranscriptionSessionTurnDetection, } from './client-secrets'; import * as ResponsesAPI from '../responses/responses'; @@ -16,6 +21,30 @@ export class Realtime extends APIResource { clientSecrets: ClientSecretsAPI.ClientSecrets = new ClientSecretsAPI.ClientSecrets(this._client); } +export interface AudioTranscription { + /** + * The language of the input audio. 
Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; + + /** + * The model to use for transcription. Current options are `whisper-1`, + * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + */ + model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe'; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. For `whisper-1`, the + * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + * For `gpt-4o-transcribe` models, the prompt is a free text string, for example + * "expect words related to technology". + */ + prompt?: string; +} + /** * Returned when a conversation is created. Emitted right after session creation. */ @@ -68,7 +97,20 @@ export type ConversationItem = | RealtimeMcpApprovalRequest; /** - * Returned when a conversation item is added. + * Sent by the server when an Item is added to the default Conversation. This can + * happen in several cases: + * + * - When the client sends a `conversation.item.create` event. + * - When the input audio buffer is committed. In this case the item will be a user + * message containing the audio from the buffer. + * - When the model is generating a Response. In this case the + * `conversation.item.added` event will be sent when the model starts generating + * a specific Item, and thus it will not yet have any content (and `status` will + * be `in_progress`). + * + * The event will include the full content of the Item (except when model is + * generating a Response) except for audio data, which can be retrieved separately + * with a `conversation.item.retrieve` event if necessary. */ export interface ConversationItemAdded { /** @@ -212,6 +254,9 @@ export interface ConversationItemDeletedEvent { /** * Returned when a conversation item is finalized. + * + * The event will include the full content of the Item except for audio data, which + * can be retrieved separately with a `conversation.item.retrieve` event if needed. */ export interface ConversationItemDone { /** @@ -239,9 +284,9 @@ export interface ConversationItemDone { /** * This event is the output of audio transcription for user audio written to the * user audio buffer. Transcription begins when the input audio buffer is committed - * by the client or server (in `server_vad` mode). Transcription runs - * asynchronously with Response creation, so this event may come before or after - * the Response events. + * by the client or server (when VAD is enabled). Transcription runs asynchronously + * with Response creation, so this event may come before or after the Response + * events. * * Realtime API models accept audio natively, and thus input transcription is a * separate process run on a separate ASR (Automatic Speech Recognition) model. The @@ -260,7 +305,7 @@ export interface ConversationItemInputAudioTranscriptionCompletedEvent { event_id: string; /** - * The ID of the user message item containing the audio. + * The ID of the item containing the audio that is being transcribed. */ item_id: string; @@ -275,7 +320,8 @@ export interface ConversationItemInputAudioTranscriptionCompletedEvent { type: 'conversation.item.input_audio_transcription.completed'; /** - * Usage statistics for the transcription. 
+ * Usage statistics for the transcription, this is billed according to the ASR + * model's pricing rather than the realtime model's pricing. */ usage: | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageTokens @@ -353,7 +399,7 @@ export namespace ConversationItemInputAudioTranscriptionCompletedEvent { /** * Returned when the text value of an input audio transcription content part is - * updated. + * updated with incremental transcription results. */ export interface ConversationItemInputAudioTranscriptionDeltaEvent { /** @@ -362,7 +408,7 @@ export interface ConversationItemInputAudioTranscriptionDeltaEvent { event_id: string; /** - * The ID of the item. + * The ID of the item containing the audio that is being transcribed. */ item_id: string; @@ -382,7 +428,12 @@ export interface ConversationItemInputAudioTranscriptionDeltaEvent { delta?: string; /** - * The log probabilities of the transcription. + * The log probabilities of the transcription. These can be enabled by + * configurating the session with + * `"include": ["item.input_audio_transcription.logprobs"]`. Each entry in the + * array corresponds a log probability of which token would be selected for this + * chunk of transcription. This can help to identify if it was possible there were + * multiple valid options for a given chunk of transcription. */ logprobs?: Array | null; } @@ -542,7 +593,7 @@ export interface ConversationItemTruncateEvent { audio_end_ms: number; /** - * The index of the content part to truncate. Set this to 0. + * The index of the content part to truncate. Set this to `0`. */ content_index: number; @@ -701,14 +752,19 @@ export namespace ConversationItemWithReference { /** * Send this event to append audio bytes to the input audio buffer. The audio - * buffer is temporary storage you can write to and later commit. In Server VAD - * mode, the audio buffer is used to detect speech and the server will decide when - * to commit. When Server VAD is disabled, you must commit the audio buffer - * manually. + * buffer is temporary storage you can write to and later commit. A "commit" will + * create a new user message item in the conversation history from the buffer + * content and clear the buffer. Input audio transcription (if enabled) will be + * generated when the buffer is committed. + * + * If VAD is enabled the audio buffer is used to detect speech and the server will + * decide when to commit. When Server VAD is disabled, you must commit the audio + * buffer manually. Input audio noise reduction operates on writes to the audio + * buffer. * * The client may choose how much audio to place in each event up to a maximum of * 15 MiB, for example streaming smaller chunks from the client may allow the VAD - * to be more responsive. Unlike made other client events, the server will not send + * to be more responsive. Unlike most other client events, the server will not send * a confirmation response to this event. */ export interface InputAudioBufferAppendEvent { @@ -988,6 +1044,36 @@ export interface McpListToolsInProgress { type: 'mcp_list_tools.in_progress'; } +export interface Models { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. 
+ */ + type?: 'function'; +} + +/** + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. + */ +export type NoiseReductionType = 'near_field' | 'far_field'; + /** * **WebRTC Only:** Emit to cut off the current audio response. This will trigger * the server to stop generating audio and emit a `output_audio_buffer.cleared` @@ -1058,212 +1144,217 @@ export namespace RateLimitsUpdatedEvent { * Configuration for input and output audio. */ export interface RealtimeAudioConfig { - input?: RealtimeAudioConfig.Input; + input?: RealtimeAudioConfigInput; - output?: RealtimeAudioConfig.Output; + output?: RealtimeAudioConfigOutput; } -export namespace RealtimeAudioConfig { - export interface Input { - /** - * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For - * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel - * (mono), and little-endian byte order. - */ - format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; +export interface RealtimeAudioConfigInput { + /** + * The format of the input audio. + */ + format?: RealtimeAudioFormats; - /** - * Configuration for input audio noise reduction. This can be set to `null` to turn - * off. Noise reduction filters audio added to the input audio buffer before it is - * sent to VAD and the model. Filtering the audio can improve VAD and turn - * detection accuracy (reducing false positives) and model performance by improving - * perception of the input audio. - */ - noise_reduction?: Input.NoiseReduction; + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + noise_reduction?: RealtimeAudioConfigInput.NoiseReduction; - /** - * Configuration for input audio transcription, defaults to off and can be set to - * `null` to turn off once on. Input audio transcription is not native to the - * model, since the model consumes audio directly. Transcription runs - * asynchronously through - * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) - * and should be treated as guidance of input audio content rather than precisely - * what the model heard. The client can optionally set the language and prompt for - * transcription, these offer additional guidance to the transcription service. - */ - transcription?: Input.Transcription; + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. + */ + transcription?: AudioTranscription; + + /** + * Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ + turn_detection?: RealtimeAudioInputTurnDetection; +} +export namespace RealtimeAudioConfigInput { + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + export interface NoiseReduction { /** - * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. */ - turn_detection?: Input.TurnDetection; + type?: RealtimeAPI.NoiseReductionType; } +} + +export interface RealtimeAudioConfigOutput { + /** + * The format of the output audio. + */ + format?: RealtimeAudioFormats; + + /** + * The speed of the model's spoken response as a multiple of the original speed. + * 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + * This value can only be changed in between model turns, not while a response is + * in progress. + * + * This parameter is a post-processing adjustment to the audio after it is + * generated, it's also possible to prompt the model to speak faster or slower. + */ + speed?: number; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, + * and `cedar`. We recommend `marin` and `cedar` for best quality. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; +} + +/** + * The PCM audio format. Only a 24kHz sample rate is supported. 
+ */ +export type RealtimeAudioFormats = + | RealtimeAudioFormats.AudioPCM + | RealtimeAudioFormats.AudioPCMU + | RealtimeAudioFormats.AudioPCMA; - export namespace Input { +export namespace RealtimeAudioFormats { + /** + * The PCM audio format. Only a 24kHz sample rate is supported. + */ + export interface AudioPCM { /** - * Configuration for input audio noise reduction. This can be set to `null` to turn - * off. Noise reduction filters audio added to the input audio buffer before it is - * sent to VAD and the model. Filtering the audio can improve VAD and turn - * detection accuracy (reducing false positives) and model performance by improving - * perception of the input audio. + * The sample rate of the audio. Always `24000`. */ - export interface NoiseReduction { - /** - * Type of noise reduction. `near_field` is for close-talking microphones such as - * headphones, `far_field` is for far-field microphones such as laptop or - * conference room microphones. - */ - type?: 'near_field' | 'far_field'; - } + rate?: 24000; /** - * Configuration for input audio transcription, defaults to off and can be set to - * `null` to turn off once on. Input audio transcription is not native to the - * model, since the model consumes audio directly. Transcription runs - * asynchronously through - * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) - * and should be treated as guidance of input audio content rather than precisely - * what the model heard. The client can optionally set the language and prompt for - * transcription, these offer additional guidance to the transcription service. + * The audio format. Always `audio/pcm`. */ - export interface Transcription { - /** - * The language of the input audio. Supplying the input language in - * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - * format will improve accuracy and latency. - */ - language?: string; - - /** - * The model to use for transcription. Current options are `whisper-1`, - * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and - * `gpt-4o-transcribe-diarize`. - */ - model?: - | 'whisper-1' - | 'gpt-4o-transcribe-latest' - | 'gpt-4o-mini-transcribe' - | 'gpt-4o-transcribe' - | 'gpt-4o-transcribe-diarize'; - - /** - * An optional text to guide the model's style or continue a previous audio - * segment. For `whisper-1`, the - * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - * For `gpt-4o-transcribe` models, the prompt is a free text string, for example - * "expect words related to technology". - */ - prompt?: string; - } + type?: 'audio/pcm'; + } + /** + * The G.711 μ-law format. + */ + export interface AudioPCMU { /** - * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. 
This can be useful for more natural - * conversations, but may have a higher latency. + * The audio format. Always `audio/pcmu`. */ - export interface TurnDetection { - /** - * Whether or not to automatically generate a response when a VAD stop event - * occurs. - */ - create_response?: boolean; - - /** - * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - * will wait longer for the user to continue speaking, `high` will respond more - * quickly. `auto` is the default and is equivalent to `medium`. - */ - eagerness?: 'low' | 'medium' | 'high' | 'auto'; + type?: 'audio/pcmu'; + } - /** - * Optional idle timeout after which turn detection will auto-timeout when no - * additional audio is received. - */ - idle_timeout_ms?: number | null; + /** + * The G.711 A-law format. + */ + export interface AudioPCMA { + /** + * The audio format. Always `audio/pcma`. + */ + type?: 'audio/pcma'; + } +} - /** - * Whether or not to automatically interrupt any ongoing response with output to - * the default conversation (i.e. `conversation` of `auto`) when a VAD start event - * occurs. - */ - interrupt_response?: boolean; +/** + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ +export interface RealtimeAudioInputTurnDetection { + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; - /** - * Used only for `server_vad` mode. Amount of audio to include before the VAD - * detected speech (in milliseconds). Defaults to 300ms. - */ - prefix_padding_ms?: number; + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + * and `high` have max timeouts of 8s, 4s, and 2s respectively. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; - /** - * Used only for `server_vad` mode. Duration of silence to detect speech stop (in - * milliseconds). Defaults to 500ms. With shorter values the model will respond - * more quickly, but may jump in on short pauses from the user. - */ - silence_duration_ms?: number; + /** + * Optional idle timeout after which turn detection will auto-timeout when no + * additional audio is received. + */ + idle_timeout_ms?: number | null; - /** - * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - * defaults to 0.5. A higher threshold will require louder audio to activate the - * model, and thus might perform better in noisy environments. - */ - threshold?: number; + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. 
`conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; - /** - * Type of turn detection. - */ - type?: 'server_vad' | 'semantic_vad'; - } - } + /** + * Used only for `server_vad` mode. Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: number; - export interface Output { - /** - * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - * For `pcm16`, output audio is sampled at a rate of 24kHz. - */ - format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: number; - /** - * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the - * minimum speed. 1.5 is the maximum speed. This value can only be changed in - * between model turns, not while a response is in progress. - */ - speed?: number; + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ + threshold?: number; - /** - * The voice the model uses to respond. Voice cannot be changed during the session - * once the model has responded with audio at least once. Current voice options are - * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, - * and `cedar`. - */ - voice?: - | (string & {}) - | 'alloy' - | 'ash' - | 'ballad' - | 'coral' - | 'echo' - | 'sage' - | 'shimmer' - | 'verse' - | 'marin' - | 'cedar'; - } + /** + * Type of turn detection. + */ + type?: 'server_vad' | 'semantic_vad'; } /** @@ -1283,35 +1374,6 @@ export type RealtimeClientEvent = | SessionUpdateEvent | TranscriptionSessionUpdate; -/** - * Configuration options for the generated client secret. - */ -export interface RealtimeClientSecretConfig { - /** - * Configuration for the ephemeral token expiration. - */ - expires_after?: RealtimeClientSecretConfig.ExpiresAfter; -} - -export namespace RealtimeClientSecretConfig { - /** - * Configuration for the ephemeral token expiration. - */ - export interface ExpiresAfter { - /** - * The anchor point for the ephemeral token expiration. Only `created_at` is - * currently supported. - */ - anchor: 'created_at'; - - /** - * The number of seconds from the anchor point to the expiration. Select a value - * between `10` and `7200`. - */ - seconds?: number; - } -} - /** * An assistant message item in a Realtime conversation. */ @@ -1332,12 +1394,14 @@ export interface RealtimeConversationItemAssistantMessage { type: 'message'; /** - * The unique ID of the item. + * The unique ID of the item. This may be provided by the client or generated by + * the server. */ id?: string; /** - * Identifier for the API object being returned - always `realtime.item`. + * Identifier for the API object being returned - always `realtime.item`. Optional + * when creating a new item. */ object?: 'realtime.item'; @@ -1349,15 +1413,29 @@ export interface RealtimeConversationItemAssistantMessage { export namespace RealtimeConversationItemAssistantMessage { export interface Content { + /** + * Base64-encoded audio bytes, these will be parsed as the format specified in the + * session output audio type configuration. 
This defaults to PCM 16-bit 24kHz mono + * if not specified. + */ + audio?: string; + /** * The text content. */ text?: string; /** - * The content type. Always `text` for assistant messages. + * The transcript of the audio content, this will always be present if the output + * type is `audio`. + */ + transcript?: string; + + /** + * The content type, `output_text` or `output_audio` depending on the session + * `output_modalities` configuration. */ - type?: 'text'; + type?: 'output_text' | 'output_audio'; } } @@ -1366,7 +1444,9 @@ export namespace RealtimeConversationItemAssistantMessage { */ export interface RealtimeConversationItemFunctionCall { /** - * The arguments of the function call. + * The arguments of the function call. This is a JSON-encoded string representing + * the arguments passed to the function, for example + * `{"arg1": "value1", "arg2": 42}`. */ arguments: string; @@ -1381,7 +1461,8 @@ export interface RealtimeConversationItemFunctionCall { type: 'function_call'; /** - * The unique ID of the item. + * The unique ID of the item. This may be provided by the client or generated by + * the server. */ id?: string; @@ -1391,7 +1472,8 @@ export interface RealtimeConversationItemFunctionCall { call_id?: string; /** - * Identifier for the API object being returned - always `realtime.item`. + * Identifier for the API object being returned - always `realtime.item`. Optional + * when creating a new item. */ object?: 'realtime.item'; @@ -1411,7 +1493,8 @@ export interface RealtimeConversationItemFunctionCallOutput { call_id: string; /** - * The output of the function call. + * The output of the function call, this is free text and can contain any + * information or simply be empty. */ output: string; @@ -1421,12 +1504,14 @@ export interface RealtimeConversationItemFunctionCallOutput { type: 'function_call_output'; /** - * The unique ID of the item. + * The unique ID of the item. This may be provided by the client or generated by + * the server. */ id?: string; /** - * Identifier for the API object being returned - always `realtime.item`. + * Identifier for the API object being returned - always `realtime.item`. Optional + * when creating a new item. */ object?: 'realtime.item'; @@ -1437,7 +1522,12 @@ export interface RealtimeConversationItemFunctionCallOutput { } /** - * A system message item in a Realtime conversation. + * A system message in a Realtime conversation can be used to provide additional + * context or instructions to the model. This is similar but distinct from the + * instruction prompt provided at the start of a conversation, as system messages + * can be added at any point in the conversation. For major changes to the + * conversation's behavior, use instructions, but for smaller updates (e.g. "the + * user is now asking about a different topic"), use system messages. */ export interface RealtimeConversationItemSystemMessage { /** @@ -1456,12 +1546,14 @@ export interface RealtimeConversationItemSystemMessage { type: 'message'; /** - * The unique ID of the item. + * The unique ID of the item. This may be provided by the client or generated by + * the server. */ id?: string; /** - * Identifier for the API object being returned - always `realtime.item`. + * Identifier for the API object being returned - always `realtime.item`. Optional + * when creating a new item. */ object?: 'realtime.item'; @@ -1505,12 +1597,14 @@ export interface RealtimeConversationItemUserMessage { type: 'message'; /** - * The unique ID of the item. + * The unique ID of the item. 
This may be provided by the client or generated by + * the server. */ id?: string; /** - * Identifier for the API object being returned - always `realtime.item`. + * Identifier for the API object being returned - always `realtime.item`. Optional + * when creating a new item. */ object?: 'realtime.item'; @@ -1523,24 +1617,40 @@ export interface RealtimeConversationItemUserMessage { export namespace RealtimeConversationItemUserMessage { export interface Content { /** - * Base64-encoded audio bytes (for `input_audio`). + * Base64-encoded audio bytes (for `input_audio`), these will be parsed as the + * format specified in the session input audio type configuration. This defaults to + * PCM 16-bit 24kHz mono if not specified. */ audio?: string; + /** + * The detail level of the image (for `input_image`). `auto` will default to + * `high`. + */ + detail?: 'auto' | 'low' | 'high'; + + /** + * Base64-encoded image bytes (for `input_image`) as a data URI. For example + * `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported formats are PNG + * and JPEG. + */ + image_url?: string; + /** * The text content (for `input_text`). */ text?: string; /** - * Transcript of the audio (for `input_audio`). + * Transcript of the audio (for `input_audio`). This is not sent to the model, but + * will be attached to the message item for reference. */ transcript?: string; /** - * The content type (`input_text` or `input_audio`). + * The content type (`input_text`, `input_audio`, or `input_image`). */ - type?: 'input_text' | 'input_audio'; + type?: 'input_text' | 'input_audio' | 'input_image'; } } @@ -1780,18 +1890,22 @@ export interface RealtimeMcphttpError { */ export interface RealtimeResponse { /** - * The unique ID of the response. + * The unique ID of the response, will look like `resp_1234`. */ id?: string; + /** + * Configuration for audio output. + */ + audio?: RealtimeResponse.Audio; + /** * Which conversation the response is added to, determined by the `conversation` * field in the `response.create` event. If `auto`, the response will be added to * the default conversation and the value of `conversation_id` will be an id like * `conv_1234`. If `none`, the response will not be added to any conversation and * the value of `conversation_id` will be `null`. If responses are being triggered - * by server VAD, the response will be added to the default conversation, thus the - * `conversation_id` will be an id like `conv_1234`. + * automatically by VAD the response will be added to the default conversation */ conversation_id?: string; @@ -1812,14 +1926,7 @@ export interface RealtimeResponse { metadata?: Shared.Metadata | null; /** - * The set of modalities the model used to respond. If there are multiple - * modalities, the model will pick one, for example if `modalities` is - * `["text", "audio"]`, the model could be responding in either text or audio. - */ - modalities?: Array<'text' | 'audio'>; - - /** - * The object type, must be `realtime.response`. + * The object type, must be `realtime.response`. */ object?: 'realtime.response'; @@ -1829,9 +1936,12 @@ export interface RealtimeResponse { output?: Array; /** - * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + * The set of modalities the model used to respond, currently the only possible + * values are `[\"audio\"]`, `[\"text\"]`. Audio output always include a text + * transcript. Setting the output to mode `text` will disable audio output from the + * model. 
*/ - output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + output_modalities?: Array<'text' | 'audio'>; /** * The final status of the response (`completed`, `cancelled`, `failed`, or @@ -1844,11 +1954,6 @@ export interface RealtimeResponse { */ status_details?: RealtimeResponseStatus; - /** - * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. - */ - temperature?: number; - /** * Usage statistics for the Response, this will correspond to billing. A Realtime * API session will maintain a conversation context and append new Items to the @@ -1856,23 +1961,313 @@ export interface RealtimeResponse { * become the input for later turns. */ usage?: RealtimeResponseUsage; +} +export namespace RealtimeResponse { /** - * The voice the model used to respond. Current voice options are `alloy`, `ash`, - * `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`. + * Configuration for audio output. */ - voice?: - | (string & {}) - | 'alloy' - | 'ash' - | 'ballad' - | 'coral' - | 'echo' - | 'sage' - | 'shimmer' - | 'verse' - | 'marin' - | 'cedar'; + export interface Audio { + output?: Audio.Output; + } + + export namespace Audio { + export interface Output { + /** + * The format of the output audio. + */ + format?: RealtimeAPI.RealtimeAudioFormats; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, + * and `cedar`. We recommend `marin` and `cedar` for best quality. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; + } + } +} + +/** + * Configuration for audio input and output. + */ +export interface RealtimeResponseCreateAudioOutput { + output?: RealtimeResponseCreateAudioOutput.Output; +} + +export namespace RealtimeResponseCreateAudioOutput { + export interface Output { + /** + * The format of the output audio. + */ + format?: RealtimeAPI.RealtimeAudioFormats; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, + * and `cedar`. We recommend `marin` and `cedar` for best quality. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; + } +} + +/** + * Give the model access to additional tools via remote Model Context Protocol + * (MCP) servers. + * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp). + */ +export interface RealtimeResponseCreateMcpTool { + /** + * A label for this MCP server, used to identify it in tool calls. + */ + server_label: string; + + /** + * The type of the MCP tool. Always `mcp`. + */ + type: 'mcp'; + + /** + * List of allowed tool names or a filter object. + */ + allowed_tools?: Array | RealtimeResponseCreateMcpTool.McpToolFilter | null; + + /** + * An OAuth access token that can be used with a remote MCP server, either with a + * custom MCP server URL or a service connector. Your application must handle the + * OAuth authorization flow and provide the token here. + */ + authorization?: string; + + /** + * Identifier for service connectors, like those available in ChatGPT. 
One of + * `server_url` or `connector_id` must be provided. Learn more about service + * connectors + * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors). + * + * Currently supported `connector_id` values are: + * + * - Dropbox: `connector_dropbox` + * - Gmail: `connector_gmail` + * - Google Calendar: `connector_googlecalendar` + * - Google Drive: `connector_googledrive` + * - Microsoft Teams: `connector_microsoftteams` + * - Outlook Calendar: `connector_outlookcalendar` + * - Outlook Email: `connector_outlookemail` + * - SharePoint: `connector_sharepoint` + */ + connector_id?: + | 'connector_dropbox' + | 'connector_gmail' + | 'connector_googlecalendar' + | 'connector_googledrive' + | 'connector_microsoftteams' + | 'connector_outlookcalendar' + | 'connector_outlookemail' + | 'connector_sharepoint'; + + /** + * Optional HTTP headers to send to the MCP server. Use for authentication or other + * purposes. + */ + headers?: { [key: string]: string } | null; + + /** + * Specify which of the MCP server's tools require approval. + */ + require_approval?: RealtimeResponseCreateMcpTool.McpToolApprovalFilter | 'always' | 'never' | null; + + /** + * Optional description of the MCP server, used to provide more context. + */ + server_description?: string; + + /** + * The URL for the MCP server. One of `server_url` or `connector_id` must be + * provided. + */ + server_url?: string; +} + +export namespace RealtimeResponseCreateMcpTool { + /** + * A filter object to specify which tools are allowed. + */ + export interface McpToolFilter { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + + /** + * Specify which of the MCP server's tools require approval. Can be `always`, + * `never`, or a filter object associated with tools that require approval. + */ + export interface McpToolApprovalFilter { + /** + * A filter object to specify which tools are allowed. + */ + always?: McpToolApprovalFilter.Always; + + /** + * A filter object to specify which tools are allowed. + */ + never?: McpToolApprovalFilter.Never; + } + + export namespace McpToolApprovalFilter { + /** + * A filter object to specify which tools are allowed. + */ + export interface Always { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + + /** + * A filter object to specify which tools are allowed. + */ + export interface Never { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + } +} + +/** + * Create a new Realtime response with these parameters + */ +export interface RealtimeResponseCreateParams { + /** + * Configuration for audio input and output. 
+ */ + audio?: RealtimeResponseCreateAudioOutput; + + /** + * Controls which conversation the response is added to. Currently supports `auto` + * and `none`, with `auto` as the default value. The `auto` value means that the + * contents of the response will be added to the default conversation. Set this to + * `none` to create an out-of-band response which will not add items to default + * conversation. + */ + conversation?: (string & {}) | 'auto' | 'none'; + + /** + * Input items to include in the prompt for the model. Using this field creates a + * new context for this Response instead of using the default conversation. An + * empty array `[]` will clear the context for this Response. Note that this can + * include references to items that previously appeared in the session using their + * id. + */ + input?: Array; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. Note that the server sets default + * instructions which will be used if this field is not set and are visible in the + * `session.created` event at the start of the session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_output_tokens?: number | 'inf'; + + /** + * Set of 16 key-value pairs that can be attached to an object. This can be useful + * for storing additional information about the object in a structured format, and + * querying for objects via API or the dashboard. + * + * Keys are strings with a maximum length of 64 characters. Values are strings with + * a maximum length of 512 characters. + */ + metadata?: Shared.Metadata | null; + + /** + * The set of modalities the model used to respond, currently the only possible + * values are `[\"audio\"]`, `[\"text\"]`. Audio output always include a text + * transcript. Setting the output to mode `text` will disable audio output from the + * model. + */ + output_modalities?: Array<'text' | 'audio'>; + + /** + * Reference to a prompt template and its variables. + * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). + */ + prompt?: ResponsesAPI.ResponsePrompt | null; + + /** + * How the model chooses tools. Provide one of the string modes or force a specific + * function/MCP tool. + */ + tool_choice?: ResponsesAPI.ToolChoiceOptions | ResponsesAPI.ToolChoiceFunction | ResponsesAPI.ToolChoiceMcp; + + /** + * Tools available to the model. + */ + tools?: Array; } /** @@ -1927,7 +2322,10 @@ export namespace RealtimeResponseStatus { */ export interface RealtimeResponseUsage { /** - * Details about the input tokens used in the Response. + * Details about the input tokens used in the Response. Cached tokens are tokens + * from previous turns in the conversation that are included as context for the + * current response. 
Cached tokens here are counted as a subset of input tokens, + * meaning input tokens will include cached and uncached tokens. */ input_token_details?: RealtimeResponseUsageInputTokenDetails; @@ -1956,25 +2354,60 @@ export interface RealtimeResponseUsage { } /** - * Details about the input tokens used in the Response. + * Details about the input tokens used in the Response. Cached tokens are tokens + * from previous turns in the conversation that are included as context for the + * current response. Cached tokens here are counted as a subset of input tokens, + * meaning input tokens will include cached and uncached tokens. */ export interface RealtimeResponseUsageInputTokenDetails { /** - * The number of audio tokens used in the Response. + * The number of audio tokens used as input for the Response. */ audio_tokens?: number; /** - * The number of cached tokens used in the Response. + * The number of cached tokens used as input for the Response. */ cached_tokens?: number; /** - * The number of text tokens used in the Response. + * Details about the cached tokens used as input for the Response. + */ + cached_tokens_details?: RealtimeResponseUsageInputTokenDetails.CachedTokensDetails; + + /** + * The number of image tokens used as input for the Response. + */ + image_tokens?: number; + + /** + * The number of text tokens used as input for the Response. */ text_tokens?: number; } +export namespace RealtimeResponseUsageInputTokenDetails { + /** + * Details about the cached tokens used as input for the Response. + */ + export interface CachedTokensDetails { + /** + * The number of cached audio tokens used as input for the Response. + */ + audio_tokens?: number; + + /** + * The number of cached image tokens used as input for the Response. + */ + image_tokens?: number; + + /** + * The number of cached text tokens used as input for the Response. + */ + text_tokens?: number; + } +} + /** * Details about the output tokens used in the Response. */ @@ -2045,7 +2478,10 @@ export type RealtimeServerEvent = export namespace RealtimeServerEvent { /** * Returned when a conversation item is retrieved with - * `conversation.item.retrieve`. + * `conversation.item.retrieve`. This is provided as a way to fetch the server's + * representation of an item, for example to get access to the post-processed audio + * data after noise cancellation and VAD. It includes the full content of the Item, + * including audio data. */ export interface ConversationItemRetrieved { /** @@ -2184,7 +2620,7 @@ export interface RealtimeSession { * what the model heard. The client can optionally set the language and prompt for * transcription, these offer additional guidance to the transcription service. */ - input_audio_transcription?: RealtimeSession.InputAudioTranscription | null; + input_audio_transcription?: AudioTranscription | null; /** * The default system instructions (i.e. system message) prepended to model calls. @@ -2266,7 +2702,7 @@ export interface RealtimeSession { /** * Tools (functions) available to the model. */ - tools?: Array; + tools?: Array; /** * Configuration options for tracing. Set to null to disable tracing. Once tracing @@ -2324,64 +2760,7 @@ export namespace RealtimeSession { * headphones, `far_field` is for far-field microphones such as laptop or * conference room microphones. */ - type?: 'near_field' | 'far_field'; - } - - /** - * Configuration for input audio transcription, defaults to off and can be set to - * `null` to turn off once on. 
Input audio transcription is not native to the - * model, since the model consumes audio directly. Transcription runs - * asynchronously through - * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) - * and should be treated as guidance of input audio content rather than precisely - * what the model heard. The client can optionally set the language and prompt for - * transcription, these offer additional guidance to the transcription service. - */ - export interface InputAudioTranscription { - /** - * The language of the input audio. Supplying the input language in - * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - * format will improve accuracy and latency. - */ - language?: string; - - /** - * The model to use for transcription, current options are `gpt-4o-transcribe`, - * `gpt-4o-mini-transcribe`, and `whisper-1`. - */ - model?: string; - - /** - * An optional text to guide the model's style or continue a previous audio - * segment. For `whisper-1`, the - * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - * For `gpt-4o-transcribe` models, the prompt is a free text string, for example - * "expect words related to technology". - */ - prompt?: string; - } - - export interface Tool { - /** - * The description of the function, including guidance on when and how to call it, - * and guidance about what to tell the user when calling (if anything). - */ - description?: string; - - /** - * The name of the function. - */ - name?: string; - - /** - * Parameters of the function in JSON Schema. - */ - parameters?: unknown; - - /** - * The type of the tool, i.e. `function`. - */ - type?: 'function'; + type?: RealtimeAPI.NoiseReductionType; } /** @@ -2477,22 +2856,6 @@ export namespace RealtimeSession { * Realtime session object configuration. */ export interface RealtimeSessionCreateRequest { - /** - * The Realtime model used for this session. - */ - model: - | (string & {}) - | 'gpt-realtime' - | 'gpt-realtime-2025-08-28' - | 'gpt-4o-realtime' - | 'gpt-4o-mini-realtime' - | 'gpt-4o-realtime-preview' - | 'gpt-4o-realtime-preview-2024-10-01' - | 'gpt-4o-realtime-preview-2024-12-17' - | 'gpt-4o-realtime-preview-2025-06-03' - | 'gpt-4o-mini-realtime-preview' - | 'gpt-4o-mini-realtime-preview-2024-12-17'; - /** * The type of session to create. Always `realtime` for the Realtime API. */ @@ -2503,16 +2866,11 @@ export interface RealtimeSessionCreateRequest { */ audio?: RealtimeAudioConfig; - /** - * Configuration options for the generated client secret. - */ - client_secret?: RealtimeClientSecretConfig; - /** * Additional fields to include in server outputs. * - * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - * transcription. + * `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. */ include?: Array<'item.input_audio_transcription.logprobs'>; @@ -2539,8 +2897,24 @@ export interface RealtimeSessionCreateRequest { max_output_tokens?: number | 'inf'; /** - * The set of modalities the model can respond with. To disable audio, set this to - * ["text"]. + * The Realtime model used for this session. 
+ */ + model?: + | (string & {}) + | 'gpt-realtime' + | 'gpt-realtime-2025-08-28' + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-realtime-preview-2025-06-03' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The set of modalities the model can respond with. It defaults to `["audio"]`, + * indicating that the model will respond with audio plus a transcript. `["text"]` + * can be used to make the model respond with text only. It is not possible to + * request both `text` and `audio` at the same time. */ output_modalities?: Array<'text' | 'audio'>; @@ -2550,12 +2924,6 @@ export interface RealtimeSessionCreateRequest { */ prompt?: ResponsesAPI.ResponsePrompt | null; - /** - * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a - * temperature of 0.8 is highly recommended for best performance. - */ - temperature?: number; - /** * How the model chooses tools. Provide one of the string modes or force a specific * function/MCP tool. @@ -2568,8 +2936,9 @@ export interface RealtimeSessionCreateRequest { tools?: RealtimeToolsConfig; /** - * Configuration options for tracing. Set to null to disable tracing. Once tracing - * is enabled for a session, the configuration cannot be modified. + * Realtime API can write session traces to the + * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once + * tracing is enabled for a session, the configuration cannot be modified. * * `auto` will create a trace for the session with default values for the workflow * name, group id, and metadata. @@ -2578,8 +2947,7 @@ export interface RealtimeSessionCreateRequest { /** * Controls how the realtime conversation is truncated prior to model inference. - * The default is `auto`. When set to `retention_ratio`, the server retains a - * fraction of the conversation tokens prior to the instructions. + * The default is `auto`. */ truncation?: RealtimeTruncation; } @@ -2603,32 +2971,9 @@ export type RealtimeToolsConfig = Array; * (MCP) servers. * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp). */ -export type RealtimeToolsConfigUnion = RealtimeToolsConfigUnion.Function | RealtimeToolsConfigUnion.Mcp; +export type RealtimeToolsConfigUnion = Models | RealtimeToolsConfigUnion.Mcp; export namespace RealtimeToolsConfigUnion { - export interface Function { - /** - * The description of the function, including guidance on when and how to call it, - * and guidance about what to tell the user when calling (if anything). - */ - description?: string; - - /** - * The name of the function. - */ - name?: string; - - /** - * Parameters of the function in JSON Schema. - */ - parameters?: unknown; - - /** - * The type of the tool, i.e. `function`. - */ - type?: 'function'; - } - /** * Give the model access to additional tools via remote Model Context Protocol * (MCP) servers. @@ -2783,8 +3128,9 @@ export namespace RealtimeToolsConfigUnion { } /** - * Configuration options for tracing. Set to null to disable tracing. Once tracing - * is enabled for a session, the configuration cannot be modified. + * Realtime API can write session traces to the + * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once + * tracing is enabled for a session, the configuration cannot be modified. * * `auto` will create a trace for the session with default values for the workflow * name, group id, and metadata. 
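For reviewers, a minimal sketch of how the reshaped session config above might be exercised, assuming the `ClientSecrets` resource is reached via `client.realtime.clientSecrets` and that the create params wrap a `RealtimeSessionCreateRequest` under a `session` key (neither detail is shown in this hunk). Illustration only, not part of the change:

import OpenAI from 'openai';

const client = new OpenAI(); // reads OPENAI_API_KEY from the environment

async function mintRealtimeClientSecret() {
  // Per this change, `model` is now optional and `temperature`/`client_secret`
  // are gone from the session config; audio output settings move under `audio.output`.
  const secret = await client.realtime.clientSecrets.create({
    session: {
      type: 'realtime',
      model: 'gpt-realtime',
      output_modalities: ['audio'],
      audio: {
        output: { voice: 'marin' }, // assumed RealtimeAudioConfigOutput shape
      },
      truncation: 'auto',
    },
  });

  // The response carries the ephemeral key; hand it to the client-side
  // connection rather than a standard API key.
  console.log(secret);
}

mintRealtimeClientSecret();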
@@ -2798,54 +3144,36 @@ export namespace RealtimeTracingConfig { export interface TracingConfiguration { /** * The group id to attach to this trace to enable filtering and grouping in the - * traces dashboard. + * Traces Dashboard. */ group_id?: string; /** - * The arbitrary metadata to attach to this trace to enable filtering in the traces - * dashboard. + * The arbitrary metadata to attach to this trace to enable filtering in the Traces + * Dashboard. */ metadata?: unknown; /** * The name of the workflow to attach to this trace. This is used to name the trace - * in the traces dashboard. + * in the Traces Dashboard. */ workflow_name?: string; } } /** - * Realtime transcription session object configuration. + * Configuration for input and output audio. */ -export interface RealtimeTranscriptionSessionCreateRequest { - /** - * ID of the model to use. The options are `gpt-4o-transcribe`, - * `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - * Whisper V2 model). - */ - model: (string & {}) | 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe'; - - /** - * The type of session to create. Always `transcription` for transcription - * sessions. - */ - type: 'transcription'; +export interface RealtimeTranscriptionSessionAudio { + input?: RealtimeTranscriptionSessionAudioInput; +} +export interface RealtimeTranscriptionSessionAudioInput { /** - * The set of items to include in the transcription. Current available items are: - * - * - `item.input_audio_transcription.logprobs` + * The PCM audio format. Only a 24kHz sample rate is supported. */ - include?: Array<'item.input_audio_transcription.logprobs'>; - - /** - * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For - * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel - * (mono), and little-endian byte order. - */ - input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + format?: RealtimeAudioFormats; /** * Configuration for input audio noise reduction. This can be set to `null` to turn @@ -2854,24 +3182,36 @@ export interface RealtimeTranscriptionSessionCreateRequest { * detection accuracy (reducing false positives) and model performance by improving * perception of the input audio. */ - input_audio_noise_reduction?: RealtimeTranscriptionSessionCreateRequest.InputAudioNoiseReduction; + noise_reduction?: RealtimeTranscriptionSessionAudioInput.NoiseReduction; /** - * Configuration for input audio transcription. The client can optionally set the - * language and prompt for transcription, these offer additional guidance to the - * transcription service. + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. */ - input_audio_transcription?: RealtimeTranscriptionSessionCreateRequest.InputAudioTranscription; + transcription?: AudioTranscription; /** - * Configuration for turn detection. Can be set to `null` to turn off. 
Server VAD - * means that the model will detect the start and end of speech based on audio - * volume and respond at the end of user speech. + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. */ - turn_detection?: RealtimeTranscriptionSessionCreateRequest.TurnDetection; + turn_detection?: RealtimeTranscriptionSessionAudioInputTurnDetection; } -export namespace RealtimeTranscriptionSessionCreateRequest { +export namespace RealtimeTranscriptionSessionAudioInput { /** * Configuration for input audio noise reduction. This can be set to `null` to turn * off. Noise reduction filters audio added to the input audio buffer before it is @@ -2879,105 +3219,127 @@ export namespace RealtimeTranscriptionSessionCreateRequest { * detection accuracy (reducing false positives) and model performance by improving * perception of the input audio. */ - export interface InputAudioNoiseReduction { + export interface NoiseReduction { /** * Type of noise reduction. `near_field` is for close-talking microphones such as * headphones, `far_field` is for far-field microphones such as laptop or * conference room microphones. */ - type?: 'near_field' | 'far_field'; + type?: RealtimeAPI.NoiseReductionType; } +} +/** + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ +export interface RealtimeTranscriptionSessionAudioInputTurnDetection { /** - * Configuration for input audio transcription. The client can optionally set the - * language and prompt for transcription, these offer additional guidance to the - * transcription service. + * Whether or not to automatically generate a response when a VAD stop event + * occurs. */ - export interface InputAudioTranscription { - /** - * The language of the input audio. Supplying the input language in - * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - * format will improve accuracy and latency. - */ - language?: string; + create_response?: boolean; - /** - * The model to use for transcription, current options are `gpt-4o-transcribe`, - * `gpt-4o-mini-transcribe`, and `whisper-1`. 
- */ - model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; - /** - * An optional text to guide the model's style or continue a previous audio - * segment. For `whisper-1`, the - * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - * For `gpt-4o-transcribe` models, the prompt is a free text string, for example - * "expect words related to technology". - */ - prompt?: string; - } + /** + * Optional idle timeout after which turn detection will auto-timeout when no + * additional audio is received. + */ + idle_timeout_ms?: number | null; /** - * Configuration for turn detection. Can be set to `null` to turn off. Server VAD - * means that the model will detect the start and end of speech based on audio - * volume and respond at the end of user speech. + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. */ - export interface TurnDetection { - /** - * Amount of audio to include before the VAD detected speech (in milliseconds). - * Defaults to 300ms. - */ - prefix_padding_ms?: number; + interrupt_response?: boolean; - /** - * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. - * With shorter values the model will respond more quickly, but may jump in on - * short pauses from the user. - */ - silence_duration_ms?: number; + /** + * Used only for `server_vad` mode. Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection. + */ + type?: 'server_vad' | 'semantic_vad'; +} + +/** + * Realtime transcription session object configuration. + */ +export interface RealtimeTranscriptionSessionCreateRequest { + /** + * The type of session to create. Always `transcription` for transcription + * sessions. + */ + type: 'transcription'; - /** - * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher - * threshold will require louder audio to activate the model, and thus might - * perform better in noisy environments. - */ - threshold?: number; + /** + * Configuration for input and output audio. + */ + audio?: RealtimeTranscriptionSessionAudio; - /** - * Type of turn detection. Only `server_vad` is currently supported for - * transcription sessions. - */ - type?: 'server_vad'; - } + /** + * Additional fields to include in server outputs. + * + * `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. 
+ */ + include?: Array<'item.input_audio_transcription.logprobs'>; } /** * Controls how the realtime conversation is truncated prior to model inference. - * The default is `auto`. When set to `retention_ratio`, the server retains a - * fraction of the conversation tokens prior to the instructions. + * The default is `auto`. */ -export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncation.RetentionRatioTruncation; +export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncationRetentionRatio; -export namespace RealtimeTruncation { +/** + * Retain a fraction of the conversation tokens when the conversation exceeds the + * input token limit. This allows you to amortize truncations across multiple + * turns, which can help improve cached token usage. + */ +export interface RealtimeTruncationRetentionRatio { /** - * Retain a fraction of the conversation tokens. + * Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the + * conversation exceeds the input token limit. */ - export interface RetentionRatioTruncation { - /** - * Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0). - */ - retention_ratio: number; - - /** - * Use retention ratio truncation. - */ - type: 'retention_ratio'; + retention_ratio: number; - /** - * Optional cap on tokens allowed after the instructions. - */ - post_instructions_token_limit?: number | null; - } + /** + * Use retention ratio truncation. + */ + type: 'retention_ratio'; } /** @@ -3141,7 +3503,9 @@ export interface ResponseAudioTranscriptDoneEvent { /** * Send this event to cancel an in-progress response. The server will respond with * a `response.done` event with a status of `response.status=cancelled`. If there - * is no response to cancel, the server will respond with an error. + * is no response to cancel, the server will respond with an error. It's safe to + * call `response.cancel` even if no response is in progress, an error will be + * returned the session will remain unaffected. */ export interface ResponseCancelEvent { /** @@ -3304,15 +3668,26 @@ export namespace ResponseContentPartDoneEvent { * * A Response will include at least one Item, and may have two, in which case the * second will be a function call. These Items will be appended to the conversation - * history. + * history by default. * * The server will respond with a `response.created` event, events for Items and * content created, and finally a `response.done` event to indicate the Response is * complete. * - * The `response.create` event includes inference configuration like - * `instructions`, and `temperature`. These fields will override the Session's - * configuration for this Response only. + * The `response.create` event includes inference configuration like `instructions` + * and `tools`. If these are set, they will override the Session's configuration + * for this Response only. + * + * Responses can be created out-of-band of the default Conversation, meaning that + * they can have arbitrary input, and it's possible to disable writing the output + * to the Conversation. Only one Response can write to the default Conversation at + * a time, but otherwise multiple Responses can be created in parallel. The + * `metadata` field is a good way to disambiguate multiple simultaneous Responses. + * + * Clients can set `conversation` to `none` to create a Response that does not + * write to the default Conversation. 
Arbitrary input can be provided with the + * `input` field, which is an array accepting raw Items and references to existing + * Items. */ export interface ResponseCreateEvent { /** @@ -3328,142 +3703,7 @@ export interface ResponseCreateEvent { /** * Create a new Realtime response with these parameters */ - response?: ResponseCreateEvent.Response; -} - -export namespace ResponseCreateEvent { - /** - * Create a new Realtime response with these parameters - */ - export interface Response { - /** - * Controls which conversation the response is added to. Currently supports `auto` - * and `none`, with `auto` as the default value. The `auto` value means that the - * contents of the response will be added to the default conversation. Set this to - * `none` to create an out-of-band response which will not add items to default - * conversation. - */ - conversation?: (string & {}) | 'auto' | 'none'; - - /** - * Input items to include in the prompt for the model. Using this field creates a - * new context for this Response instead of using the default conversation. An - * empty array `[]` will clear the context for this Response. Note that this can - * include references to items from the default conversation. - */ - input?: Array; - - /** - * The default system instructions (i.e. system message) prepended to model calls. - * This field allows the client to guide the model on desired responses. The model - * can be instructed on response content and format, (e.g. "be extremely succinct", - * "act friendly", "here are examples of good responses") and on audio behavior - * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The - * instructions are not guaranteed to be followed by the model, but they provide - * guidance to the model on the desired behavior. - * - * Note that the server sets default instructions which will be used if this field - * is not set and are visible in the `session.created` event at the start of the - * session. - */ - instructions?: string; - - /** - * Maximum number of output tokens for a single assistant response, inclusive of - * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or - * `inf` for the maximum available tokens for a given model. Defaults to `inf`. - */ - max_output_tokens?: number | 'inf'; - - /** - * Set of 16 key-value pairs that can be attached to an object. This can be useful - * for storing additional information about the object in a structured format, and - * querying for objects via API or the dashboard. - * - * Keys are strings with a maximum length of 64 characters. Values are strings with - * a maximum length of 512 characters. - */ - metadata?: Shared.Metadata | null; - - /** - * The set of modalities the model can respond with. To disable audio, set this to - * ["text"]. - */ - modalities?: Array<'text' | 'audio'>; - - /** - * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - */ - output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; - - /** - * Reference to a prompt template and its variables. - * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). - */ - prompt?: ResponsesAPI.ResponsePrompt | null; - - /** - * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. - */ - temperature?: number; - - /** - * How the model chooses tools. Provide one of the string modes or force a specific - * function/MCP tool. 
- */ - tool_choice?: - | ResponsesAPI.ToolChoiceOptions - | ResponsesAPI.ToolChoiceFunction - | ResponsesAPI.ToolChoiceMcp; - - /** - * Tools (functions) available to the model. - */ - tools?: Array; - - /** - * The voice the model uses to respond. Voice cannot be changed during the session - * once the model has responded with audio at least once. Current voice options are - * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`. - */ - voice?: - | (string & {}) - | 'alloy' - | 'ash' - | 'ballad' - | 'coral' - | 'echo' - | 'sage' - | 'shimmer' - | 'verse' - | 'marin' - | 'cedar'; - } - - export namespace Response { - export interface Tool { - /** - * The description of the function, including guidance on when and how to call it, - * and guidance about what to tell the user when calling (if anything). - */ - description?: string; - - /** - * The name of the function. - */ - name?: string; - - /** - * Parameters of the function in JSON Schema. - */ - parameters?: unknown; - - /** - * The type of the tool, i.e. `function`. - */ - type?: 'function'; - } - } + response?: RealtimeResponseCreateParams; } /** @@ -3491,6 +3731,13 @@ export interface ResponseCreatedEvent { * Returned when a Response is done streaming. Always emitted, no matter the final * state. The Response object included in the `response.done` event will include * all output Items in the Response but will omit the raw audio data. + * + * Clients should check the `status` field of the Response to determine if it was + * successful (`completed`) or if there was another outcome: `cancelled`, `failed`, + * or `incomplete`. + * + * A response will contain all output items that were generated during the + * response, excluding any audio content. */ export interface ResponseDoneEvent { /** @@ -3894,9 +4141,9 @@ export interface SessionCreatedEvent { event_id: string; /** - * Realtime session object. + * The session configuration. */ - session: RealtimeSession; + session: RealtimeSessionCreateRequest | RealtimeTranscriptionSessionCreateRequest; /** * The event type, must be `session.created`. @@ -3905,21 +4152,22 @@ export interface SessionCreatedEvent { } /** - * Send this event to update the session’s default configuration. The client may - * send this event at any time to update any field, except for `voice`. However, - * note that once a session has been initialized with a particular `model`, it - * can’t be changed to another model using `session.update`. + * Send this event to update the session’s configuration. The client may send this + * event at any time to update any field except for `voice` and `model`. `voice` + * can be updated only if there have been no other audio outputs yet. * * When the server receives a `session.update`, it will respond with a * `session.updated` event showing the full, effective configuration. Only the - * fields that are present are updated. To clear a field like `instructions`, pass - * an empty string. + * fields that are present in the `session.update` are updated. To clear a field + * like `instructions`, pass an empty string. To clear a field like `tools`, pass + * an empty array. To clear a field like `turn_detection`, pass `null`. */ export interface SessionUpdateEvent { /** - * Realtime session object configuration. + * Update the Realtime session. Choose either a realtime session or a transcription + * session. 
*/ - session: RealtimeSessionCreateRequest; + session: RealtimeSessionCreateRequest | RealtimeTranscriptionSessionCreateRequest; /** * The event type, must be `session.update`. @@ -3927,7 +4175,10 @@ export interface SessionUpdateEvent { type: 'session.update'; /** - * Optional client-generated ID used to identify this event. + * Optional client-generated ID used to identify this event. This is an arbitrary + * string that a client may assign. It will be passed back if there is an error + * with the event, but the corresponding `session.updated` event will not include + * it. */ event_id?: string; } @@ -3943,9 +4194,9 @@ export interface SessionUpdatedEvent { event_id: string; /** - * Realtime session object. + * The session configuration. */ - session: RealtimeSession; + session: RealtimeSessionCreateRequest | RealtimeTranscriptionSessionCreateRequest; /** * The event type, must be `session.updated`. @@ -3963,9 +4214,13 @@ export interface TranscriptionSessionCreated { event_id: string; /** - * A Realtime transcription session configuration object. + * A new Realtime transcription session configuration. + * + * When a session is created on the server via REST API, the session object also + * contains an ephemeral key. Default TTL for keys is 10 minutes. This property is + * not present when a session is updated via the WebSocket API. */ - session: TranscriptionSessionCreated.Session; + session: ClientSecretsAPI.RealtimeTranscriptionSessionCreateResponse; /** * The event type, must be `transcription_session.created`. @@ -3973,125 +4228,6 @@ export interface TranscriptionSessionCreated { type: 'transcription_session.created'; } -export namespace TranscriptionSessionCreated { - /** - * A Realtime transcription session configuration object. - */ - export interface Session { - /** - * Unique identifier for the session that looks like `sess_1234567890abcdef`. - */ - id?: string; - - /** - * Configuration for input audio for the session. - */ - audio?: Session.Audio; - - /** - * Expiration timestamp for the session, in seconds since epoch. - */ - expires_at?: number; - - /** - * Additional fields to include in server outputs. - * - * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - * transcription. - */ - include?: Array<'item.input_audio_transcription.logprobs'>; - - /** - * The object type. Always `realtime.transcription_session`. - */ - object?: string; - } - - export namespace Session { - /** - * Configuration for input audio for the session. - */ - export interface Audio { - input?: Audio.Input; - } - - export namespace Audio { - export interface Input { - /** - * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - */ - format?: string; - - /** - * Configuration for input audio noise reduction. - */ - noise_reduction?: Input.NoiseReduction; - - /** - * Configuration of the transcription model. - */ - transcription?: Input.Transcription; - - /** - * Configuration for turn detection. - */ - turn_detection?: Input.TurnDetection; - } - - export namespace Input { - /** - * Configuration for input audio noise reduction. - */ - export interface NoiseReduction { - type?: 'near_field' | 'far_field'; - } - - /** - * Configuration of the transcription model. - */ - export interface Transcription { - /** - * The language of the input audio. Supplying the input language in - * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - * format will improve accuracy and latency. 
- */ - language?: string; - - /** - * The model to use for transcription. Can be `gpt-4o-transcribe`, - * `gpt-4o-mini-transcribe`, or `whisper-1`. - */ - model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; - - /** - * An optional text to guide the model's style or continue a previous audio - * segment. The - * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - * should match the audio language. - */ - prompt?: string; - } - - /** - * Configuration for turn detection. - */ - export interface TurnDetection { - prefix_padding_ms?: number; - - silence_duration_ms?: number; - - threshold?: number; - - /** - * Type of turn detection, only `server_vad` is currently supported. - */ - type?: string; - } - } - } - } -} - /** * Send this event to update a transcription session. */ @@ -4099,7 +4235,7 @@ export interface TranscriptionSessionUpdate { /** * Realtime transcription session object configuration. */ - session: RealtimeTranscriptionSessionCreateRequest; + session: TranscriptionSessionUpdate.Session; /** * The event type, must be `transcription_session.update`. @@ -4112,150 +4248,130 @@ export interface TranscriptionSessionUpdate { event_id?: string; } -/** - * Returned when a transcription session is updated with a - * `transcription_session.update` event, unless there is an error. - */ -export interface TranscriptionSessionUpdatedEvent { - /** - * The unique ID of the server event. - */ - event_id: string; - - /** - * A Realtime transcription session configuration object. - */ - session: TranscriptionSessionUpdatedEvent.Session; - - /** - * The event type, must be `transcription_session.updated`. - */ - type: 'transcription_session.updated'; -} - -export namespace TranscriptionSessionUpdatedEvent { +export namespace TranscriptionSessionUpdate { /** - * A Realtime transcription session configuration object. + * Realtime transcription session object configuration. */ export interface Session { /** - * Unique identifier for the session that looks like `sess_1234567890abcdef`. + * The set of items to include in the transcription. Current available items are: + * `item.input_audio_transcription.logprobs` */ - id?: string; + include?: Array<'item.input_audio_transcription.logprobs'>; /** - * Configuration for input audio for the session. + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For + * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel + * (mono), and little-endian byte order. */ - audio?: Session.Audio; + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; /** - * Expiration timestamp for the session, in seconds since epoch. + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. */ - expires_at?: number; + input_audio_noise_reduction?: Session.InputAudioNoiseReduction; /** - * Additional fields to include in server outputs. - * - * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - * transcription. + * Configuration for input audio transcription. The client can optionally set the + * language and prompt for transcription, these offer additional guidance to the + * transcription service. 
*/ - include?: Array<'item.input_audio_transcription.logprobs'>; + input_audio_transcription?: RealtimeAPI.AudioTranscription; /** - * The object type. Always `realtime.transcription_session`. + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. */ - object?: string; + turn_detection?: Session.TurnDetection; } export namespace Session { /** - * Configuration for input audio for the session. + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. */ - export interface Audio { - input?: Audio.Input; + export interface InputAudioNoiseReduction { + /** + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. + */ + type?: RealtimeAPI.NoiseReductionType; } - export namespace Audio { - export interface Input { - /** - * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - */ - format?: string; - - /** - * Configuration for input audio noise reduction. - */ - noise_reduction?: Input.NoiseReduction; - - /** - * Configuration of the transcription model. - */ - transcription?: Input.Transcription; - - /** - * Configuration for turn detection. - */ - turn_detection?: Input.TurnDetection; - } + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; - export namespace Input { - /** - * Configuration for input audio noise reduction. - */ - export interface NoiseReduction { - type?: 'near_field' | 'far_field'; - } + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; - /** - * Configuration of the transcription model. - */ - export interface Transcription { - /** - * The language of the input audio. Supplying the input language in - * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - * format will improve accuracy and latency. - */ - language?: string; - - /** - * The model to use for transcription. Can be `gpt-4o-transcribe`, - * `gpt-4o-mini-transcribe`, or `whisper-1`. - */ - model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; - - /** - * An optional text to guide the model's style or continue a previous audio - * segment. The - * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - * should match the audio language. - */ - prompt?: string; - } + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. 
+ */ + threshold?: number; - /** - * Configuration for turn detection. - */ - export interface TurnDetection { - prefix_padding_ms?: number; + /** + * Type of turn detection. Only `server_vad` is currently supported for + * transcription sessions. + */ + type?: 'server_vad'; + } + } +} - silence_duration_ms?: number; +/** + * Returned when a transcription session is updated with a + * `transcription_session.update` event, unless there is an error. + */ +export interface TranscriptionSessionUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; - threshold?: number; + /** + * A new Realtime transcription session configuration. + * + * When a session is created on the server via REST API, the session object also + * contains an ephemeral key. Default TTL for keys is 10 minutes. This property is + * not present when a session is updated via the WebSocket API. + */ + session: ClientSecretsAPI.RealtimeTranscriptionSessionCreateResponse; - /** - * Type of turn detection, only `server_vad` is currently supported. - */ - type?: string; - } - } - } - } + /** + * The event type, must be `transcription_session.updated`. + */ + type: 'transcription_session.updated'; } Realtime.ClientSecrets = ClientSecrets; export declare namespace Realtime { export { + type AudioTranscription as AudioTranscription, type ConversationCreatedEvent as ConversationCreatedEvent, type ConversationItem as ConversationItem, type ConversationItemAdded as ConversationItemAdded, @@ -4284,11 +4400,16 @@ export declare namespace Realtime { type McpListToolsCompleted as McpListToolsCompleted, type McpListToolsFailed as McpListToolsFailed, type McpListToolsInProgress as McpListToolsInProgress, + type Models as Models, + type NoiseReductionType as NoiseReductionType, type OutputAudioBufferClearEvent as OutputAudioBufferClearEvent, type RateLimitsUpdatedEvent as RateLimitsUpdatedEvent, type RealtimeAudioConfig as RealtimeAudioConfig, + type RealtimeAudioConfigInput as RealtimeAudioConfigInput, + type RealtimeAudioConfigOutput as RealtimeAudioConfigOutput, + type RealtimeAudioFormats as RealtimeAudioFormats, + type RealtimeAudioInputTurnDetection as RealtimeAudioInputTurnDetection, type RealtimeClientEvent as RealtimeClientEvent, - type RealtimeClientSecretConfig as RealtimeClientSecretConfig, type RealtimeConversationItemAssistantMessage as RealtimeConversationItemAssistantMessage, type RealtimeConversationItemFunctionCall as RealtimeConversationItemFunctionCall, type RealtimeConversationItemFunctionCallOutput as RealtimeConversationItemFunctionCallOutput, @@ -4304,6 +4425,9 @@ export declare namespace Realtime { type RealtimeMcpToolExecutionError as RealtimeMcpToolExecutionError, type RealtimeMcphttpError as RealtimeMcphttpError, type RealtimeResponse as RealtimeResponse, + type RealtimeResponseCreateAudioOutput as RealtimeResponseCreateAudioOutput, + type RealtimeResponseCreateMcpTool as RealtimeResponseCreateMcpTool, + type RealtimeResponseCreateParams as RealtimeResponseCreateParams, type RealtimeResponseStatus as RealtimeResponseStatus, type RealtimeResponseUsage as RealtimeResponseUsage, type RealtimeResponseUsageInputTokenDetails as RealtimeResponseUsageInputTokenDetails, @@ -4315,8 +4439,12 @@ export declare namespace Realtime { type RealtimeToolsConfig as RealtimeToolsConfig, type RealtimeToolsConfigUnion as RealtimeToolsConfigUnion, type RealtimeTracingConfig as RealtimeTracingConfig, + type RealtimeTranscriptionSessionAudio as RealtimeTranscriptionSessionAudio, + type 
RealtimeTranscriptionSessionAudioInput as RealtimeTranscriptionSessionAudioInput, + type RealtimeTranscriptionSessionAudioInputTurnDetection as RealtimeTranscriptionSessionAudioInputTurnDetection, type RealtimeTranscriptionSessionCreateRequest as RealtimeTranscriptionSessionCreateRequest, type RealtimeTruncation as RealtimeTruncation, + type RealtimeTruncationRetentionRatio as RealtimeTruncationRetentionRatio, type ResponseAudioDeltaEvent as ResponseAudioDeltaEvent, type ResponseAudioDoneEvent as ResponseAudioDoneEvent, type ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent, @@ -4348,7 +4476,12 @@ export declare namespace Realtime { export { ClientSecrets as ClientSecrets, + type RealtimeSessionClientSecret as RealtimeSessionClientSecret, type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse, + type RealtimeTranscriptionSessionClientSecret as RealtimeTranscriptionSessionClientSecret, + type RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse, + type RealtimeTranscriptionSessionInputAudioTranscription as RealtimeTranscriptionSessionInputAudioTranscription, + type RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection, type ClientSecretCreateResponse as ClientSecretCreateResponse, type ClientSecretCreateParams as ClientSecretCreateParams, }; diff --git a/src/version.ts b/src/version.ts index 51eae91f3..36168d9b4 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1 +1 @@ -export const VERSION = '5.19.1'; // x-release-please-version +export const VERSION = '5.20.0'; // x-release-please-version
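For reviewers, a minimal usage sketch of the reshaped transcription-session types introduced above. This is not part of the diff: the import path, the `ws` variable, and the raw-WebSocket transport are assumptions; the field names and literal values come from the `TranscriptionSessionUpdate.Session` and `TurnDetection` interfaces in this change.

// Hypothetical sketch (not part of this commit): building a
// `transcription_session.update` client event with the flattened Session shape.
// The module path below is an assumption about the package's subpath exports.
import type { TranscriptionSessionUpdate } from 'openai/resources/realtime/realtime';

const update: TranscriptionSessionUpdate = {
  type: 'transcription_session.update',
  session: {
    include: ['item.input_audio_transcription.logprobs'],
    input_audio_format: 'pcm16', // 16-bit PCM, 24kHz sample rate, mono, little-endian
    input_audio_noise_reduction: { type: 'near_field' }, // close-talking microphone profile
    turn_detection: {
      type: 'server_vad', // only server VAD is supported for transcription sessions
      threshold: 0.5, // documented default activation threshold
      prefix_padding_ms: 300, // audio retained before detected speech (documented default)
      silence_duration_ms: 500, // silence required to end the turn (documented default)
    },
  },
};

// Assuming an already-open Realtime WebSocket connection `ws`:
// ws.send(JSON.stringify(update));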
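Similarly, a sketch of consuming the server-side `transcription_session.updated` event, whose `session` payload now reuses `RealtimeTranscriptionSessionCreateResponse` from the client-secrets module. The parsing and transport plumbing here is assumed, not part of the diff.

// Hypothetical handler sketch: narrow on the event type and read the updated session.
import type { TranscriptionSessionUpdatedEvent } from 'openai/resources/realtime/realtime'; // path is an assumption

function handleServerMessage(raw: string): void {
  const parsed = JSON.parse(raw) as { type?: string };
  if (parsed.type === 'transcription_session.updated') {
    const event = parsed as TranscriptionSessionUpdatedEvent;
    // Per the doc comment above, an ephemeral client secret is only present when the
    // session was created via REST, not when it was updated over the WebSocket API.
    console.log('transcription session updated', event.event_id, event.session);
  }
}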