diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 48f0d34ba..dec479608 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "5.19.1"
+ ".": "5.20.0"
}
diff --git a/.stats.yml b/.stats.yml
index c41be6ee5..36a3c7f58 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-51afd6abbcb18c3086f62993f9379c18443b9e516cbc0548ddfb932e835657f8.yml
-openapi_spec_hash: dae6afeaefa15cb8700c7a870531e06f
-config_hash: b854932c0ea24b400bdd64e4376936bd
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-7807ec6037efcee1af7decbfd3974a42b761fb6c6a71b4050fe43484d7fcbac4.yml
+openapi_spec_hash: da6851e3891ad2659a50ed6a736fd32a
+config_hash: 74d955cdc2377213f5268ea309090f6c
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9b8e7d29..02919cc52 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog
+## 5.20.0 (2025-09-08)
+
+Full Changelog: [v5.19.1...v5.20.0](https://github.com/openai/openai-node/compare/v5.19.1...v5.20.0)
+
+### Features
+
+* **api:** ship the RealtimeGA API shape ([4286ddd](https://github.com/openai/openai-node/commit/4286ddd4f990dd26e15e510039457f17d787820d))
+
+
+### Chores
+
+* ci build action ([c8ce143](https://github.com/openai/openai-node/commit/c8ce143196fdbc7ee1c7832bce2417b6e3d25885))
+
## 5.19.1 (2025-09-03)
Full Changelog: [v5.19.0...v5.19.1](https://github.com/openai/openai-node/compare/v5.19.0...v5.19.1)
diff --git a/api.md b/api.md
index e8a4c861d..0e1134d94 100644
--- a/api.md
+++ b/api.md
@@ -776,6 +776,7 @@ Methods:
Types:
+- AudioTranscription
- ConversationCreatedEvent
- ConversationItem
- ConversationItemAdded
@@ -804,11 +805,16 @@ Types:
- McpListToolsCompleted
- McpListToolsFailed
- McpListToolsInProgress
+- Models
+- NoiseReductionType
- OutputAudioBufferClearEvent
- RateLimitsUpdatedEvent
- RealtimeAudioConfig
+- RealtimeAudioConfigInput
+- RealtimeAudioConfigOutput
+- RealtimeAudioFormats
+- RealtimeAudioInputTurnDetection
- RealtimeClientEvent
-- RealtimeClientSecretConfig
- RealtimeConversationItemAssistantMessage
- RealtimeConversationItemFunctionCall
- RealtimeConversationItemFunctionCallOutput
@@ -824,6 +830,9 @@ Types:
- RealtimeMcpToolExecutionError
- RealtimeMcphttpError
- RealtimeResponse
+- RealtimeResponseCreateAudioOutput
+- RealtimeResponseCreateMcpTool
+- RealtimeResponseCreateParams
- RealtimeResponseStatus
- RealtimeResponseUsage
- RealtimeResponseUsageInputTokenDetails
@@ -835,8 +844,12 @@ Types:
- RealtimeToolsConfig
- RealtimeToolsConfigUnion
- RealtimeTracingConfig
+- RealtimeTranscriptionSessionAudio
+- RealtimeTranscriptionSessionAudioInput
+- RealtimeTranscriptionSessionAudioInputTurnDetection
- RealtimeTranscriptionSessionCreateRequest
- RealtimeTruncation
+- RealtimeTruncationRetentionRatio
- ResponseAudioDeltaEvent
- ResponseAudioDoneEvent
- ResponseAudioTranscriptDeltaEvent
@@ -869,7 +882,12 @@ Types:
Types:
+- RealtimeSessionClientSecret
- RealtimeSessionCreateResponse
+- RealtimeTranscriptionSessionClientSecret
+- RealtimeTranscriptionSessionCreateResponse
+- RealtimeTranscriptionSessionInputAudioTranscription
+- RealtimeTranscriptionSessionTurnDetection
- ClientSecretCreateResponse
Methods:
diff --git a/jsr.json b/jsr.json
index d6d3f55a9..43571736b 100644
--- a/jsr.json
+++ b/jsr.json
@@ -1,6 +1,6 @@
{
"name": "@openai/openai",
- "version": "5.19.1",
+ "version": "5.20.0",
"exports": {
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
diff --git a/package.json b/package.json
index a8fd383fc..340fba521 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "openai",
- "version": "5.19.1",
+ "version": "5.20.0",
"description": "The official TypeScript library for the OpenAI API",
"author": "OpenAI ",
"types": "dist/index.d.ts",
diff --git a/scripts/utils/upload-artifact.sh b/scripts/utils/upload-artifact.sh
index a157309c4..1ef9d0dfd 100755
--- a/scripts/utils/upload-artifact.sh
+++ b/scripts/utils/upload-artifact.sh
@@ -12,7 +12,7 @@ if [[ "$SIGNED_URL" == "null" ]]; then
exit 1
fi
-UPLOAD_RESPONSE=$(tar -cz "${BUILD_PATH:-dist}" | curl -v -X PUT \
+UPLOAD_RESPONSE=$(tar "${BASE_PATH:+-C$BASE_PATH}" -cz "${ARTIFACT_PATH:-dist}" | curl -v -X PUT \
-H "Content-Type: application/gzip" \
--data-binary @- "$SIGNED_URL" 2>&1)
diff --git a/src/resources/realtime/client-secrets.ts b/src/resources/realtime/client-secrets.ts
index c48fe8243..6539260ac 100644
--- a/src/resources/realtime/client-secrets.ts
+++ b/src/resources/realtime/client-secrets.ts
@@ -2,13 +2,13 @@
import { APIResource } from '../../core/resource';
import * as RealtimeAPI from './realtime';
+import * as ResponsesAPI from '../responses/responses';
import { APIPromise } from '../../core/api-promise';
import { RequestOptions } from '../../internal/request-options';
export class ClientSecrets extends APIResource {
/**
- * Create a Realtime session and client secret for either realtime or
- * transcription.
+ * Create a Realtime client secret with an associated session configuration.
*/
create(body: ClientSecretCreateParams, options?: RequestOptions): APIPromise<ClientSecretCreateResponse> {
return this._client.post('/realtime/client_secrets', { body, ...options });
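A minimal usage sketch of the method above, assuming a configured `OpenAI` client (the request body shape is documented further down in this file):

```ts
import OpenAI from 'openai';

const client = new OpenAI();

// Mint an ephemeral client secret; every field of the body is optional.
const secret = await client.realtime.clientSecrets.create({});

// `value` is the ephemeral key to hand to a browser or mobile client,
// `expires_at` is its expiry in seconds since epoch, and `session` echoes
// the session configuration the secret is bound to.
console.log(secret.value, secret.expires_at);
```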
@@ -16,29 +16,43 @@ export class ClientSecrets extends APIResource {
}
/**
- * A Realtime session configuration object.
+ * Ephemeral key returned by the API.
*/
-export interface RealtimeSessionCreateResponse {
+export interface RealtimeSessionClientSecret {
/**
- * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+ * Timestamp for when the token expires. Currently, all tokens expire after one
+ * minute.
*/
- id?: string;
+ expires_at: number;
/**
- * Configuration for input and output audio for the session.
+ * Ephemeral key usable in client environments to authenticate connections to the
+ * Realtime API. Use this in client-side environments rather than a standard API
+ * token, which should only be used server-side.
+ */
+ value: string;
+}
+
+/**
+ * A new Realtime session configuration, with an ephemeral key. Default TTL for
+ * keys is one minute.
+ */
+export interface RealtimeSessionCreateResponse {
+ /**
+ * Configuration for input and output audio.
*/
audio?: RealtimeSessionCreateResponse.Audio;
/**
- * Expiration timestamp for the session, in seconds since epoch.
+ * Ephemeral key returned by the API.
*/
- expires_at?: number;
+ client_secret?: RealtimeSessionClientSecret;
/**
* Additional fields to include in server outputs.
*
- * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- * transcription.
+ * `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
*/
include?: Array<'item.input_audio_transcription.logprobs'>;
@@ -67,50 +81,67 @@ export interface RealtimeSessionCreateResponse {
/**
* The Realtime model used for this session.
*/
- model?: string;
+ model?:
+ | (string & {})
+ | 'gpt-realtime'
+ | 'gpt-realtime-2025-08-28'
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-realtime-preview-2025-06-03'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
/**
- * The object type. Always `realtime.session`.
+ * The set of modalities the model can respond with. It defaults to `["audio"]`,
+ * indicating that the model will respond with audio plus a transcript. `["text"]`
+ * can be used to make the model respond with text only. It is not possible to
+ * request both `text` and `audio` at the same time.
*/
- object?: string;
+ output_modalities?: Array<'text' | 'audio'>;
/**
- * The set of modalities the model can respond with. To disable audio, set this to
- * ["text"].
+ * Reference to a prompt template and its variables.
+ * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
*/
- output_modalities?: Array<'text' | 'audio'>;
+ prompt?: ResponsesAPI.ResponsePrompt | null;
/**
- * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
- * a function.
+ * How the model chooses tools. Provide one of the string modes or force a specific
+ * function/MCP tool.
*/
- tool_choice?: string;
+ tool_choice?: ResponsesAPI.ToolChoiceOptions | ResponsesAPI.ToolChoiceFunction | ResponsesAPI.ToolChoiceMcp;
/**
- * Tools (functions) available to the model.
+ * Tools available to the model.
*/
- tools?: Array;
+ tools?: Array;
/**
- * Configuration options for tracing. Set to null to disable tracing. Once tracing
- * is enabled for a session, the configuration cannot be modified.
+ * The Realtime API can write session traces to the
+ * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
+ * tracing is enabled for a session, the configuration cannot be modified.
*
* `auto` will create a trace for the session with default values for the workflow
* name, group id, and metadata.
*/
- tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration;
+ tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration | null;
/**
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- * means that the model will detect the start and end of speech based on audio
- * volume and respond at the end of user speech.
+ * Controls how the realtime conversation is truncated prior to model inference.
+ * The default is `auto`.
*/
- turn_detection?: RealtimeSessionCreateResponse.TurnDetection;
+ truncation?: RealtimeAPI.RealtimeTruncation;
+
+ /**
+ * The type of session to create. Always `realtime` for the Realtime API.
+ */
+ type?: 'realtime';
}
export namespace RealtimeSessionCreateResponse {
/**
- * Configuration for input and output audio for the session.
+ * Configuration for input and output audio.
*/
export interface Audio {
input?: Audio.Input;
@@ -121,79 +152,153 @@ export namespace RealtimeSessionCreateResponse {
export namespace Audio {
export interface Input {
/**
- * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ * The format of the input audio.
*/
- format?: string;
+ format?: RealtimeAPI.RealtimeAudioFormats;
/**
- * Configuration for input audio noise reduction.
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
*/
noise_reduction?: Input.NoiseReduction;
/**
- * Configuration for input audio transcription.
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription; these offer additional guidance to the transcription service.
*/
- transcription?: Input.Transcription;
+ transcription?: RealtimeAPI.AudioTranscription;
/**
- * Configuration for turn detection.
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
*/
turn_detection?: Input.TurnDetection;
}
export namespace Input {
/**
- * Configuration for input audio noise reduction.
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
*/
export interface NoiseReduction {
- type?: 'near_field' | 'far_field';
+ /**
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
+ * headphones, `far_field` is for far-field microphones such as laptop or
+ * conference room microphones.
+ */
+ type?: RealtimeAPI.NoiseReductionType;
}
/**
- * Configuration for input audio transcription.
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
*/
- export interface Transcription {
+ export interface TurnDetection {
/**
- * The language of the input audio.
+ * Whether or not to automatically generate a response when a VAD stop event
+ * occurs.
*/
- language?: string;
+ create_response?: boolean;
/**
- * The model to use for transcription.
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ * will wait longer for the user to continue speaking, `high` will respond more
+ * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ * and `high` have max timeouts of 8s, 4s, and 2s respectively.
*/
- model?: string;
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
/**
- * Optional text to guide the model's style or continue a previous audio segment.
+ * Optional idle timeout after which turn detection will auto-timeout when no
+ * additional audio is received.
*/
- prompt?: string;
- }
+ idle_timeout_ms?: number | null;
- /**
- * Configuration for turn detection.
- */
- export interface TurnDetection {
+ /**
+ * Whether or not to automatically interrupt any ongoing response with output to
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ * occurs.
+ */
+ interrupt_response?: boolean;
+
+ /**
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
+ * detected speech (in milliseconds). Defaults to 300ms.
+ */
prefix_padding_ms?: number;
+ /**
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
+ * more quickly, but may jump in on short pauses from the user.
+ */
silence_duration_ms?: number;
+ /**
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0); this
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
+ * model, and thus might perform better in noisy environments.
+ */
threshold?: number;
/**
- * Type of turn detection, only `server_vad` is currently supported.
+ * Type of turn detection.
*/
- type?: string;
+ type?: 'server_vad' | 'semantic_vad';
}
}
export interface Output {
/**
- * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ * The format of the output audio.
*/
- format?: string;
+ format?: RealtimeAPI.RealtimeAudioFormats;
+ /**
+ * The speed of the model's spoken response as a multiple of the original speed.
+ * 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+ * This value can only be changed in between model turns, not while a response is
+ * in progress.
+ *
+ * This parameter is a post-processing adjustment to the audio after it is
+ * generated; it's also possible to prompt the model to speak faster or slower.
+ */
speed?: number;
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
+ * and `cedar`. We recommend `marin` and `cedar` for best quality.
+ */
voice?:
| (string & {})
| 'alloy'
@@ -209,229 +314,324 @@ export namespace RealtimeSessionCreateResponse {
}
}
- export interface Tool {
+ /**
+ * Give the model access to additional tools via remote Model Context Protocol
+ * (MCP) servers.
+ * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
+ */
+ export interface McpTool {
/**
- * The description of the function, including guidance on when and how to call it,
- * and guidance about what to tell the user when calling (if anything).
+ * A label for this MCP server, used to identify it in tool calls.
*/
- description?: string;
+ server_label: string;
/**
- * The name of the function.
+ * The type of the MCP tool. Always `mcp`.
*/
- name?: string;
+ type: 'mcp';
/**
- * Parameters of the function in JSON Schema.
+ * List of allowed tool names or a filter object.
*/
- parameters?: unknown;
+ allowed_tools?: Array<string> | McpTool.McpToolFilter | null;
/**
- * The type of the tool, i.e. `function`.
+ * An OAuth access token that can be used with a remote MCP server, either with a
+ * custom MCP server URL or a service connector. Your application must handle the
+ * OAuth authorization flow and provide the token here.
*/
- type?: 'function';
- }
+ authorization?: string;
- /**
- * Granular configuration for tracing.
- */
- export interface TracingConfiguration {
/**
- * The group id to attach to this trace to enable filtering and grouping in the
- * traces dashboard.
+ * Identifier for service connectors, like those available in ChatGPT. One of
+ * `server_url` or `connector_id` must be provided. Learn more about service
+ * connectors
+ * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
+ *
+ * Currently supported `connector_id` values are:
+ *
+ * - Dropbox: `connector_dropbox`
+ * - Gmail: `connector_gmail`
+ * - Google Calendar: `connector_googlecalendar`
+ * - Google Drive: `connector_googledrive`
+ * - Microsoft Teams: `connector_microsoftteams`
+ * - Outlook Calendar: `connector_outlookcalendar`
+ * - Outlook Email: `connector_outlookemail`
+ * - SharePoint: `connector_sharepoint`
*/
- group_id?: string;
+ connector_id?:
+ | 'connector_dropbox'
+ | 'connector_gmail'
+ | 'connector_googlecalendar'
+ | 'connector_googledrive'
+ | 'connector_microsoftteams'
+ | 'connector_outlookcalendar'
+ | 'connector_outlookemail'
+ | 'connector_sharepoint';
/**
- * The arbitrary metadata to attach to this trace to enable filtering in the traces
- * dashboard.
+ * Optional HTTP headers to send to the MCP server. Use for authentication or other
+ * purposes.
*/
- metadata?: unknown;
+ headers?: { [key: string]: string } | null;
/**
- * The name of the workflow to attach to this trace. This is used to name the trace
- * in the traces dashboard.
+ * Specify which of the MCP server's tools require approval.
*/
- workflow_name?: string;
+ require_approval?: McpTool.McpToolApprovalFilter | 'always' | 'never' | null;
+
+ /**
+ * Optional description of the MCP server, used to provide more context.
+ */
+ server_description?: string;
+
+ /**
+ * The URL for the MCP server. One of `server_url` or `connector_id` must be
+ * provided.
+ */
+ server_url?: string;
}
- /**
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- * means that the model will detect the start and end of speech based on audio
- * volume and respond at the end of user speech.
- */
- export interface TurnDetection {
+ export namespace McpTool {
/**
- * Amount of audio to include before the VAD detected speech (in milliseconds).
- * Defaults to 300ms.
+ * A filter object to specify which tools are allowed.
*/
- prefix_padding_ms?: number;
+ export interface McpToolFilter {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+ tool_names?: Array<string>;
+ }
+
+ /**
+ * Specify which of the MCP server's tools require approval. Can be `always`,
+ * `never`, or a filter object associated with tools that require approval.
+ */
+ export interface McpToolApprovalFilter {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ always?: McpToolApprovalFilter.Always;
+
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ never?: McpToolApprovalFilter.Never;
+ }
+
+ export namespace McpToolApprovalFilter {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface Always {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+ tool_names?: Array<string>;
+ }
+
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface Never {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+ tool_names?: Array<string>;
+ }
+ }
+ }
+ /**
+ * Granular configuration for tracing.
+ */
+ export interface TracingConfiguration {
/**
- * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
- * With shorter values the model will respond more quickly, but may jump in on
- * short pauses from the user.
+ * The group id to attach to this trace to enable filtering and grouping in the
+ * Traces Dashboard.
*/
- silence_duration_ms?: number;
+ group_id?: string;
/**
- * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
- * threshold will require louder audio to activate the model, and thus might
- * perform better in noisy environments.
+ * The arbitrary metadata to attach to this trace to enable filtering in the Traces
+ * Dashboard.
*/
- threshold?: number;
+ metadata?: unknown;
/**
- * Type of turn detection, only `server_vad` is currently supported.
+ * The name of the workflow to attach to this trace. This is used to name the trace
+ * in the Traces Dashboard.
*/
- type?: string;
+ workflow_name?: string;
}
}
/**
- * Response from creating a session and client secret for the Realtime API.
+ * Ephemeral key returned by the API. Only present when the session is created on
+ * the server via REST API.
*/
-export interface ClientSecretCreateResponse {
+export interface RealtimeTranscriptionSessionClientSecret {
/**
- * Expiration timestamp for the client secret, in seconds since epoch.
+ * Timestamp for when the token expires. Currently, all tokens expire after one
+ * minute.
*/
expires_at: number;
/**
- * The session configuration for either a realtime or transcription session.
- */
- session:
- | RealtimeSessionCreateResponse
- | ClientSecretCreateResponse.RealtimeTranscriptionSessionCreateResponse;
-
- /**
- * The generated client secret value.
+ * Ephemeral key usable in client environments to authenticate connections to the
+ * Realtime API. Use this in client-side environments rather than a standard API
+ * token, which should only be used server-side.
*/
value: string;
}
-export namespace ClientSecretCreateResponse {
+/**
+ * A new Realtime transcription session configuration.
+ *
+ * When a session is created on the server via REST API, the session object also
+ * contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
+ * not present when a session is updated via the WebSocket API.
+ */
+export interface RealtimeTranscriptionSessionCreateResponse {
/**
- * A Realtime transcription session configuration object.
+ * Ephemeral key returned by the API. Only present when the session is created on
+ * the server via REST API.
*/
- export interface RealtimeTranscriptionSessionCreateResponse {
- /**
- * Unique identifier for the session that looks like `sess_1234567890abcdef`.
- */
- id?: string;
-
- /**
- * Configuration for input audio for the session.
- */
- audio?: RealtimeTranscriptionSessionCreateResponse.Audio;
+ client_secret: RealtimeTranscriptionSessionClientSecret;
- /**
- * Expiration timestamp for the session, in seconds since epoch.
- */
- expires_at?: number;
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ input_audio_format?: string;
- /**
- * Additional fields to include in server outputs.
- *
- * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- * transcription.
- */
- include?: Array<'item.input_audio_transcription.logprobs'>;
+ /**
+ * Configuration of the transcription model.
+ */
+ input_audio_transcription?: RealtimeTranscriptionSessionInputAudioTranscription;
- /**
- * The object type. Always `realtime.transcription_session`.
- */
- object?: string;
- }
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
- export namespace RealtimeTranscriptionSessionCreateResponse {
- /**
- * Configuration for input audio for the session.
- */
- export interface Audio {
- input?: Audio.Input;
- }
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: RealtimeTranscriptionSessionTurnDetection;
+}
- export namespace Audio {
- export interface Input {
- /**
- * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- */
- format?: string;
+/**
+ * Configuration of the transcription model.
+ */
+export interface RealtimeTranscriptionSessionInputAudioTranscription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
- /**
- * Configuration for input audio noise reduction.
- */
- noise_reduction?: Input.NoiseReduction;
+ /**
+ * The model to use for transcription. Current options are `whisper-1`,
+ * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ */
+ model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe';
- /**
- * Configuration of the transcription model.
- */
- transcription?: Input.Transcription;
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. For `whisper-1`, the
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ * "expect words related to technology".
+ */
+ prompt?: string;
+}
- /**
- * Configuration for turn detection.
- */
- turn_detection?: Input.TurnDetection;
- }
+/**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+export interface RealtimeTranscriptionSessionTurnDetection {
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
- export namespace Input {
- /**
- * Configuration for input audio noise reduction.
- */
- export interface NoiseReduction {
- type?: 'near_field' | 'far_field';
- }
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
- /**
- * Configuration of the transcription model.
- */
- export interface Transcription {
- /**
- * The language of the input audio. Supplying the input language in
- * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- * format will improve accuracy and latency.
- */
- language?: string;
-
- /**
- * The model to use for transcription. Can be `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, or `whisper-1`.
- */
- model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
-
- /**
- * An optional text to guide the model's style or continue a previous audio
- * segment. The
- * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- * should match the audio language.
- */
- prompt?: string;
- }
+ /**
+ * Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
- /**
- * Configuration for turn detection.
- */
- export interface TurnDetection {
- prefix_padding_ms?: number;
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+}
- silence_duration_ms?: number;
+/**
+ * Response from creating a session and client secret for the Realtime API.
+ */
+export interface ClientSecretCreateResponse {
+ /**
+ * Expiration timestamp for the client secret, in seconds since epoch.
+ */
+ expires_at: number;
- threshold?: number;
+ /**
+ * The session configuration for either a realtime or transcription session.
+ */
+ session: RealtimeSessionCreateResponse | RealtimeTranscriptionSessionCreateResponse;
- /**
- * Type of turn detection, only `server_vad` is currently supported.
- */
- type?: string;
- }
- }
- }
- }
+ /**
+ * The generated client secret value.
+ */
+ value: string;
}
export interface ClientSecretCreateParams {
/**
- * Configuration for the ephemeral token expiration.
+ * Configuration for the client secret expiration. Expiration refers to the time
+ * after which a client secret will no longer be valid for creating sessions. The
+ * session itself may continue after that time once started. A secret can be used
+ * to create multiple sessions until it expires.
*/
expires_after?: ClientSecretCreateParams.ExpiresAfter;
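To illustrate the new `McpTool` shape above, a hypothetical Google Drive connector entry might look like this (the import path, tool names, and token placeholder are illustrative, not part of this diff):

```ts
import type { RealtimeSessionCreateResponse } from 'openai/resources/realtime';

const driveTool: RealtimeSessionCreateResponse.McpTool = {
  type: 'mcp',
  server_label: 'gdrive',
  connector_id: 'connector_googledrive',
  authorization: '<oauth-access-token>', // your app completes the OAuth flow and supplies the token
  allowed_tools: ['search_files', 'fetch_file'], // hypothetical tool names
  require_approval: { always: { tool_names: ['fetch_file'] } },
};
```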
@@ -444,18 +644,23 @@ export interface ClientSecretCreateParams {
export namespace ClientSecretCreateParams {
/**
- * Configuration for the ephemeral token expiration.
+ * Configuration for the client secret expiration. Expiration refers to the time
+ * after which a client secret will no longer be valid for creating sessions. The
+ * session itself may continue after that time once started. A secret can be used
+ * to create multiple sessions until it expires.
*/
export interface ExpiresAfter {
/**
- * The anchor point for the ephemeral token expiration. Only `created_at` is
- * currently supported.
+ * The anchor point for the client secret expiration, meaning that `seconds` will
+ * be added to the `created_at` time of the client secret to produce an expiration
+ * timestamp. Only `created_at` is currently supported.
*/
anchor?: 'created_at';
/**
* The number of seconds from the anchor point to the expiration. Select a value
- * between `10` and `7200`.
+ * between `10` and `7200` (2 hours). This defaults to 600 seconds (10 minutes) if
+ * not specified.
*/
seconds?: number;
}
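For example, a secret that stays valid for 30 minutes from creation could be requested like this (a sketch, assuming a configured client):

```ts
import OpenAI from 'openai';

const client = new OpenAI();

const secret = await client.realtime.clientSecrets.create({
  expires_after: {
    anchor: 'created_at', // only `created_at` is currently supported
    seconds: 30 * 60,     // between 10 and 7200; defaults to 600 when omitted
  },
});
```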
@@ -463,7 +668,12 @@ export namespace ClientSecretCreateParams {
export declare namespace ClientSecrets {
export {
+ type RealtimeSessionClientSecret as RealtimeSessionClientSecret,
type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse,
+ type RealtimeTranscriptionSessionClientSecret as RealtimeTranscriptionSessionClientSecret,
+ type RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse,
+ type RealtimeTranscriptionSessionInputAudioTranscription as RealtimeTranscriptionSessionInputAudioTranscription,
+ type RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection,
type ClientSecretCreateResponse as ClientSecretCreateResponse,
type ClientSecretCreateParams as ClientSecretCreateParams,
};
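To make the audio configuration above concrete, here is a hypothetical value matching `RealtimeSessionCreateResponse.Audio`, with semantic VAD on input and the `marin` voice on output (values and import path are illustrative):

```ts
import type { RealtimeSessionCreateResponse } from 'openai/resources/realtime';

const audio: RealtimeSessionCreateResponse.Audio = {
  input: {
    format: { type: 'audio/pcm', rate: 24000 },
    noise_reduction: { type: 'near_field' },
    turn_detection: {
      type: 'semantic_vad',
      eagerness: 'low', // wait longer before deciding the user has finished speaking
      create_response: true,
      interrupt_response: true,
    },
  },
  output: {
    format: { type: 'audio/pcm', rate: 24000 },
    voice: 'marin',
    speed: 1.0,
  },
};
```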
diff --git a/src/resources/realtime/index.ts b/src/resources/realtime/index.ts
index a6c5db35e..550532500 100644
--- a/src/resources/realtime/index.ts
+++ b/src/resources/realtime/index.ts
@@ -2,7 +2,12 @@
export {
ClientSecrets,
+ type RealtimeSessionClientSecret,
type RealtimeSessionCreateResponse,
+ type RealtimeTranscriptionSessionClientSecret,
+ type RealtimeTranscriptionSessionCreateResponse,
+ type RealtimeTranscriptionSessionInputAudioTranscription,
+ type RealtimeTranscriptionSessionTurnDetection,
type ClientSecretCreateResponse,
type ClientSecretCreateParams,
} from './client-secrets';
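With these re-exports, the `session` union on `ClientSecretCreateResponse` can be narrowed with a small helper; a possible sketch (the discriminator choice is an assumption, since `type` is optional on the realtime shape):

```ts
import type { ClientSecretCreateResponse, RealtimeSessionCreateResponse } from 'openai/resources/realtime';

// The GA realtime session shape carries `type: 'realtime'`; the transcription
// session response does not, so use that field as the discriminator.
function isRealtimeSession(
  session: ClientSecretCreateResponse['session'],
): session is RealtimeSessionCreateResponse {
  return (session as RealtimeSessionCreateResponse).type === 'realtime';
}
```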
diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts
index e05f4fb6d..9dee11e11 100644
--- a/src/resources/realtime/realtime.ts
+++ b/src/resources/realtime/realtime.ts
@@ -8,7 +8,12 @@ import {
ClientSecretCreateParams,
ClientSecretCreateResponse,
ClientSecrets,
+ RealtimeSessionClientSecret,
RealtimeSessionCreateResponse,
+ RealtimeTranscriptionSessionClientSecret,
+ RealtimeTranscriptionSessionCreateResponse,
+ RealtimeTranscriptionSessionInputAudioTranscription,
+ RealtimeTranscriptionSessionTurnDetection,
} from './client-secrets';
import * as ResponsesAPI from '../responses/responses';
@@ -16,6 +21,30 @@ export class Realtime extends APIResource {
clientSecrets: ClientSecretsAPI.ClientSecrets = new ClientSecretsAPI.ClientSecrets(this._client);
}
+export interface AudioTranscription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription. Current options are `whisper-1`,
+ * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ */
+ model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe';
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. For `whisper-1`, the
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ * "expect words related to technology".
+ */
+ prompt?: string;
+}
+
/**
* Returned when a conversation is created. Emitted right after session creation.
*/
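For instance, a hypothetical `AudioTranscription` value asking `gpt-4o-transcribe` for English with a domain hint (import path and values are illustrative):

```ts
import type { AudioTranscription } from 'openai/resources/realtime/realtime';

const transcription: AudioTranscription = {
  model: 'gpt-4o-transcribe',
  language: 'en',                               // ISO-639-1 code
  prompt: 'expect words related to technology', // free-text hint for gpt-4o-* models
};
```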
@@ -68,7 +97,20 @@ export type ConversationItem =
| RealtimeMcpApprovalRequest;
/**
- * Returned when a conversation item is added.
+ * Sent by the server when an Item is added to the default Conversation. This can
+ * happen in several cases:
+ *
+ * - When the client sends a `conversation.item.create` event.
+ * - When the input audio buffer is committed. In this case the item will be a user
+ * message containing the audio from the buffer.
+ * - When the model is generating a Response. In this case the
+ * `conversation.item.added` event will be sent when the model starts generating
+ * a specific Item, and thus it will not yet have any content (and `status` will
+ * be `in_progress`).
+ *
+ * The event will include the full content of the Item (except when the model is
+ * generating a Response) except for audio data, which can be retrieved separately
+ * with a `conversation.item.retrieve` event if necessary.
*/
export interface ConversationItemAdded {
/**
@@ -212,6 +254,9 @@ export interface ConversationItemDeletedEvent {
/**
* Returned when a conversation item is finalized.
+ *
+ * The event will include the full content of the Item except for audio data, which
+ * can be retrieved separately with a `conversation.item.retrieve` event if needed.
*/
export interface ConversationItemDone {
/**
@@ -239,9 +284,9 @@ export interface ConversationItemDone {
/**
* This event is the output of audio transcription for user audio written to the
* user audio buffer. Transcription begins when the input audio buffer is committed
- * by the client or server (in `server_vad` mode). Transcription runs
- * asynchronously with Response creation, so this event may come before or after
- * the Response events.
+ * by the client or server (when VAD is enabled). Transcription runs asynchronously
+ * with Response creation, so this event may come before or after the Response
+ * events.
*
* Realtime API models accept audio natively, and thus input transcription is a
* separate process run on a separate ASR (Automatic Speech Recognition) model. The
@@ -260,7 +305,7 @@ export interface ConversationItemInputAudioTranscriptionCompletedEvent {
event_id: string;
/**
- * The ID of the user message item containing the audio.
+ * The ID of the item containing the audio that is being transcribed.
*/
item_id: string;
@@ -275,7 +320,8 @@ export interface ConversationItemInputAudioTranscriptionCompletedEvent {
type: 'conversation.item.input_audio_transcription.completed';
/**
- * Usage statistics for the transcription.
+ * Usage statistics for the transcription; this is billed according to the ASR
+ * model's pricing rather than the realtime model's pricing.
*/
usage:
| ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageTokens
@@ -353,7 +399,7 @@ export namespace ConversationItemInputAudioTranscriptionCompletedEvent {
/**
* Returned when the text value of an input audio transcription content part is
- * updated.
+ * updated with incremental transcription results.
*/
export interface ConversationItemInputAudioTranscriptionDeltaEvent {
/**
@@ -362,7 +408,7 @@ export interface ConversationItemInputAudioTranscriptionDeltaEvent {
event_id: string;
/**
- * The ID of the item.
+ * The ID of the item containing the audio that is being transcribed.
*/
item_id: string;
@@ -382,7 +428,12 @@ export interface ConversationItemInputAudioTranscriptionDeltaEvent {
delta?: string;
/**
- * The log probabilities of the transcription.
+ * The log probabilities of the transcription. These can be enabled by
+ * configuring the session with
+ * `"include": ["item.input_audio_transcription.logprobs"]`. Each entry in the
+ * array corresponds to a log probability of which token would be selected for this
+ * chunk of transcription. This can help to identify if it was possible there were
+ * multiple valid options for a given chunk of transcription.
*/
logprobs?: Array<LogProbProperties> | null;
}
@@ -542,7 +593,7 @@ export interface ConversationItemTruncateEvent {
audio_end_ms: number;
/**
- * The index of the content part to truncate. Set this to 0.
+ * The index of the content part to truncate. Set this to `0`.
*/
content_index: number;
@@ -701,14 +752,19 @@ export namespace ConversationItemWithReference {
/**
* Send this event to append audio bytes to the input audio buffer. The audio
- * buffer is temporary storage you can write to and later commit. In Server VAD
- * mode, the audio buffer is used to detect speech and the server will decide when
- * to commit. When Server VAD is disabled, you must commit the audio buffer
- * manually.
+ * buffer is temporary storage you can write to and later commit. A "commit" will
+ * create a new user message item in the conversation history from the buffer
+ * content and clear the buffer. Input audio transcription (if enabled) will be
+ * generated when the buffer is committed.
+ *
+ * If VAD is enabled the audio buffer is used to detect speech and the server will
+ * decide when to commit. When Server VAD is disabled, you must commit the audio
+ * buffer manually. Input audio noise reduction operates on writes to the audio
+ * buffer.
*
* The client may choose how much audio to place in each event up to a maximum of
* 15 MiB, for example streaming smaller chunks from the client may allow the VAD
- * to be more responsive. Unlike made other client events, the server will not send
+ * to be more responsive. Unlike most other client events, the server will not send
* a confirmation response to this event.
*/
export interface InputAudioBufferAppendEvent {
@@ -988,6 +1044,36 @@ export interface McpListToolsInProgress {
type: 'mcp_list_tools.in_progress';
}
+export interface Models {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+}
+
+/**
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
+ * headphones, `far_field` is for far-field microphones such as laptop or
+ * conference room microphones.
+ */
+export type NoiseReductionType = 'near_field' | 'far_field';
+
/**
* **WebRTC Only:** Emit to cut off the current audio response. This will trigger
* the server to stop generating audio and emit a `output_audio_buffer.cleared`
@@ -1058,212 +1144,217 @@ export namespace RateLimitsUpdatedEvent {
* Configuration for input and output audio.
*/
export interface RealtimeAudioConfig {
- input?: RealtimeAudioConfig.Input;
+ input?: RealtimeAudioConfigInput;
- output?: RealtimeAudioConfig.Output;
+ output?: RealtimeAudioConfigOutput;
}
-export namespace RealtimeAudioConfig {
- export interface Input {
- /**
- * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
- * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
- * (mono), and little-endian byte order.
- */
- format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+export interface RealtimeAudioConfigInput {
+ /**
+ * The format of the input audio.
+ */
+ format?: RealtimeAudioFormats;
- /**
- * Configuration for input audio noise reduction. This can be set to `null` to turn
- * off. Noise reduction filters audio added to the input audio buffer before it is
- * sent to VAD and the model. Filtering the audio can improve VAD and turn
- * detection accuracy (reducing false positives) and model performance by improving
- * perception of the input audio.
- */
- noise_reduction?: Input.NoiseReduction;
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ noise_reduction?: RealtimeAudioConfigInput.NoiseReduction;
- /**
- * Configuration for input audio transcription, defaults to off and can be set to
- * `null` to turn off once on. Input audio transcription is not native to the
- * model, since the model consumes audio directly. Transcription runs
- * asynchronously through
- * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- * and should be treated as guidance of input audio content rather than precisely
- * what the model heard. The client can optionally set the language and prompt for
- * transcription, these offer additional guidance to the transcription service.
- */
- transcription?: Input.Transcription;
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription; these offer additional guidance to the transcription service.
+ */
+ transcription?: AudioTranscription;
+
+ /**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+ turn_detection?: RealtimeAudioInputTurnDetection;
+}
+export namespace RealtimeAudioConfigInput {
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ export interface NoiseReduction {
/**
- * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- * set to `null` to turn off, in which case the client must manually trigger model
- * response. Server VAD means that the model will detect the start and end of
- * speech based on audio volume and respond at the end of user speech. Semantic VAD
- * is more advanced and uses a turn detection model (in conjunction with VAD) to
- * semantically estimate whether the user has finished speaking, then dynamically
- * sets a timeout based on this probability. For example, if user audio trails off
- * with "uhhm", the model will score a low probability of turn end and wait longer
- * for the user to continue speaking. This can be useful for more natural
- * conversations, but may have a higher latency.
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
+ * headphones, `far_field` is for far-field microphones such as laptop or
+ * conference room microphones.
*/
- turn_detection?: Input.TurnDetection;
+ type?: RealtimeAPI.NoiseReductionType;
}
+}
+
+export interface RealtimeAudioConfigOutput {
+ /**
+ * The format of the output audio.
+ */
+ format?: RealtimeAudioFormats;
+
+ /**
+ * The speed of the model's spoken response as a multiple of the original speed.
+ * 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+ * This value can only be changed in between model turns, not while a response is
+ * in progress.
+ *
+ * This parameter is a post-processing adjustment to the audio after it is
+ * generated; it's also possible to prompt the model to speak faster or slower.
+ */
+ speed?: number;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
+ * and `cedar`. We recommend `marin` and `cedar` for best quality.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+}
+
+/**
+ * The PCM audio format. Only a 24kHz sample rate is supported.
+ */
+export type RealtimeAudioFormats =
+ | RealtimeAudioFormats.AudioPCM
+ | RealtimeAudioFormats.AudioPCMU
+ | RealtimeAudioFormats.AudioPCMA;
- export namespace Input {
+export namespace RealtimeAudioFormats {
+ /**
+ * The PCM audio format. Only a 24kHz sample rate is supported.
+ */
+ export interface AudioPCM {
/**
- * Configuration for input audio noise reduction. This can be set to `null` to turn
- * off. Noise reduction filters audio added to the input audio buffer before it is
- * sent to VAD and the model. Filtering the audio can improve VAD and turn
- * detection accuracy (reducing false positives) and model performance by improving
- * perception of the input audio.
+ * The sample rate of the audio. Always `24000`.
*/
- export interface NoiseReduction {
- /**
- * Type of noise reduction. `near_field` is for close-talking microphones such as
- * headphones, `far_field` is for far-field microphones such as laptop or
- * conference room microphones.
- */
- type?: 'near_field' | 'far_field';
- }
+ rate?: 24000;
/**
- * Configuration for input audio transcription, defaults to off and can be set to
- * `null` to turn off once on. Input audio transcription is not native to the
- * model, since the model consumes audio directly. Transcription runs
- * asynchronously through
- * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- * and should be treated as guidance of input audio content rather than precisely
- * what the model heard. The client can optionally set the language and prompt for
- * transcription, these offer additional guidance to the transcription service.
+ * The audio format. Always `audio/pcm`.
*/
- export interface Transcription {
- /**
- * The language of the input audio. Supplying the input language in
- * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- * format will improve accuracy and latency.
- */
- language?: string;
-
- /**
- * The model to use for transcription. Current options are `whisper-1`,
- * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and
- * `gpt-4o-transcribe-diarize`.
- */
- model?:
- | 'whisper-1'
- | 'gpt-4o-transcribe-latest'
- | 'gpt-4o-mini-transcribe'
- | 'gpt-4o-transcribe'
- | 'gpt-4o-transcribe-diarize';
-
- /**
- * An optional text to guide the model's style or continue a previous audio
- * segment. For `whisper-1`, the
- * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- * "expect words related to technology".
- */
- prompt?: string;
- }
+ type?: 'audio/pcm';
+ }
+ /**
+ * The G.711 μ-law format.
+ */
+ export interface AudioPCMU {
/**
- * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- * set to `null` to turn off, in which case the client must manually trigger model
- * response. Server VAD means that the model will detect the start and end of
- * speech based on audio volume and respond at the end of user speech. Semantic VAD
- * is more advanced and uses a turn detection model (in conjunction with VAD) to
- * semantically estimate whether the user has finished speaking, then dynamically
- * sets a timeout based on this probability. For example, if user audio trails off
- * with "uhhm", the model will score a low probability of turn end and wait longer
- * for the user to continue speaking. This can be useful for more natural
- * conversations, but may have a higher latency.
+ * The audio format. Always `audio/pcmu`.
*/
- export interface TurnDetection {
- /**
- * Whether or not to automatically generate a response when a VAD stop event
- * occurs.
- */
- create_response?: boolean;
-
- /**
- * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- * will wait longer for the user to continue speaking, `high` will respond more
- * quickly. `auto` is the default and is equivalent to `medium`.
- */
- eagerness?: 'low' | 'medium' | 'high' | 'auto';
+ type?: 'audio/pcmu';
+ }
- /**
- * Optional idle timeout after which turn detection will auto-timeout when no
- * additional audio is received.
- */
- idle_timeout_ms?: number | null;
+ /**
+ * The G.711 A-law format.
+ */
+ export interface AudioPCMA {
+ /**
+ * The audio format. Always `audio/pcma`.
+ */
+ type?: 'audio/pcma';
+ }
+}
- /**
- * Whether or not to automatically interrupt any ongoing response with output to
- * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- * occurs.
- */
- interrupt_response?: boolean;
+/**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+export interface RealtimeAudioInputTurnDetection {
+ /**
+ * Whether or not to automatically generate a response when a VAD stop event
+ * occurs.
+ */
+ create_response?: boolean;
- /**
- * Used only for `server_vad` mode. Amount of audio to include before the VAD
- * detected speech (in milliseconds). Defaults to 300ms.
- */
- prefix_padding_ms?: number;
+ /**
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ * will wait longer for the user to continue speaking, `high` will respond more
+ * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ */
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
- /**
- * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
- * milliseconds). Defaults to 500ms. With shorter values the model will respond
- * more quickly, but may jump in on short pauses from the user.
- */
- silence_duration_ms?: number;
+ /**
+ * Optional idle timeout after which turn detection will auto-timeout when no
+ * additional audio is received.
+ */
+ idle_timeout_ms?: number | null;
- /**
- * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
- * defaults to 0.5. A higher threshold will require louder audio to activate the
- * model, and thus might perform better in noisy environments.
- */
- threshold?: number;
+ /**
+ * Whether or not to automatically interrupt any ongoing response with output to
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ * occurs.
+ */
+ interrupt_response?: boolean;
- /**
- * Type of turn detection.
- */
- type?: 'server_vad' | 'semantic_vad';
- }
- }
+ /**
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
+ * detected speech (in milliseconds). Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
- export interface Output {
- /**
- * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- * For `pcm16`, output audio is sampled at a rate of 24kHz.
- */
- format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+ /**
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
+ * more quickly, but may jump in on short pauses from the user.
+ */
+ silence_duration_ms?: number;
- /**
- * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
- * minimum speed. 1.5 is the maximum speed. This value can only be changed in
- * between model turns, not while a response is in progress.
- */
- speed?: number;
+ /**
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0); this
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
+ * model, and thus might perform better in noisy environments.
+ */
+ threshold?: number;
- /**
- * The voice the model uses to respond. Voice cannot be changed during the session
- * once the model has responded with audio at least once. Current voice options are
- * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
- * and `cedar`.
- */
- voice?:
- | (string & {})
- | 'alloy'
- | 'ash'
- | 'ballad'
- | 'coral'
- | 'echo'
- | 'sage'
- | 'shimmer'
- | 'verse'
- | 'marin'
- | 'cedar';
- }
+ /**
+ * Type of turn detection.
+ */
+ type?: 'server_vad' | 'semantic_vad';
}
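As a rough usage sketch (assuming these types are exported from `openai/resources/realtime/realtime`, which is an assumption about the module path), a client might configure turn detection like this:

```ts
import type { RealtimeAudioInputTurnDetection } from 'openai/resources/realtime/realtime';

// Semantic VAD: let the turn-detection model decide when the user is done,
// responding eagerly and falling back to a 5s idle timeout.
export const semanticVad: RealtimeAudioInputTurnDetection = {
  type: 'semantic_vad',
  eagerness: 'high',
  create_response: true,
  interrupt_response: true,
  idle_timeout_ms: 5_000,
};

// Server VAD alternative, tuned for a noisier room.
export const serverVad: RealtimeAudioInputTurnDetection = {
  type: 'server_vad',
  threshold: 0.7,
  prefix_padding_ms: 300,
  silence_duration_ms: 700,
};
```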
/**
@@ -1283,35 +1374,6 @@ export type RealtimeClientEvent =
| SessionUpdateEvent
| TranscriptionSessionUpdate;
-/**
- * Configuration options for the generated client secret.
- */
-export interface RealtimeClientSecretConfig {
- /**
- * Configuration for the ephemeral token expiration.
- */
- expires_after?: RealtimeClientSecretConfig.ExpiresAfter;
-}
-
-export namespace RealtimeClientSecretConfig {
- /**
- * Configuration for the ephemeral token expiration.
- */
- export interface ExpiresAfter {
- /**
- * The anchor point for the ephemeral token expiration. Only `created_at` is
- * currently supported.
- */
- anchor: 'created_at';
-
- /**
- * The number of seconds from the anchor point to the expiration. Select a value
- * between `10` and `7200`.
- */
- seconds?: number;
- }
-}
-
/**
* An assistant message item in a Realtime conversation.
*/
@@ -1332,12 +1394,14 @@ export interface RealtimeConversationItemAssistantMessage {
type: 'message';
/**
- * The unique ID of the item.
+ * The unique ID of the item. This may be provided by the client or generated by
+ * the server.
*/
id?: string;
/**
- * Identifier for the API object being returned - always `realtime.item`.
+ * Identifier for the API object being returned - always `realtime.item`. Optional
+ * when creating a new item.
*/
object?: 'realtime.item';
@@ -1349,15 +1413,29 @@ export interface RealtimeConversationItemAssistantMessage {
export namespace RealtimeConversationItemAssistantMessage {
export interface Content {
+ /**
+ * Base64-encoded audio bytes, these will be parsed as the format specified in the
+ * session output audio type configuration. This defaults to PCM 16-bit 24kHz mono
+ * if not specified.
+ */
+ audio?: string;
+
/**
* The text content.
*/
text?: string;
/**
- * The content type. Always `text` for assistant messages.
+   * The transcript of the audio content; this will always be present if the output
+ * type is `audio`.
+ */
+ transcript?: string;
+
+ /**
+ * The content type, `output_text` or `output_audio` depending on the session
+ * `output_modalities` configuration.
*/
- type?: 'text';
+ type?: 'output_text' | 'output_audio';
}
}
@@ -1366,7 +1444,9 @@ export namespace RealtimeConversationItemAssistantMessage {
*/
export interface RealtimeConversationItemFunctionCall {
/**
- * The arguments of the function call.
+ * The arguments of the function call. This is a JSON-encoded string representing
+ * the arguments passed to the function, for example
+ * `{"arg1": "value1", "arg2": 42}`.
*/
arguments: string;
@@ -1381,7 +1461,8 @@ export interface RealtimeConversationItemFunctionCall {
type: 'function_call';
/**
- * The unique ID of the item.
+ * The unique ID of the item. This may be provided by the client or generated by
+ * the server.
*/
id?: string;
@@ -1391,7 +1472,8 @@ export interface RealtimeConversationItemFunctionCall {
call_id?: string;
/**
- * Identifier for the API object being returned - always `realtime.item`.
+ * Identifier for the API object being returned - always `realtime.item`. Optional
+ * when creating a new item.
*/
object?: 'realtime.item';
@@ -1411,7 +1493,8 @@ export interface RealtimeConversationItemFunctionCallOutput {
call_id: string;
/**
- * The output of the function call.
+   * The output of the function call; this is free text and can contain any
+   * information, or it can simply be empty.
*/
output: string;
@@ -1421,12 +1504,14 @@ export interface RealtimeConversationItemFunctionCallOutput {
type: 'function_call_output';
/**
- * The unique ID of the item.
+ * The unique ID of the item. This may be provided by the client or generated by
+ * the server.
*/
id?: string;
/**
- * Identifier for the API object being returned - always `realtime.item`.
+ * Identifier for the API object being returned - always `realtime.item`. Optional
+ * when creating a new item.
*/
object?: 'realtime.item';
@@ -1437,7 +1522,12 @@ export interface RealtimeConversationItemFunctionCallOutput {
}
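A hedged sketch of how a client might return a tool result to the model. The `call_id`, the JSON payload, and the `ws` handle are illustrative, and the surrounding `conversation.item.create` / `response.create` event shapes are assumed from the Realtime event protocol rather than shown in this hunk:

```ts
import type { RealtimeConversationItemFunctionCallOutput } from 'openai/resources/realtime/realtime';

declare const ws: { send(data: string): void }; // an open Realtime WebSocket (assumed)

// After running the tool locally, hand the result back to the model.
const item: RealtimeConversationItemFunctionCallOutput = {
  type: 'function_call_output',
  call_id: 'call_abc123', // must match the call_id of the function_call item
  output: JSON.stringify({ temperature_c: 21 }), // free text; JSON is a common choice
};

ws.send(JSON.stringify({ type: 'conversation.item.create', item }));
ws.send(JSON.stringify({ type: 'response.create' })); // ask the model to continue
```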
/**
- * A system message item in a Realtime conversation.
+ * A system message in a Realtime conversation can be used to provide additional
+ * context or instructions to the model. This is similar to, but distinct from, the
+ * instruction prompt provided at the start of a conversation, as system messages
+ * can be added at any point in the conversation. For major changes to the
+ * conversation's behavior, use instructions, but for smaller updates (e.g. "the
+ * user is now asking about a different topic"), use system messages.
*/
export interface RealtimeConversationItemSystemMessage {
/**
@@ -1456,12 +1546,14 @@ export interface RealtimeConversationItemSystemMessage {
type: 'message';
/**
- * The unique ID of the item.
+ * The unique ID of the item. This may be provided by the client or generated by
+ * the server.
*/
id?: string;
/**
- * Identifier for the API object being returned - always `realtime.item`.
+ * Identifier for the API object being returned - always `realtime.item`. Optional
+ * when creating a new item.
*/
object?: 'realtime.item';
@@ -1505,12 +1597,14 @@ export interface RealtimeConversationItemUserMessage {
type: 'message';
/**
- * The unique ID of the item.
+ * The unique ID of the item. This may be provided by the client or generated by
+ * the server.
*/
id?: string;
/**
- * Identifier for the API object being returned - always `realtime.item`.
+ * Identifier for the API object being returned - always `realtime.item`. Optional
+ * when creating a new item.
*/
object?: 'realtime.item';
@@ -1523,24 +1617,40 @@ export interface RealtimeConversationItemUserMessage {
export namespace RealtimeConversationItemUserMessage {
export interface Content {
/**
- * Base64-encoded audio bytes (for `input_audio`).
+ * Base64-encoded audio bytes (for `input_audio`), these will be parsed as the
+ * format specified in the session input audio type configuration. This defaults to
+ * PCM 16-bit 24kHz mono if not specified.
*/
audio?: string;
+ /**
+ * The detail level of the image (for `input_image`). `auto` will default to
+ * `high`.
+ */
+ detail?: 'auto' | 'low' | 'high';
+
+ /**
+ * Base64-encoded image bytes (for `input_image`) as a data URI. For example
+ * `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported formats are PNG
+ * and JPEG.
+ */
+ image_url?: string;
+
/**
* The text content (for `input_text`).
*/
text?: string;
/**
- * Transcript of the audio (for `input_audio`).
+ * Transcript of the audio (for `input_audio`). This is not sent to the model, but
+ * will be attached to the message item for reference.
*/
transcript?: string;
/**
- * The content type (`input_text` or `input_audio`).
+ * The content type (`input_text`, `input_audio`, or `input_image`).
*/
- type?: 'input_text' | 'input_audio';
+ type?: 'input_text' | 'input_audio' | 'input_image';
}
}
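A sketch of a user message that mixes text with the new `input_image` content type; the base64 payload is a truncated placeholder and the import path is assumed:

```ts
import type { RealtimeConversationItemUserMessage } from 'openai/resources/realtime/realtime';

export const userMessage: RealtimeConversationItemUserMessage = {
  type: 'message',
  role: 'user',
  content: [
    { type: 'input_text', text: 'What is shown in this picture?' },
    {
      type: 'input_image',
      image_url: 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...', // placeholder data URI
      detail: 'high',
    },
  ],
};
```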
@@ -1780,18 +1890,22 @@ export interface RealtimeMcphttpError {
*/
export interface RealtimeResponse {
/**
- * The unique ID of the response.
+   * The unique ID of the response; it will look like `resp_1234`.
*/
id?: string;
+ /**
+ * Configuration for audio output.
+ */
+ audio?: RealtimeResponse.Audio;
+
/**
* Which conversation the response is added to, determined by the `conversation`
* field in the `response.create` event. If `auto`, the response will be added to
* the default conversation and the value of `conversation_id` will be an id like
* `conv_1234`. If `none`, the response will not be added to any conversation and
* the value of `conversation_id` will be `null`. If responses are being triggered
- * by server VAD, the response will be added to the default conversation, thus the
- * `conversation_id` will be an id like `conv_1234`.
+   * automatically by VAD, the response will be added to the default conversation.
*/
conversation_id?: string;
@@ -1812,14 +1926,7 @@ export interface RealtimeResponse {
metadata?: Shared.Metadata | null;
/**
- * The set of modalities the model used to respond. If there are multiple
- * modalities, the model will pick one, for example if `modalities` is
- * `["text", "audio"]`, the model could be responding in either text or audio.
- */
- modalities?: Array<'text' | 'audio'>;
-
- /**
- * The object type, must be `realtime.response`.
+ * The object type, must be `realtime.response`.
*/
object?: 'realtime.response';
@@ -1829,9 +1936,12 @@ export interface RealtimeResponse {
output?: Array<ConversationItem>;
/**
- * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   * The set of modalities the model used to respond, currently the only possible
+   * values are `["audio"]` and `["text"]`. Audio output always includes a text
+   * transcript. Setting the output modalities to `["text"]` will disable audio
+   * output from the model.
*/
- output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+ output_modalities?: Array<'text' | 'audio'>;
/**
* The final status of the response (`completed`, `cancelled`, `failed`, or
@@ -1844,11 +1954,6 @@ export interface RealtimeResponse {
*/
status_details?: RealtimeResponseStatus;
- /**
- * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
- */
- temperature?: number;
-
/**
* Usage statistics for the Response, this will correspond to billing. A Realtime
* API session will maintain a conversation context and append new Items to the
@@ -1856,23 +1961,313 @@ export interface RealtimeResponse {
* become the input for later turns.
*/
usage?: RealtimeResponseUsage;
+}
+export namespace RealtimeResponse {
/**
- * The voice the model used to respond. Current voice options are `alloy`, `ash`,
- * `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ * Configuration for audio output.
*/
- voice?:
- | (string & {})
- | 'alloy'
- | 'ash'
- | 'ballad'
- | 'coral'
- | 'echo'
- | 'sage'
- | 'shimmer'
- | 'verse'
- | 'marin'
- | 'cedar';
+ export interface Audio {
+ output?: Audio.Output;
+ }
+
+ export namespace Audio {
+ export interface Output {
+ /**
+ * The format of the output audio.
+ */
+ format?: RealtimeAPI.RealtimeAudioFormats;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
+ * and `cedar`. We recommend `marin` and `cedar` for best quality.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+ }
+ }
+}
+
+/**
+ * Configuration for audio input and output.
+ */
+export interface RealtimeResponseCreateAudioOutput {
+ output?: RealtimeResponseCreateAudioOutput.Output;
+}
+
+export namespace RealtimeResponseCreateAudioOutput {
+ export interface Output {
+ /**
+ * The format of the output audio.
+ */
+ format?: RealtimeAPI.RealtimeAudioFormats;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
+ * and `cedar`. We recommend `marin` and `cedar` for best quality.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+ }
+}
+
+/**
+ * Give the model access to additional tools via remote Model Context Protocol
+ * (MCP) servers.
+ * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
+ */
+export interface RealtimeResponseCreateMcpTool {
+ /**
+ * A label for this MCP server, used to identify it in tool calls.
+ */
+ server_label: string;
+
+ /**
+ * The type of the MCP tool. Always `mcp`.
+ */
+ type: 'mcp';
+
+ /**
+ * List of allowed tool names or a filter object.
+ */
+  allowed_tools?: Array<string> | RealtimeResponseCreateMcpTool.McpToolFilter | null;
+
+ /**
+ * An OAuth access token that can be used with a remote MCP server, either with a
+ * custom MCP server URL or a service connector. Your application must handle the
+ * OAuth authorization flow and provide the token here.
+ */
+ authorization?: string;
+
+ /**
+ * Identifier for service connectors, like those available in ChatGPT. One of
+ * `server_url` or `connector_id` must be provided. Learn more about service
+ * connectors
+ * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
+ *
+ * Currently supported `connector_id` values are:
+ *
+ * - Dropbox: `connector_dropbox`
+ * - Gmail: `connector_gmail`
+ * - Google Calendar: `connector_googlecalendar`
+ * - Google Drive: `connector_googledrive`
+ * - Microsoft Teams: `connector_microsoftteams`
+ * - Outlook Calendar: `connector_outlookcalendar`
+ * - Outlook Email: `connector_outlookemail`
+ * - SharePoint: `connector_sharepoint`
+ */
+ connector_id?:
+ | 'connector_dropbox'
+ | 'connector_gmail'
+ | 'connector_googlecalendar'
+ | 'connector_googledrive'
+ | 'connector_microsoftteams'
+ | 'connector_outlookcalendar'
+ | 'connector_outlookemail'
+ | 'connector_sharepoint';
+
+ /**
+ * Optional HTTP headers to send to the MCP server. Use for authentication or other
+ * purposes.
+ */
+ headers?: { [key: string]: string } | null;
+
+ /**
+ * Specify which of the MCP server's tools require approval.
+ */
+ require_approval?: RealtimeResponseCreateMcpTool.McpToolApprovalFilter | 'always' | 'never' | null;
+
+ /**
+ * Optional description of the MCP server, used to provide more context.
+ */
+ server_description?: string;
+
+ /**
+ * The URL for the MCP server. One of `server_url` or `connector_id` must be
+ * provided.
+ */
+ server_url?: string;
+}
+
+export namespace RealtimeResponseCreateMcpTool {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface McpToolFilter {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+    tool_names?: Array<string>;
+ }
+
+ /**
+ * Specify which of the MCP server's tools require approval. Can be `always`,
+ * `never`, or a filter object associated with tools that require approval.
+ */
+ export interface McpToolApprovalFilter {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ always?: McpToolApprovalFilter.Always;
+
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ never?: McpToolApprovalFilter.Never;
+ }
+
+ export namespace McpToolApprovalFilter {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface Always {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+      tool_names?: Array<string>;
+ }
+
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface Never {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+      tool_names?: Array<string>;
+ }
+ }
+}
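A sketch of an MCP tool entry pointing at a service connector, with per-tool approval rules; the token and tool names are placeholders and the import path is assumed:

```ts
import type { RealtimeResponseCreateMcpTool } from 'openai/resources/realtime/realtime';

export const calendarTool: RealtimeResponseCreateMcpTool = {
  type: 'mcp',
  server_label: 'calendar',
  connector_id: 'connector_googlecalendar',
  authorization: '<oauth-access-token>', // obtained by your app's own OAuth flow
  require_approval: {
    never: { read_only: true },                               // read-only tools run freely
    always: { tool_names: ['create_event', 'delete_event'] }, // writes need approval
  },
};
```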
+
+/**
+ * Create a new Realtime response with these parameters
+ */
+export interface RealtimeResponseCreateParams {
+ /**
+ * Configuration for audio input and output.
+ */
+ audio?: RealtimeResponseCreateAudioOutput;
+
+ /**
+ * Controls which conversation the response is added to. Currently supports `auto`
+ * and `none`, with `auto` as the default value. The `auto` value means that the
+ * contents of the response will be added to the default conversation. Set this to
+   * `none` to create an out-of-band response that will not add items to the default
+   * conversation.
+ */
+ conversation?: (string & {}) | 'auto' | 'none';
+
+ /**
+ * Input items to include in the prompt for the model. Using this field creates a
+ * new context for this Response instead of using the default conversation. An
+ * empty array `[]` will clear the context for this Response. Note that this can
+ * include references to items that previously appeared in the session using their
+ * id.
+ */
+  input?: Array<ConversationItem>;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format, (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior. Note that the server sets default
+ * instructions which will be used if this field is not set and are visible in the
+ * `session.created` event at the start of the session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_output_tokens?: number | 'inf';
+
+ /**
+ * Set of 16 key-value pairs that can be attached to an object. This can be useful
+ * for storing additional information about the object in a structured format, and
+ * querying for objects via API or the dashboard.
+ *
+ * Keys are strings with a maximum length of 64 characters. Values are strings with
+ * a maximum length of 512 characters.
+ */
+ metadata?: Shared.Metadata | null;
+
+ /**
+   * The set of modalities the model can respond with, currently the only possible
+   * values are `["audio"]` and `["text"]`. Audio output always includes a text
+   * transcript. Setting the output modalities to `["text"]` will disable audio
+   * output from the model.
+ */
+ output_modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * Reference to a prompt template and its variables.
+ * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
+ */
+ prompt?: ResponsesAPI.ResponsePrompt | null;
+
+ /**
+ * How the model chooses tools. Provide one of the string modes or force a specific
+ * function/MCP tool.
+ */
+ tool_choice?: ResponsesAPI.ToolChoiceOptions | ResponsesAPI.ToolChoiceFunction | ResponsesAPI.ToolChoiceMcp;
+
+ /**
+ * Tools available to the model.
+ */
+ tools?: Array;
}
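A sketch of per-response overrides built from these parameters; the voice, token limit, and metadata values are illustrative and the import path is assumed:

```ts
import type { RealtimeResponseCreateParams } from 'openai/resources/realtime/realtime';

export const responseParams: RealtimeResponseCreateParams = {
  instructions: 'Answer in one short sentence.',
  output_modalities: ['audio'],
  audio: { output: { voice: 'marin' } }, // overrides session audio for this response only
  max_output_tokens: 512,
  metadata: { purpose: 'quick_answer' },
};
```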
/**
@@ -1927,7 +2322,10 @@ export namespace RealtimeResponseStatus {
*/
export interface RealtimeResponseUsage {
/**
- * Details about the input tokens used in the Response.
+ * Details about the input tokens used in the Response. Cached tokens are tokens
+ * from previous turns in the conversation that are included as context for the
+ * current response. Cached tokens here are counted as a subset of input tokens,
+ * meaning input tokens will include cached and uncached tokens.
*/
input_token_details?: RealtimeResponseUsageInputTokenDetails;
@@ -1956,25 +2354,60 @@ export interface RealtimeResponseUsage {
}
/**
- * Details about the input tokens used in the Response.
+ * Details about the input tokens used in the Response. Cached tokens are tokens
+ * from previous turns in the conversation that are included as context for the
+ * current response. Cached tokens here are counted as a subset of input tokens,
+ * meaning input tokens will include cached and uncached tokens.
*/
export interface RealtimeResponseUsageInputTokenDetails {
/**
- * The number of audio tokens used in the Response.
+ * The number of audio tokens used as input for the Response.
*/
audio_tokens?: number;
/**
- * The number of cached tokens used in the Response.
+ * The number of cached tokens used as input for the Response.
*/
cached_tokens?: number;
/**
- * The number of text tokens used in the Response.
+ * Details about the cached tokens used as input for the Response.
+ */
+ cached_tokens_details?: RealtimeResponseUsageInputTokenDetails.CachedTokensDetails;
+
+ /**
+ * The number of image tokens used as input for the Response.
+ */
+ image_tokens?: number;
+
+ /**
+ * The number of text tokens used as input for the Response.
*/
text_tokens?: number;
}
+export namespace RealtimeResponseUsageInputTokenDetails {
+ /**
+ * Details about the cached tokens used as input for the Response.
+ */
+ export interface CachedTokensDetails {
+ /**
+ * The number of cached audio tokens used as input for the Response.
+ */
+ audio_tokens?: number;
+
+ /**
+ * The number of cached image tokens used as input for the Response.
+ */
+ image_tokens?: number;
+
+ /**
+ * The number of cached text tokens used as input for the Response.
+ */
+ text_tokens?: number;
+ }
+}
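Since cached tokens are counted as a subset of input tokens, a cache hit rate can be derived directly from the usage block. A small sketch (assuming the import path, and the `input_tokens` field defined elsewhere on this type):

```ts
import type { RealtimeResponseUsage } from 'openai/resources/realtime/realtime';

// Fraction of input tokens that were served from the prompt cache.
export function cacheHitRate(usage: RealtimeResponseUsage): number {
  const input = usage.input_tokens ?? 0;
  const cached = usage.input_token_details?.cached_tokens ?? 0;
  return input > 0 ? cached / input : 0;
}
```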
+
/**
* Details about the output tokens used in the Response.
*/
@@ -2045,7 +2478,10 @@ export type RealtimeServerEvent =
export namespace RealtimeServerEvent {
/**
* Returned when a conversation item is retrieved with
- * `conversation.item.retrieve`.
+ * `conversation.item.retrieve`. This is provided as a way to fetch the server's
+ * representation of an item, for example to get access to the post-processed audio
+ * data after noise cancellation and VAD. It includes the full content of the Item,
+ * including audio data.
*/
export interface ConversationItemRetrieved {
/**
@@ -2184,7 +2620,7 @@ export interface RealtimeSession {
* what the model heard. The client can optionally set the language and prompt for
* transcription, these offer additional guidance to the transcription service.
*/
- input_audio_transcription?: RealtimeSession.InputAudioTranscription | null;
+ input_audio_transcription?: AudioTranscription | null;
/**
* The default system instructions (i.e. system message) prepended to model calls.
@@ -2266,7 +2702,7 @@ export interface RealtimeSession {
/**
* Tools (functions) available to the model.
*/
- tools?: Array;
+ tools?: Array;
/**
* Configuration options for tracing. Set to null to disable tracing. Once tracing
@@ -2324,64 +2760,7 @@ export namespace RealtimeSession {
* headphones, `far_field` is for far-field microphones such as laptop or
* conference room microphones.
*/
- type?: 'near_field' | 'far_field';
- }
-
- /**
- * Configuration for input audio transcription, defaults to off and can be set to
- * `null` to turn off once on. Input audio transcription is not native to the
- * model, since the model consumes audio directly. Transcription runs
- * asynchronously through
- * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- * and should be treated as guidance of input audio content rather than precisely
- * what the model heard. The client can optionally set the language and prompt for
- * transcription, these offer additional guidance to the transcription service.
- */
- export interface InputAudioTranscription {
- /**
- * The language of the input audio. Supplying the input language in
- * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- * format will improve accuracy and latency.
- */
- language?: string;
-
- /**
- * The model to use for transcription, current options are `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, and `whisper-1`.
- */
- model?: string;
-
- /**
- * An optional text to guide the model's style or continue a previous audio
- * segment. For `whisper-1`, the
- * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- * "expect words related to technology".
- */
- prompt?: string;
- }
-
- export interface Tool {
- /**
- * The description of the function, including guidance on when and how to call it,
- * and guidance about what to tell the user when calling (if anything).
- */
- description?: string;
-
- /**
- * The name of the function.
- */
- name?: string;
-
- /**
- * Parameters of the function in JSON Schema.
- */
- parameters?: unknown;
-
- /**
- * The type of the tool, i.e. `function`.
- */
- type?: 'function';
+ type?: RealtimeAPI.NoiseReductionType;
}
/**
@@ -2477,22 +2856,6 @@ export namespace RealtimeSession {
* Realtime session object configuration.
*/
export interface RealtimeSessionCreateRequest {
- /**
- * The Realtime model used for this session.
- */
- model:
- | (string & {})
- | 'gpt-realtime'
- | 'gpt-realtime-2025-08-28'
- | 'gpt-4o-realtime'
- | 'gpt-4o-mini-realtime'
- | 'gpt-4o-realtime-preview'
- | 'gpt-4o-realtime-preview-2024-10-01'
- | 'gpt-4o-realtime-preview-2024-12-17'
- | 'gpt-4o-realtime-preview-2025-06-03'
- | 'gpt-4o-mini-realtime-preview'
- | 'gpt-4o-mini-realtime-preview-2024-12-17';
-
/**
* The type of session to create. Always `realtime` for the Realtime API.
*/
@@ -2503,16 +2866,11 @@ export interface RealtimeSessionCreateRequest {
*/
audio?: RealtimeAudioConfig;
- /**
- * Configuration options for the generated client secret.
- */
- client_secret?: RealtimeClientSecretConfig;
-
/**
* Additional fields to include in server outputs.
*
- * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- * transcription.
+ * `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
*/
include?: Array<'item.input_audio_transcription.logprobs'>;
@@ -2539,8 +2897,24 @@ export interface RealtimeSessionCreateRequest {
max_output_tokens?: number | 'inf';
/**
- * The set of modalities the model can respond with. To disable audio, set this to
- * ["text"].
+ * The Realtime model used for this session.
+ */
+ model?:
+ | (string & {})
+ | 'gpt-realtime'
+ | 'gpt-realtime-2025-08-28'
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-realtime-preview-2025-06-03'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+ /**
+ * The set of modalities the model can respond with. It defaults to `["audio"]`,
+ * indicating that the model will respond with audio plus a transcript. `["text"]`
+ * can be used to make the model respond with text only. It is not possible to
+ * request both `text` and `audio` at the same time.
*/
output_modalities?: Array<'text' | 'audio'>;
@@ -2550,12 +2924,6 @@ export interface RealtimeSessionCreateRequest {
*/
prompt?: ResponsesAPI.ResponsePrompt | null;
- /**
- * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
- * temperature of 0.8 is highly recommended for best performance.
- */
- temperature?: number;
-
/**
* How the model chooses tools. Provide one of the string modes or force a specific
* function/MCP tool.
@@ -2568,8 +2936,9 @@ export interface RealtimeSessionCreateRequest {
tools?: RealtimeToolsConfig;
/**
- * Configuration options for tracing. Set to null to disable tracing. Once tracing
- * is enabled for a session, the configuration cannot be modified.
+   * The Realtime API can write session traces to the
+ * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
+ * tracing is enabled for a session, the configuration cannot be modified.
*
* `auto` will create a trace for the session with default values for the workflow
* name, group id, and metadata.
@@ -2578,8 +2947,7 @@ export interface RealtimeSessionCreateRequest {
/**
* Controls how the realtime conversation is truncated prior to model inference.
- * The default is `auto`. When set to `retention_ratio`, the server retains a
- * fraction of the conversation tokens prior to the instructions.
+ * The default is `auto`.
*/
truncation?: RealtimeTruncation;
}
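Putting the pieces together, a hedged sketch of a GA realtime session configuration. The nested `audio` shape follows the `RealtimeAudioConfig` input/output types referenced above; their exact field placement is an assumption here, as they are defined outside this hunk:

```ts
import type { RealtimeSessionCreateRequest } from 'openai/resources/realtime/realtime';

export const session: RealtimeSessionCreateRequest = {
  type: 'realtime',
  model: 'gpt-realtime',
  output_modalities: ['audio'],
  instructions: 'You are a concise, friendly voice assistant.',
  audio: {
    input: {
      noise_reduction: { type: 'near_field' },
      turn_detection: { type: 'semantic_vad', eagerness: 'medium' },
    },
    output: { voice: 'cedar' },
  },
  truncation: 'auto',
};
```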
@@ -2603,32 +2971,9 @@ export type RealtimeToolsConfig = Array;
* (MCP) servers.
* [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
*/
-export type RealtimeToolsConfigUnion = RealtimeToolsConfigUnion.Function | RealtimeToolsConfigUnion.Mcp;
+export type RealtimeToolsConfigUnion = Models | RealtimeToolsConfigUnion.Mcp;
export namespace RealtimeToolsConfigUnion {
- export interface Function {
- /**
- * The description of the function, including guidance on when and how to call it,
- * and guidance about what to tell the user when calling (if anything).
- */
- description?: string;
-
- /**
- * The name of the function.
- */
- name?: string;
-
- /**
- * Parameters of the function in JSON Schema.
- */
- parameters?: unknown;
-
- /**
- * The type of the tool, i.e. `function`.
- */
- type?: 'function';
- }
-
/**
* Give the model access to additional tools via remote Model Context Protocol
* (MCP) servers.
@@ -2783,8 +3128,9 @@ export namespace RealtimeToolsConfigUnion {
}
/**
- * Configuration options for tracing. Set to null to disable tracing. Once tracing
- * is enabled for a session, the configuration cannot be modified.
+ * The Realtime API can write session traces to the
+ * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
+ * tracing is enabled for a session, the configuration cannot be modified.
*
* `auto` will create a trace for the session with default values for the workflow
* name, group id, and metadata.
@@ -2798,54 +3144,36 @@ export namespace RealtimeTracingConfig {
export interface TracingConfiguration {
/**
* The group id to attach to this trace to enable filtering and grouping in the
- * traces dashboard.
+ * Traces Dashboard.
*/
group_id?: string;
/**
- * The arbitrary metadata to attach to this trace to enable filtering in the traces
- * dashboard.
+ * The arbitrary metadata to attach to this trace to enable filtering in the Traces
+ * Dashboard.
*/
metadata?: unknown;
/**
* The name of the workflow to attach to this trace. This is used to name the trace
- * in the traces dashboard.
+ * in the Traces Dashboard.
*/
workflow_name?: string;
}
}
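A sketch of the two tracing options; the workflow name, group id, and metadata are illustrative:

```ts
import type { RealtimeTracingConfig } from 'openai/resources/realtime/realtime';

// Either opt in with default values...
export const autoTracing: RealtimeTracingConfig = 'auto';

// ...or label the trace explicitly for filtering in the Traces Dashboard.
export const namedTracing: RealtimeTracingConfig = {
  workflow_name: 'support-voice-agent',
  group_id: 'customer-1234',
  metadata: { environment: 'staging' },
};
```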
/**
- * Realtime transcription session object configuration.
+ * Configuration for input and output audio.
*/
-export interface RealtimeTranscriptionSessionCreateRequest {
- /**
- * ID of the model to use. The options are `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- * Whisper V2 model).
- */
- model: (string & {}) | 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe';
-
- /**
- * The type of session to create. Always `transcription` for transcription
- * sessions.
- */
- type: 'transcription';
+export interface RealtimeTranscriptionSessionAudio {
+ input?: RealtimeTranscriptionSessionAudioInput;
+}
+export interface RealtimeTranscriptionSessionAudioInput {
/**
- * The set of items to include in the transcription. Current available items are:
- *
- * - `item.input_audio_transcription.logprobs`
+ * The PCM audio format. Only a 24kHz sample rate is supported.
*/
- include?: Array<'item.input_audio_transcription.logprobs'>;
-
- /**
- * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
- * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
- * (mono), and little-endian byte order.
- */
- input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+ format?: RealtimeAudioFormats;
/**
* Configuration for input audio noise reduction. This can be set to `null` to turn
@@ -2854,24 +3182,36 @@ export interface RealtimeTranscriptionSessionCreateRequest {
* detection accuracy (reducing false positives) and model performance by improving
* perception of the input audio.
*/
- input_audio_noise_reduction?: RealtimeTranscriptionSessionCreateRequest.InputAudioNoiseReduction;
+ noise_reduction?: RealtimeTranscriptionSessionAudioInput.NoiseReduction;
/**
- * Configuration for input audio transcription. The client can optionally set the
- * language and prompt for transcription, these offer additional guidance to the
- * transcription service.
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription, these offer additional guidance to the transcription service.
*/
- input_audio_transcription?: RealtimeTranscriptionSessionCreateRequest.InputAudioTranscription;
+ transcription?: AudioTranscription;
/**
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- * means that the model will detect the start and end of speech based on audio
- * volume and respond at the end of user speech.
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger a model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
*/
- turn_detection?: RealtimeTranscriptionSessionCreateRequest.TurnDetection;
+ turn_detection?: RealtimeTranscriptionSessionAudioInputTurnDetection;
}
-export namespace RealtimeTranscriptionSessionCreateRequest {
+export namespace RealtimeTranscriptionSessionAudioInput {
/**
* Configuration for input audio noise reduction. This can be set to `null` to turn
* off. Noise reduction filters audio added to the input audio buffer before it is
@@ -2879,105 +3219,127 @@ export namespace RealtimeTranscriptionSessionCreateRequest {
* detection accuracy (reducing false positives) and model performance by improving
* perception of the input audio.
*/
- export interface InputAudioNoiseReduction {
+ export interface NoiseReduction {
/**
* Type of noise reduction. `near_field` is for close-talking microphones such as
* headphones, `far_field` is for far-field microphones such as laptop or
* conference room microphones.
*/
- type?: 'near_field' | 'far_field';
+ type?: RealtimeAPI.NoiseReductionType;
}
+}
+/**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger a model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+export interface RealtimeTranscriptionSessionAudioInputTurnDetection {
/**
- * Configuration for input audio transcription. The client can optionally set the
- * language and prompt for transcription, these offer additional guidance to the
- * transcription service.
+ * Whether or not to automatically generate a response when a VAD stop event
+ * occurs.
*/
- export interface InputAudioTranscription {
- /**
- * The language of the input audio. Supplying the input language in
- * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- * format will improve accuracy and latency.
- */
- language?: string;
+ create_response?: boolean;
- /**
- * The model to use for transcription, current options are `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, and `whisper-1`.
- */
- model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+ /**
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ * will wait longer for the user to continue speaking, `high` will respond more
+ * quickly. `auto` is the default and is equivalent to `medium`.
+ */
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
- /**
- * An optional text to guide the model's style or continue a previous audio
- * segment. For `whisper-1`, the
- * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- * "expect words related to technology".
- */
- prompt?: string;
- }
+ /**
+   * Optional idle timeout, in milliseconds, after which turn detection will
+   * automatically time out when no additional audio is received.
+ */
+ idle_timeout_ms?: number | null;
/**
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
- * means that the model will detect the start and end of speech based on audio
- * volume and respond at the end of user speech.
+ * Whether or not to automatically interrupt any ongoing response with output to
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ * occurs.
*/
- export interface TurnDetection {
- /**
- * Amount of audio to include before the VAD detected speech (in milliseconds).
- * Defaults to 300ms.
- */
- prefix_padding_ms?: number;
+ interrupt_response?: boolean;
- /**
- * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
- * With shorter values the model will respond more quickly, but may jump in on
- * short pauses from the user.
- */
- silence_duration_ms?: number;
+ /**
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
+ * detected speech (in milliseconds). Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
+ * more quickly, but may jump in on short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
+ * model, and thus might perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection.
+ */
+ type?: 'server_vad' | 'semantic_vad';
+}
+
+/**
+ * Realtime transcription session object configuration.
+ */
+export interface RealtimeTranscriptionSessionCreateRequest {
+ /**
+ * The type of session to create. Always `transcription` for transcription
+ * sessions.
+ */
+ type: 'transcription';
- /**
- * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
- * threshold will require louder audio to activate the model, and thus might
- * perform better in noisy environments.
- */
- threshold?: number;
+ /**
+ * Configuration for input and output audio.
+ */
+ audio?: RealtimeTranscriptionSessionAudio;
- /**
- * Type of turn detection. Only `server_vad` is currently supported for
- * transcription sessions.
- */
- type?: 'server_vad';
- }
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
}
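A sketch of a transcription-only session. The `transcription` object follows the `AudioTranscription` type; its `model` and `language` fields are assumed from the transcription config this release replaces, and the import path is assumed:

```ts
import type { RealtimeTranscriptionSessionCreateRequest } from 'openai/resources/realtime/realtime';

export const transcriptionSession: RealtimeTranscriptionSessionCreateRequest = {
  type: 'transcription',
  include: ['item.input_audio_transcription.logprobs'],
  audio: {
    input: {
      transcription: { model: 'gpt-4o-transcribe', language: 'en' },
      turn_detection: { type: 'server_vad', silence_duration_ms: 600 },
      noise_reduction: { type: 'far_field' },
    },
  },
};
```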
/**
* Controls how the realtime conversation is truncated prior to model inference.
- * The default is `auto`. When set to `retention_ratio`, the server retains a
- * fraction of the conversation tokens prior to the instructions.
+ * The default is `auto`.
*/
-export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncation.RetentionRatioTruncation;
+export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncationRetentionRatio;
-export namespace RealtimeTruncation {
+/**
+ * Retain a fraction of the conversation tokens when the conversation exceeds the
+ * input token limit. This allows you to amortize truncations across multiple
+ * turns, which can help improve cached token usage.
+ */
+export interface RealtimeTruncationRetentionRatio {
/**
- * Retain a fraction of the conversation tokens.
+ * Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
+ * conversation exceeds the input token limit.
*/
- export interface RetentionRatioTruncation {
- /**
- * Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0).
- */
- retention_ratio: number;
-
- /**
- * Use retention ratio truncation.
- */
- type: 'retention_ratio';
+ retention_ratio: number;
- /**
- * Optional cap on tokens allowed after the instructions.
- */
- post_instructions_token_limit?: number | null;
- }
+ /**
+ * Use retention ratio truncation.
+ */
+ type: 'retention_ratio';
}
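For example, retaining roughly the most recent 80% of post-instruction tokens whenever the context overflows (import path assumed):

```ts
import type { RealtimeTruncation } from 'openai/resources/realtime/realtime';

export const truncation: RealtimeTruncation = {
  type: 'retention_ratio',
  retention_ratio: 0.8, // keep ~80% of post-instruction conversation tokens
};
```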
/**
@@ -3141,7 +3503,9 @@ export interface ResponseAudioTranscriptDoneEvent {
/**
* Send this event to cancel an in-progress response. The server will respond with
* a `response.done` event with a status of `response.status=cancelled`. If there
- * is no response to cancel, the server will respond with an error.
+ * is no response to cancel, the server will respond with an error. It's safe to
+ * call `response.cancel` even if no response is in progress; an error will be
+ * returned and the session will remain unaffected.
*/
export interface ResponseCancelEvent {
/**
@@ -3304,15 +3668,26 @@ export namespace ResponseContentPartDoneEvent {
*
* A Response will include at least one Item, and may have two, in which case the
* second will be a function call. These Items will be appended to the conversation
- * history.
+ * history by default.
*
* The server will respond with a `response.created` event, events for Items and
* content created, and finally a `response.done` event to indicate the Response is
* complete.
*
- * The `response.create` event includes inference configuration like
- * `instructions`, and `temperature`. These fields will override the Session's
- * configuration for this Response only.
+ * The `response.create` event includes inference configuration like `instructions`
+ * and `tools`. If these are set, they will override the Session's configuration
+ * for this Response only.
+ *
+ * Responses can be created out-of-band of the default Conversation, meaning that
+ * they can have arbitrary input, and it's possible to disable writing the output
+ * to the Conversation. Only one Response can write to the default Conversation at
+ * a time, but otherwise multiple Responses can be created in parallel. The
+ * `metadata` field is a good way to disambiguate multiple simultaneous Responses.
+ *
+ * Clients can set `conversation` to `none` to create a Response that does not
+ * write to the default Conversation. Arbitrary input can be provided with the
+ * `input` field, which is an array accepting raw Items and references to existing
+ * Items.
*/
export interface ResponseCreateEvent {
/**
@@ -3328,142 +3703,7 @@ export interface ResponseCreateEvent {
/**
* Create a new Realtime response with these parameters
*/
- response?: ResponseCreateEvent.Response;
-}
-
-export namespace ResponseCreateEvent {
- /**
- * Create a new Realtime response with these parameters
- */
- export interface Response {
- /**
- * Controls which conversation the response is added to. Currently supports `auto`
- * and `none`, with `auto` as the default value. The `auto` value means that the
- * contents of the response will be added to the default conversation. Set this to
- * `none` to create an out-of-band response which will not add items to default
- * conversation.
- */
- conversation?: (string & {}) | 'auto' | 'none';
-
- /**
- * Input items to include in the prompt for the model. Using this field creates a
- * new context for this Response instead of using the default conversation. An
- * empty array `[]` will clear the context for this Response. Note that this can
- * include references to items from the default conversation.
- */
- input?: Array;
-
- /**
- * The default system instructions (i.e. system message) prepended to model calls.
- * This field allows the client to guide the model on desired responses. The model
- * can be instructed on response content and format, (e.g. "be extremely succinct",
- * "act friendly", "here are examples of good responses") and on audio behavior
- * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
- * instructions are not guaranteed to be followed by the model, but they provide
- * guidance to the model on the desired behavior.
- *
- * Note that the server sets default instructions which will be used if this field
- * is not set and are visible in the `session.created` event at the start of the
- * session.
- */
- instructions?: string;
-
- /**
- * Maximum number of output tokens for a single assistant response, inclusive of
- * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
- * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
- */
- max_output_tokens?: number | 'inf';
-
- /**
- * Set of 16 key-value pairs that can be attached to an object. This can be useful
- * for storing additional information about the object in a structured format, and
- * querying for objects via API or the dashboard.
- *
- * Keys are strings with a maximum length of 64 characters. Values are strings with
- * a maximum length of 512 characters.
- */
- metadata?: Shared.Metadata | null;
-
- /**
- * The set of modalities the model can respond with. To disable audio, set this to
- * ["text"].
- */
- modalities?: Array<'text' | 'audio'>;
-
- /**
- * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- */
- output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
-
- /**
- * Reference to a prompt template and its variables.
- * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
- */
- prompt?: ResponsesAPI.ResponsePrompt | null;
-
- /**
- * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
- */
- temperature?: number;
-
- /**
- * How the model chooses tools. Provide one of the string modes or force a specific
- * function/MCP tool.
- */
- tool_choice?:
- | ResponsesAPI.ToolChoiceOptions
- | ResponsesAPI.ToolChoiceFunction
- | ResponsesAPI.ToolChoiceMcp;
-
- /**
- * Tools (functions) available to the model.
- */
- tools?: Array;
-
- /**
- * The voice the model uses to respond. Voice cannot be changed during the session
- * once the model has responded with audio at least once. Current voice options are
- * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- */
- voice?:
- | (string & {})
- | 'alloy'
- | 'ash'
- | 'ballad'
- | 'coral'
- | 'echo'
- | 'sage'
- | 'shimmer'
- | 'verse'
- | 'marin'
- | 'cedar';
- }
-
- export namespace Response {
- export interface Tool {
- /**
- * The description of the function, including guidance on when and how to call it,
- * and guidance about what to tell the user when calling (if anything).
- */
- description?: string;
-
- /**
- * The name of the function.
- */
- name?: string;
-
- /**
- * Parameters of the function in JSON Schema.
- */
- parameters?: unknown;
-
- /**
- * The type of the tool, i.e. `function`.
- */
- type?: 'function';
- }
- }
+ response?: RealtimeResponseCreateParams;
}
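A sketch of an out-of-band `response.create` event that classifies a message without writing anything back to the default conversation; the WebSocket handle, event id, and input item are illustrative, and the import path is assumed:

```ts
import type { ResponseCreateEvent } from 'openai/resources/realtime/realtime';

declare const ws: { send(data: string): void }; // an open Realtime WebSocket (assumed)

const event: ResponseCreateEvent = {
  type: 'response.create',
  event_id: 'evt_classify_01',
  response: {
    conversation: 'none', // out-of-band: output items are not appended anywhere
    output_modalities: ['text'],
    instructions: 'Classify the sentiment of the message as positive, neutral, or negative.',
    input: [
      {
        type: 'message',
        role: 'user',
        content: [{ type: 'input_text', text: 'I love this product, but shipping was slow.' }],
      },
    ],
    metadata: { purpose: 'sentiment_classification' },
  },
};

ws.send(JSON.stringify(event));
```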
/**
@@ -3491,6 +3731,13 @@ export interface ResponseCreatedEvent {
* Returned when a Response is done streaming. Always emitted, no matter the final
* state. The Response object included in the `response.done` event will include
* all output Items in the Response but will omit the raw audio data.
+ *
+ * Clients should check the `status` field of the Response to determine if it was
+ * successful (`completed`) or if there was another outcome: `cancelled`, `failed`,
+ * or `incomplete`.
+ *
+ * A response will contain all output items that were generated during the
+ * response, excluding any audio content.
*/
export interface ResponseDoneEvent {
/**
@@ -3894,9 +4141,9 @@ export interface SessionCreatedEvent {
event_id: string;
/**
- * Realtime session object.
+ * The session configuration.
*/
- session: RealtimeSession;
+ session: RealtimeSessionCreateRequest | RealtimeTranscriptionSessionCreateRequest;
/**
* The event type, must be `session.created`.
@@ -3905,21 +4152,22 @@ export interface SessionCreatedEvent {
}
/**
- * Send this event to update the session’s default configuration. The client may
- * send this event at any time to update any field, except for `voice`. However,
- * note that once a session has been initialized with a particular `model`, it
- * can’t be changed to another model using `session.update`.
+ * Send this event to update the session’s configuration. The client may send this
+ * event at any time to update any field except for `voice` and `model`. `voice`
+ * can be updated only if there have been no audio outputs yet.
*
* When the server receives a `session.update`, it will respond with a
* `session.updated` event showing the full, effective configuration. Only the
- * fields that are present are updated. To clear a field like `instructions`, pass
- * an empty string.
+ * fields that are present in the `session.update` are updated. To clear a field
+ * like `instructions`, pass an empty string. To clear a field like `tools`, pass
+ * an empty array. To clear a field like `turn_detection`, pass `null`.
*/
export interface SessionUpdateEvent {
/**
- * Realtime session object configuration.
+ * Update the Realtime session. Choose either a realtime session or a transcription
+ * session.
*/
- session: RealtimeSessionCreateRequest;
+ session: RealtimeSessionCreateRequest | RealtimeTranscriptionSessionCreateRequest;
/**
* The event type, must be `session.update`.
@@ -3927,7 +4175,10 @@ export interface SessionUpdateEvent {
type: 'session.update';
/**
- * Optional client-generated ID used to identify this event.
+ * Optional client-generated ID used to identify this event. This is an arbitrary
+ * string that a client may assign. It will be passed back if there is an error
+ * with the event, but the corresponding `session.updated` event will not include
+ * it.
*/
event_id?: string;
}
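A sketch of a `session.update` that clears fields the way the comment above describes (empty string, empty array, `null`). Whether the generated `audio.input.turn_detection` type accepts `null` is assumed from its doc comment, and the import path is assumed:

```ts
import type { SessionUpdateEvent } from 'openai/resources/realtime/realtime';

export const update: SessionUpdateEvent = {
  type: 'session.update',
  session: {
    type: 'realtime',
    instructions: '', // clear the session instructions
    tools: [],        // clear all tools
    audio: {
      input: { turn_detection: null }, // disable turn detection entirely
    },
  },
};
```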
@@ -3943,9 +4194,9 @@ export interface SessionUpdatedEvent {
event_id: string;
/**
- * Realtime session object.
+ * The session configuration.
*/
- session: RealtimeSession;
+ session: RealtimeSessionCreateRequest | RealtimeTranscriptionSessionCreateRequest;
/**
* The event type, must be `session.updated`.
@@ -3963,9 +4214,13 @@ export interface TranscriptionSessionCreated {
event_id: string;
/**
- * A Realtime transcription session configuration object.
+ * A new Realtime transcription session configuration.
+ *
+ * When a session is created on the server via REST API, the session object also
+ * contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
+ * not present when a session is updated via the WebSocket API.
*/
- session: TranscriptionSessionCreated.Session;
+ session: ClientSecretsAPI.RealtimeTranscriptionSessionCreateResponse;
/**
* The event type, must be `transcription_session.created`.
@@ -3973,125 +4228,6 @@ export interface TranscriptionSessionCreated {
type: 'transcription_session.created';
}
-export namespace TranscriptionSessionCreated {
- /**
- * A Realtime transcription session configuration object.
- */
- export interface Session {
- /**
- * Unique identifier for the session that looks like `sess_1234567890abcdef`.
- */
- id?: string;
-
- /**
- * Configuration for input audio for the session.
- */
- audio?: Session.Audio;
-
- /**
- * Expiration timestamp for the session, in seconds since epoch.
- */
- expires_at?: number;
-
- /**
- * Additional fields to include in server outputs.
- *
- * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- * transcription.
- */
- include?: Array<'item.input_audio_transcription.logprobs'>;
-
- /**
- * The object type. Always `realtime.transcription_session`.
- */
- object?: string;
- }
-
- export namespace Session {
- /**
- * Configuration for input audio for the session.
- */
- export interface Audio {
- input?: Audio.Input;
- }
-
- export namespace Audio {
- export interface Input {
- /**
- * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- */
- format?: string;
-
- /**
- * Configuration for input audio noise reduction.
- */
- noise_reduction?: Input.NoiseReduction;
-
- /**
- * Configuration of the transcription model.
- */
- transcription?: Input.Transcription;
-
- /**
- * Configuration for turn detection.
- */
- turn_detection?: Input.TurnDetection;
- }
-
- export namespace Input {
- /**
- * Configuration for input audio noise reduction.
- */
- export interface NoiseReduction {
- type?: 'near_field' | 'far_field';
- }
-
- /**
- * Configuration of the transcription model.
- */
- export interface Transcription {
- /**
- * The language of the input audio. Supplying the input language in
- * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- * format will improve accuracy and latency.
- */
- language?: string;
-
- /**
- * The model to use for transcription. Can be `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, or `whisper-1`.
- */
- model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
-
- /**
- * An optional text to guide the model's style or continue a previous audio
- * segment. The
- * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- * should match the audio language.
- */
- prompt?: string;
- }
-
- /**
- * Configuration for turn detection.
- */
- export interface TurnDetection {
- prefix_padding_ms?: number;
-
- silence_duration_ms?: number;
-
- threshold?: number;
-
- /**
- * Type of turn detection, only `server_vad` is currently supported.
- */
- type?: string;
- }
- }
- }
- }
-}
-
/**
* Send this event to update a transcription session.
*/
@@ -4099,7 +4235,7 @@ export interface TranscriptionSessionUpdate {
/**
* Realtime transcription session object configuration.
*/
- session: RealtimeTranscriptionSessionCreateRequest;
+ session: TranscriptionSessionUpdate.Session;
/**
* The event type, must be `transcription_session.update`.
@@ -4112,150 +4248,130 @@ export interface TranscriptionSessionUpdate {
event_id?: string;
}
-/**
- * Returned when a transcription session is updated with a
- * `transcription_session.update` event, unless there is an error.
- */
-export interface TranscriptionSessionUpdatedEvent {
- /**
- * The unique ID of the server event.
- */
- event_id: string;
-
- /**
- * A Realtime transcription session configuration object.
- */
- session: TranscriptionSessionUpdatedEvent.Session;
-
- /**
- * The event type, must be `transcription_session.updated`.
- */
- type: 'transcription_session.updated';
-}
-
-export namespace TranscriptionSessionUpdatedEvent {
+export namespace TranscriptionSessionUpdate {
/**
- * A Realtime transcription session configuration object.
+ * Realtime transcription session object configuration.
*/
export interface Session {
/**
- * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+     * The set of items to include in the transcription. Currently available items are:
+ * `item.input_audio_transcription.logprobs`
*/
- id?: string;
+ include?: Array<'item.input_audio_transcription.logprobs'>;
/**
- * Configuration for input audio for the session.
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ * (mono), and little-endian byte order.
*/
- audio?: Session.Audio;
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
- * Expiration timestamp for the session, in seconds since epoch.
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
*/
- expires_at?: number;
+ input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
/**
- * Additional fields to include in server outputs.
- *
- * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
- * transcription.
+     * Configuration for input audio transcription. The client can optionally set the
+     * language and prompt for transcription; these offer additional guidance to the
+     * transcription service.
*/
- include?: Array<'item.input_audio_transcription.logprobs'>;
+ input_audio_transcription?: RealtimeAPI.AudioTranscription;
/**
- * The object type. Always `realtime.transcription_session`.
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
*/
- object?: string;
+ turn_detection?: Session.TurnDetection;
}
export namespace Session {
/**
- * Configuration for input audio for the session.
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
*/
- export interface Audio {
- input?: Audio.Input;
+ export interface InputAudioNoiseReduction {
+ /**
+       * Type of noise reduction. `near_field` is for close-talking microphones such as
+       * headphones; `far_field` is for far-field microphones such as laptop or
+       * conference room microphones.
+ */
+ type?: RealtimeAPI.NoiseReductionType;
}
- export namespace Audio {
- export interface Input {
- /**
- * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- */
- format?: string;
-
- /**
- * Configuration for input audio noise reduction.
- */
- noise_reduction?: Input.NoiseReduction;
-
- /**
- * Configuration of the transcription model.
- */
- transcription?: Input.Transcription;
-
- /**
- * Configuration for turn detection.
- */
- turn_detection?: Input.TurnDetection;
- }
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
- export namespace Input {
- /**
- * Configuration for input audio noise reduction.
- */
- export interface NoiseReduction {
- type?: 'near_field' | 'far_field';
- }
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
- /**
- * Configuration of the transcription model.
- */
- export interface Transcription {
- /**
- * The language of the input audio. Supplying the input language in
- * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- * format will improve accuracy and latency.
- */
- language?: string;
-
- /**
- * The model to use for transcription. Can be `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, or `whisper-1`.
- */
- model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
-
- /**
- * An optional text to guide the model's style or continue a previous audio
- * segment. The
- * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- * should match the audio language.
- */
- prompt?: string;
- }
+ /**
+       * Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+       * threshold will require louder audio to activate the model, and thus might
+       * perform better in noisy environments.
+ */
+ threshold?: number;
- /**
- * Configuration for turn detection.
- */
- export interface TurnDetection {
- prefix_padding_ms?: number;
+ /**
+ * Type of turn detection. Only `server_vad` is currently supported for
+ * transcription sessions.
+ */
+ type?: 'server_vad';
+ }
+ }
+}
- silence_duration_ms?: number;
+/**
+ * Returned when a transcription session is updated with a
+ * `transcription_session.update` event, unless there is an error.
+ */
+export interface TranscriptionSessionUpdatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
- threshold?: number;
+ /**
+ * A new Realtime transcription session configuration.
+ *
+   * When a session is created on the server via the REST API, the session object also
+ * contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
+ * not present when a session is updated via the WebSocket API.
+ */
+ session: ClientSecretsAPI.RealtimeTranscriptionSessionCreateResponse;
- /**
- * Type of turn detection, only `server_vad` is currently supported.
- */
- type?: string;
- }
- }
- }
- }
+ /**
+ * The event type, must be `transcription_session.updated`.
+ */
+ type: 'transcription_session.updated';
}
Realtime.ClientSecrets = ClientSecrets;
export declare namespace Realtime {
export {
+ type AudioTranscription as AudioTranscription,
type ConversationCreatedEvent as ConversationCreatedEvent,
type ConversationItem as ConversationItem,
type ConversationItemAdded as ConversationItemAdded,
@@ -4284,11 +4400,16 @@ export declare namespace Realtime {
type McpListToolsCompleted as McpListToolsCompleted,
type McpListToolsFailed as McpListToolsFailed,
type McpListToolsInProgress as McpListToolsInProgress,
+ type Models as Models,
+ type NoiseReductionType as NoiseReductionType,
type OutputAudioBufferClearEvent as OutputAudioBufferClearEvent,
type RateLimitsUpdatedEvent as RateLimitsUpdatedEvent,
type RealtimeAudioConfig as RealtimeAudioConfig,
+ type RealtimeAudioConfigInput as RealtimeAudioConfigInput,
+ type RealtimeAudioConfigOutput as RealtimeAudioConfigOutput,
+ type RealtimeAudioFormats as RealtimeAudioFormats,
+ type RealtimeAudioInputTurnDetection as RealtimeAudioInputTurnDetection,
type RealtimeClientEvent as RealtimeClientEvent,
- type RealtimeClientSecretConfig as RealtimeClientSecretConfig,
type RealtimeConversationItemAssistantMessage as RealtimeConversationItemAssistantMessage,
type RealtimeConversationItemFunctionCall as RealtimeConversationItemFunctionCall,
type RealtimeConversationItemFunctionCallOutput as RealtimeConversationItemFunctionCallOutput,
@@ -4304,6 +4425,9 @@ export declare namespace Realtime {
type RealtimeMcpToolExecutionError as RealtimeMcpToolExecutionError,
type RealtimeMcphttpError as RealtimeMcphttpError,
type RealtimeResponse as RealtimeResponse,
+ type RealtimeResponseCreateAudioOutput as RealtimeResponseCreateAudioOutput,
+ type RealtimeResponseCreateMcpTool as RealtimeResponseCreateMcpTool,
+ type RealtimeResponseCreateParams as RealtimeResponseCreateParams,
type RealtimeResponseStatus as RealtimeResponseStatus,
type RealtimeResponseUsage as RealtimeResponseUsage,
type RealtimeResponseUsageInputTokenDetails as RealtimeResponseUsageInputTokenDetails,
@@ -4315,8 +4439,12 @@ export declare namespace Realtime {
type RealtimeToolsConfig as RealtimeToolsConfig,
type RealtimeToolsConfigUnion as RealtimeToolsConfigUnion,
type RealtimeTracingConfig as RealtimeTracingConfig,
+ type RealtimeTranscriptionSessionAudio as RealtimeTranscriptionSessionAudio,
+ type RealtimeTranscriptionSessionAudioInput as RealtimeTranscriptionSessionAudioInput,
+ type RealtimeTranscriptionSessionAudioInputTurnDetection as RealtimeTranscriptionSessionAudioInputTurnDetection,
type RealtimeTranscriptionSessionCreateRequest as RealtimeTranscriptionSessionCreateRequest,
type RealtimeTruncation as RealtimeTruncation,
+ type RealtimeTruncationRetentionRatio as RealtimeTruncationRetentionRatio,
type ResponseAudioDeltaEvent as ResponseAudioDeltaEvent,
type ResponseAudioDoneEvent as ResponseAudioDoneEvent,
type ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent,
@@ -4348,7 +4476,12 @@ export declare namespace Realtime {
export {
ClientSecrets as ClientSecrets,
+ type RealtimeSessionClientSecret as RealtimeSessionClientSecret,
type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse,
+ type RealtimeTranscriptionSessionClientSecret as RealtimeTranscriptionSessionClientSecret,
+ type RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse,
+ type RealtimeTranscriptionSessionInputAudioTranscription as RealtimeTranscriptionSessionInputAudioTranscription,
+ type RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection,
type ClientSecretCreateResponse as ClientSecretCreateResponse,
type ClientSecretCreateParams as ClientSecretCreateParams,
};
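
The hunks above replace the `session` reference to `RealtimeTranscriptionSessionCreateRequest` with an inline `TranscriptionSessionUpdate.Session` shape. As a rough, hedged illustration (not part of this diff), a client payload built against the new shape might look like the sketch below. The import path and the WebSocket plumbing are assumptions, and the `input_audio_transcription` values are only examples; the field names and literal unions themselves come from the declarations above.

import type { TranscriptionSessionUpdate } from 'openai/resources/realtime/realtime';

// Assumed: an already-open WebSocket connection to the Realtime API.
declare const ws: WebSocket;

const update: TranscriptionSessionUpdate = {
  type: 'transcription_session.update',
  session: {
    // Logprobs are the only currently documented extra include.
    include: ['item.input_audio_transcription.logprobs'],
    input_audio_format: 'pcm16',
    input_audio_noise_reduction: { type: 'near_field' },
    // AudioTranscription fields shown here (model, language) are assumed.
    input_audio_transcription: { model: 'gpt-4o-transcribe', language: 'en' },
    turn_detection: {
      type: 'server_vad',
      threshold: 0.5,           // higher values need louder audio to trigger VAD
      prefix_padding_ms: 300,   // audio kept from before detected speech
      silence_duration_ms: 500, // silence required before the turn is considered over
    },
  },
};

ws.send(JSON.stringify(update));
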
diff --git a/src/version.ts b/src/version.ts
index 51eae91f3..36168d9b4 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1 +1 @@
-export const VERSION = '5.19.1'; // x-release-please-version
+export const VERSION = '5.20.0'; // x-release-please-version
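
On the receiving side, `transcription_session.updated` now carries the `RealtimeTranscriptionSessionCreateResponse` shape from the client-secrets types. A similarly hedged sketch of handling it follows; the message-handling wrapper and import path are assumptions, and only the event type literal and the `session` field come from the realtime declarations earlier in this diff.

import type { TranscriptionSessionUpdatedEvent } from 'openai/resources/realtime/realtime';

// Assumed helper: invoked with each raw JSON message received over the WebSocket.
function handleServerMessage(raw: string): void {
  const event = JSON.parse(raw);
  if (event?.type === 'transcription_session.updated') {
    const updated = event as TranscriptionSessionUpdatedEvent;
    // `session` uses the create-response shape; its ephemeral client secret is
    // only present when the session was created via REST, not on WebSocket updates.
    console.log('transcription session updated:', updated.session);
  }
}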