diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index a0c8c3e19..7443ba284 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "5.16.0"
+ ".": "5.17.0"
}
diff --git a/.stats.yml b/.stats.yml
index 5ad90ac5a..ebe81d146 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
-configured_endpoints: 119
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-8517ffa1004e31ca2523d617629e64be6fe4f13403ddfd9db5b3be002656cbde.yml
-openapi_spec_hash: b64dd8c8b23082a7aa2a3e5c5fffd8bd
-config_hash: fe0ea26680ac2075a6cd66416aefe7db
+configured_endpoints: 118
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-356b4364203ff36d7724074cd04f6e684253bfcc3c9d969122d730aa7bc51b46.yml
+openapi_spec_hash: 4ab8e96f52699bc3d2b0c4432aa92af8
+config_hash: b854932c0ea24b400bdd64e4376936bd
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2dd01aa0c..c358929fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog
+## 5.17.0 (2025-09-02)
+
+Full Changelog: [v5.16.0...v5.17.0](https://github.com/openai/openai-node/compare/v5.16.0...v5.17.0)
+
+### Features
+
+* **api:** realtime API updates ([e817255](https://github.com/openai/openai-node/commit/e817255e6ff9e3ad6bd08b001644c335e0459537))
+
+
+### Chores
+
+* **internal:** update global Error reference ([e566ff3](https://github.com/openai/openai-node/commit/e566ff321642a100756224b75a67d44e262e5bea))
+
## 5.16.0 (2025-08-26)
Full Changelog: [v5.15.0...v5.16.0](https://github.com/openai/openai-node/compare/v5.15.0...v5.16.0)
diff --git a/README.md b/README.md
index 9864a4829..351d9c0dc 100644
--- a/README.md
+++ b/README.md
@@ -264,14 +264,14 @@ const { data: stream, request_id } = await openai.chat.completions
.withResponse();
```
-## Realtime API Beta
+## Realtime API
The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a `WebSocket` connection.
```ts
-import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket';
+import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';
-const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });
rt.on('response.text.delta', (event) => process.stdout.write(event.delta));
```
@@ -401,14 +401,14 @@ while (page.hasNextPage()) {
}
```
-## Realtime API Beta
+## Realtime API
The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a `WebSocket` connection.
```ts
-import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket';
+import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';
-const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });
rt.on('response.text.delta', (event) => process.stdout.write(event.delta));
```
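A minimal end-to-end sketch of the updated README usage (the import path, event names, and `session.update` fields mirror the diffs above; the prompt text and the `gpt-realtime` model choice are illustrative):

```ts
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });

rt.socket.addEventListener('open', () => {
  rt.send({
    type: 'session.update',
    session: { type: 'realtime', model: 'gpt-realtime', output_modalities: ['text'] },
  });
  rt.send({
    type: 'conversation.item.create',
    item: { type: 'message', role: 'user', content: [{ type: 'input_text', text: 'Say a couple paragraphs!' }] },
  });
  rt.send({ type: 'response.create' });
});

rt.on('error', (err) => console.error(err));
rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta));
rt.on('response.output_text.done', () => console.log());
rt.on('response.done', () => rt.close());
```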
diff --git a/api.md b/api.md
index d133b6162..e8a4c861d 100644
--- a/api.md
+++ b/api.md
@@ -381,6 +381,7 @@ Types:
- FineTuningJobCancelledWebhookEvent
- FineTuningJobFailedWebhookEvent
- FineTuningJobSucceededWebhookEvent
+- RealtimeCallIncomingWebhookEvent
- ResponseCancelledWebhookEvent
- ResponseCompletedWebhookEvent
- ResponseFailedWebhookEvent
@@ -751,6 +752,7 @@ Types:
- ToolChoiceMcp
- ToolChoiceOptions
- ToolChoiceTypes
+- WebSearchPreviewTool
- WebSearchTool
Methods:
@@ -770,6 +772,110 @@ Methods:
- client.responses.inputItems.list(responseID, { ...params }) -> ResponseItemsPage
+# Realtime
+
+Types:
+
+- ConversationCreatedEvent
+- ConversationItem
+- ConversationItemAdded
+- ConversationItemCreateEvent
+- ConversationItemCreatedEvent
+- ConversationItemDeleteEvent
+- ConversationItemDeletedEvent
+- ConversationItemDone
+- ConversationItemInputAudioTranscriptionCompletedEvent
+- ConversationItemInputAudioTranscriptionDeltaEvent
+- ConversationItemInputAudioTranscriptionFailedEvent
+- ConversationItemInputAudioTranscriptionSegment
+- ConversationItemRetrieveEvent
+- ConversationItemTruncateEvent
+- ConversationItemTruncatedEvent
+- ConversationItemWithReference
+- InputAudioBufferAppendEvent
+- InputAudioBufferClearEvent
+- InputAudioBufferClearedEvent
+- InputAudioBufferCommitEvent
+- InputAudioBufferCommittedEvent
+- InputAudioBufferSpeechStartedEvent
+- InputAudioBufferSpeechStoppedEvent
+- InputAudioBufferTimeoutTriggered
+- LogProbProperties
+- McpListToolsCompleted
+- McpListToolsFailed
+- McpListToolsInProgress
+- OutputAudioBufferClearEvent
+- RateLimitsUpdatedEvent
+- RealtimeAudioConfig
+- RealtimeClientEvent
+- RealtimeClientSecretConfig
+- RealtimeConversationItemAssistantMessage
+- RealtimeConversationItemFunctionCall
+- RealtimeConversationItemFunctionCallOutput
+- RealtimeConversationItemSystemMessage
+- RealtimeConversationItemUserMessage
+- RealtimeError
+- RealtimeErrorEvent
+- RealtimeMcpApprovalRequest
+- RealtimeMcpApprovalResponse
+- RealtimeMcpListTools
+- RealtimeMcpProtocolError
+- RealtimeMcpToolCall
+- RealtimeMcpToolExecutionError
+- RealtimeMcphttpError
+- RealtimeResponse
+- RealtimeResponseStatus
+- RealtimeResponseUsage
+- RealtimeResponseUsageInputTokenDetails
+- RealtimeResponseUsageOutputTokenDetails
+- RealtimeServerEvent
+- RealtimeSession
+- RealtimeSessionCreateRequest
+- RealtimeToolChoiceConfig
+- RealtimeToolsConfig
+- RealtimeToolsConfigUnion
+- RealtimeTracingConfig
+- RealtimeTranscriptionSessionCreateRequest
+- RealtimeTruncation
+- ResponseAudioDeltaEvent
+- ResponseAudioDoneEvent
+- ResponseAudioTranscriptDeltaEvent
+- ResponseAudioTranscriptDoneEvent
+- ResponseCancelEvent
+- ResponseContentPartAddedEvent
+- ResponseContentPartDoneEvent
+- ResponseCreateEvent
+- ResponseCreatedEvent
+- ResponseDoneEvent
+- ResponseFunctionCallArgumentsDeltaEvent
+- ResponseFunctionCallArgumentsDoneEvent
+- ResponseMcpCallArgumentsDelta
+- ResponseMcpCallArgumentsDone
+- ResponseMcpCallCompleted
+- ResponseMcpCallFailed
+- ResponseMcpCallInProgress
+- ResponseOutputItemAddedEvent
+- ResponseOutputItemDoneEvent
+- ResponseTextDeltaEvent
+- ResponseTextDoneEvent
+- SessionCreatedEvent
+- SessionUpdateEvent
+- SessionUpdatedEvent
+- TranscriptionSessionCreated
+- TranscriptionSessionUpdate
+- TranscriptionSessionUpdatedEvent
+
+## ClientSecrets
+
+Types:
+
+- RealtimeSessionCreateResponse
+- ClientSecretCreateResponse
+
+Methods:
+
+- client.realtime.clientSecrets.create({ ...params }) -> ClientSecretCreateResponse
+
# Conversations
Types:
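A hedged sketch of the new `client.realtime.clientSecrets.create` method listed above; the exact session fields accepted by `RealtimeSessionCreateRequest` are assumed here:

```ts
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  // Mint an ephemeral client secret for a realtime session (field values illustrative).
  const secret = await client.realtime.clientSecrets.create({
    session: { type: 'realtime', model: 'gpt-realtime' },
  });
  console.log(secret.value, secret.expires_at);
}

main();
```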
diff --git a/examples/azure/realtime/websocket.ts b/examples/azure/realtime/websocket.ts
index 91fe3b7b9..146f7f94e 100644
--- a/examples/azure/realtime/websocket.ts
+++ b/examples/azure/realtime/websocket.ts
@@ -1,4 +1,4 @@
-import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket';
+import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';
import { AzureOpenAI } from 'openai';
import { DefaultAzureCredential, getBearerTokenProvider } from '@azure/identity';
import 'dotenv/config';
@@ -21,8 +21,9 @@ async function main() {
rt.send({
type: 'session.update',
session: {
- modalities: ['text'],
+ output_modalities: ['text'],
model: 'gpt-4o-realtime-preview',
+ type: 'realtime',
},
});
@@ -49,8 +50,8 @@ async function main() {
console.log();
});
- rt.on('response.text.delta', (event) => process.stdout.write(event.delta));
- rt.on('response.text.done', () => console.log());
+ rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta));
+ rt.on('response.output_text.done', () => console.log());
rt.on('response.done', () => rt.close());
diff --git a/examples/azure/realtime/ws.ts b/examples/azure/realtime/ws.ts
index 8b22aeef0..83f8c6297 100644
--- a/examples/azure/realtime/ws.ts
+++ b/examples/azure/realtime/ws.ts
@@ -1,5 +1,5 @@
import { DefaultAzureCredential, getBearerTokenProvider } from '@azure/identity';
-import { OpenAIRealtimeWS } from 'openai/beta/realtime/ws';
+import { OpenAIRealtimeWS } from 'openai/realtime/ws';
import { AzureOpenAI } from 'openai';
import 'dotenv/config';
@@ -21,8 +21,9 @@ async function main() {
rt.send({
type: 'session.update',
session: {
- modalities: ['text'],
+ output_modalities: ['text'],
model: 'gpt-4o-realtime-preview',
+ type: 'realtime',
},
});
@@ -49,8 +50,8 @@ async function main() {
console.log();
});
- rt.on('response.text.delta', (event) => process.stdout.write(event.delta));
- rt.on('response.text.done', () => console.log());
+ rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta));
+ rt.on('response.output_text.done', () => console.log());
rt.on('response.done', () => rt.close());
diff --git a/examples/realtime/websocket.ts b/examples/realtime/websocket.ts
index 6fb4740af..bf61db9ac 100644
--- a/examples/realtime/websocket.ts
+++ b/examples/realtime/websocket.ts
@@ -1,7 +1,7 @@
-import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket';
+import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';
async function main() {
- const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+ const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });
// access the underlying `ws.WebSocket` instance
rt.socket.addEventListener('open', () => {
@@ -9,8 +9,9 @@ async function main() {
rt.send({
type: 'session.update',
session: {
- modalities: ['text'],
+ output_modalities: ['text'],
model: 'gpt-4o-realtime-preview',
+ type: 'realtime',
},
});
@@ -37,8 +38,8 @@ async function main() {
console.log();
});
- rt.on('response.text.delta', (event) => process.stdout.write(event.delta));
- rt.on('response.text.done', () => console.log());
+ rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta));
+ rt.on('response.output_text.done', () => console.log());
rt.on('response.done', () => rt.close());
diff --git a/examples/realtime/ws.ts b/examples/realtime/ws.ts
index 6cc950b76..ba22e262a 100644
--- a/examples/realtime/ws.ts
+++ b/examples/realtime/ws.ts
@@ -1,7 +1,7 @@
-import { OpenAIRealtimeWS } from 'openai/beta/realtime/ws';
+import { OpenAIRealtimeWS } from 'openai/realtime/ws';
async function main() {
- const rt = new OpenAIRealtimeWS({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+ const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' });
// access the underlying `ws.WebSocket` instance
rt.socket.on('open', () => {
@@ -9,8 +9,9 @@ async function main() {
rt.send({
type: 'session.update',
session: {
- modalities: ['text'],
+ output_modalities: ['text'],
model: 'gpt-4o-realtime-preview',
+ type: 'realtime',
},
});
@@ -37,8 +38,8 @@ async function main() {
console.log();
});
- rt.on('response.text.delta', (event) => process.stdout.write(event.delta));
- rt.on('response.text.done', () => console.log());
+ rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta));
+ rt.on('response.output_text.done', () => console.log());
rt.on('response.done', () => rt.close());
diff --git a/jsr.json b/jsr.json
index 2996d8f66..cf46e84e3 100644
--- a/jsr.json
+++ b/jsr.json
@@ -1,6 +1,6 @@
{
"name": "@openai/openai",
- "version": "5.16.0",
+ "version": "5.17.0",
"exports": {
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
diff --git a/jsr.json.orig b/jsr.json.orig
index 3e7c40d5f..30eac2430 100644
--- a/jsr.json.orig
+++ b/jsr.json.orig
@@ -5,6 +5,7 @@
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
"./beta/realtime/websocket": "./beta/realtime/websocket.ts"
+ "./realtime/websocket": "./realtime/websocket.ts"
},
"imports": {
"zod": "npm:zod@3"
diff --git a/package.json b/package.json
index b3a4f4685..ccff023c1 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "openai",
- "version": "5.16.0",
+ "version": "5.17.0",
"description": "The official TypeScript library for the OpenAI API",
"author": "OpenAI ",
"types": "dist/index.d.ts",
diff --git a/realtime.md b/realtime.md
index 9842ad453..1f47600e4 100644
--- a/realtime.md
+++ b/realtime.md
@@ -1,4 +1,4 @@
-## Realtime API beta
+## Realtime API
The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a `WebSocket` connection.
@@ -10,9 +10,9 @@ Basic text based example with `ws`:
```ts
// requires `yarn add ws @types/ws`
-import { OpenAIRealtimeWS } from 'openai/beta/realtime/ws';
+import { OpenAIRealtimeWS } from 'openai/realtime/ws';
-const rt = new OpenAIRealtimeWS({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' });
// access the underlying `ws.WebSocket` instance
rt.socket.on('open', () => {
@@ -59,9 +59,9 @@ rt.socket.on('close', () => console.log('\nConnection closed!'));
To use the web API `WebSocket` implementation, replace `OpenAIRealtimeWS` with `OpenAIRealtimeWebSocket` and adjust any `rt.socket` access:
```ts
-import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket';
+import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';
-const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });
// ...
rt.socket.addEventListener('open', () => {
// ...
@@ -77,7 +77,7 @@ When an error is encountered, either on the client side or returned from the ser
It is **highly recommended** that you register an `error` event listener and handle errors appropriately as typically the underlying connection is still usable.
```ts
-const rt = new OpenAIRealtimeWS({ model: 'gpt-4o-realtime-preview-2024-12-17' });
+const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' });
rt.on('error', (err) => {
// in a real world scenario this should be logged somewhere as you
// likely want to continue processing events regardless of any errors
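As a companion to the snippet above, the `error` callback receives an `OpenAIRealtimeError` (see `src/realtime/internal-base.ts` later in this diff), so the server-sent error payload is available on it; a hedged sketch:

```ts
import { OpenAIRealtimeWS } from 'openai/realtime/ws';

const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' });

rt.on('error', (err) => {
  // err is an OpenAIRealtimeError: err.error carries the server-sent error
  // payload (code, message, param) and err.event_id the originating event id.
  console.error(err.message, err.error?.code, err.event_id);
});
```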
diff --git a/scripts/detect-breaking-changes b/scripts/detect-breaking-changes
index 9f5a00452..85607de43 100755
--- a/scripts/detect-breaking-changes
+++ b/scripts/detect-breaking-changes
@@ -44,6 +44,8 @@ TEST_PATHS=(
tests/api-resources/uploads/parts.test.ts
tests/api-resources/responses/responses.test.ts
tests/api-resources/responses/input-items.test.ts
+ tests/api-resources/realtime/realtime.test.ts
+ tests/api-resources/realtime/client-secrets.test.ts
tests/api-resources/conversations/conversations.test.ts
tests/api-resources/conversations/items.test.ts
tests/api-resources/evals/evals.test.ts
diff --git a/src/client.ts b/src/client.ts
index 78e29664a..a853d2890 100644
--- a/src/client.ts
+++ b/src/client.ts
@@ -121,6 +121,7 @@ import {
} from './resources/evals/evals';
import { FineTuning } from './resources/fine-tuning/fine-tuning';
import { Graders } from './resources/graders/graders';
+import { Realtime } from './resources/realtime/realtime';
import { Responses } from './resources/responses/responses';
import {
Upload,
@@ -574,7 +575,7 @@ export class OpenAI {
const response = await this.fetchWithTimeout(url, req, timeout, controller).catch(castToError);
const headersTime = Date.now();
- if (response instanceof Error) {
+ if (response instanceof globalThis.Error) {
const retryMessage = `retrying, ${retriesRemaining} attempts remaining`;
if (options.signal?.aborted) {
throw new Errors.APIUserAbortError();
@@ -962,6 +963,7 @@ export class OpenAI {
batches: API.Batches = new API.Batches(this);
uploads: API.Uploads = new API.Uploads(this);
responses: API.Responses = new API.Responses(this);
+ realtime: API.Realtime = new API.Realtime(this);
conversations: API.Conversations = new API.Conversations(this);
evals: API.Evals = new API.Evals(this);
containers: API.Containers = new API.Containers(this);
@@ -983,6 +985,7 @@ OpenAI.Beta = Beta;
OpenAI.Batches = Batches;
OpenAI.Uploads = UploadsAPIUploads;
OpenAI.Responses = Responses;
+OpenAI.Realtime = Realtime;
OpenAI.Conversations = Conversations;
OpenAI.Evals = Evals;
OpenAI.Containers = Containers;
@@ -1165,6 +1168,8 @@ export declare namespace OpenAI {
export { Responses as Responses };
+ export { Realtime as Realtime };
+
export { Conversations as Conversations };
export {
diff --git a/src/realtime/index.ts b/src/realtime/index.ts
new file mode 100644
index 000000000..75f0f3088
--- /dev/null
+++ b/src/realtime/index.ts
@@ -0,0 +1 @@
+export { OpenAIRealtimeError } from './internal-base';
diff --git a/src/realtime/internal-base.ts b/src/realtime/internal-base.ts
new file mode 100644
index 000000000..92cc1d1c6
--- /dev/null
+++ b/src/realtime/internal-base.ts
@@ -0,0 +1,98 @@
+import {
+ RealtimeClientEvent,
+ RealtimeServerEvent,
+ RealtimeErrorEvent,
+ RealtimeError,
+} from '../resources/realtime/realtime';
+import { EventEmitter } from '../lib/EventEmitter';
+import { OpenAIError } from '../error';
+import OpenAI, { AzureOpenAI } from '../index';
+
+export class OpenAIRealtimeError extends OpenAIError {
+ /**
+ * The error data that the API sent back in an `error` event.
+ */
+ error?: RealtimeError | undefined;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id?: string | undefined;
+
+ constructor(message: string, event: RealtimeErrorEvent | null) {
+ super(message);
+
+ this.error = event?.error;
+ this.event_id = event?.event_id;
+ }
+}
+
+type Simplify<T> = { [KeyType in keyof T]: T[KeyType] } & {};
+
+type RealtimeEvents = Simplify<
+ {
+ event: (event: RealtimeServerEvent) => void;
+ error: (error: OpenAIRealtimeError) => void;
+ } & {
+    [EventType in Exclude<RealtimeServerEvent['type'], 'error'>]: (
+      event: Extract<RealtimeServerEvent, { type: EventType }>,
+ ) => unknown;
+ }
+>;
+
+export abstract class OpenAIRealtimeEmitter extends EventEmitter<RealtimeEvents> {
+ /**
+ * Send an event to the API.
+ */
+ abstract send(event: RealtimeClientEvent): void;
+
+ /**
+ * Close the websocket connection.
+ */
+ abstract close(props?: { code: number; reason: string }): void;
+
+ protected _onError(event: null, message: string, cause: any): void;
+ protected _onError(event: RealtimeErrorEvent, message?: string | undefined): void;
+ protected _onError(event: RealtimeErrorEvent | null, message?: string | undefined, cause?: any): void {
+ message =
+ event?.error ?
+ `${event.error.message} code=${event.error.code} param=${event.error.param} type=${event.error.type} event_id=${event.error.event_id}`
+ : message ?? 'unknown error';
+
+ if (!this._hasListener('error')) {
+ const error = new OpenAIRealtimeError(
+ message +
+ `\n\nTo resolve these unhandled rejection errors you should bind an \`error\` callback, e.g. \`rt.on('error', (error) => ...)\` `,
+ event,
+ );
+ // @ts-ignore
+ error.cause = cause;
+ Promise.reject(error);
+ return;
+ }
+
+ const error = new OpenAIRealtimeError(message, event);
+ // @ts-ignore
+ error.cause = cause;
+
+ this._emit('error', error);
+ }
+}
+
+export function isAzure(client: Pick<OpenAI, 'apiKey' | 'baseURL'>): client is AzureOpenAI {
+ return client instanceof AzureOpenAI;
+}
+
+export function buildRealtimeURL(client: Pick<OpenAI, 'apiKey' | 'baseURL'>, model: string): URL {
+ const path = '/realtime';
+ const baseURL = client.baseURL;
+ const url = new URL(baseURL + (baseURL.endsWith('/') ? path.slice(1) : path));
+ url.protocol = 'wss';
+ if (isAzure(client)) {
+ url.searchParams.set('api-version', client.apiVersion);
+ url.searchParams.set('deployment', model);
+ } else {
+ url.searchParams.set('model', model);
+ }
+ return url;
+}
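For illustration, a standalone sketch of the URL shape `buildRealtimeURL` produces for a non-Azure client, assuming the default `https://api.openai.com/v1` base URL:

```ts
// Mirrors the logic above: append `/realtime`, switch to the wss scheme, add the model.
const baseURL = 'https://api.openai.com/v1';
const url = new URL(baseURL + (baseURL.endsWith('/') ? 'realtime' : '/realtime'));
url.protocol = 'wss';
url.searchParams.set('model', 'gpt-realtime');
console.log(url.toString()); // wss://api.openai.com/v1/realtime?model=gpt-realtime
```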
diff --git a/src/realtime/websocket.ts b/src/realtime/websocket.ts
new file mode 100644
index 000000000..c83b2cf05
--- /dev/null
+++ b/src/realtime/websocket.ts
@@ -0,0 +1,142 @@
+import { AzureOpenAI, OpenAI } from '../index';
+import { OpenAIError } from '../error';
+import type { RealtimeClientEvent, RealtimeServerEvent } from '../resources/realtime/realtime';
+import { OpenAIRealtimeEmitter, buildRealtimeURL, isAzure } from './internal-base';
+import { isRunningInBrowser } from '../internal/detect-platform';
+
+interface MessageEvent {
+ data: string;
+}
+
+type _WebSocket =
+ typeof globalThis extends (
+ {
+ WebSocket: infer ws extends abstract new (...args: any) => any;
+ }
+ ) ?
+ // @ts-ignore
+    InstanceType<ws>
+ : any;
+
+export class OpenAIRealtimeWebSocket extends OpenAIRealtimeEmitter {
+ url: URL;
+ socket: _WebSocket;
+
+ constructor(
+ props: {
+ model: string;
+ dangerouslyAllowBrowser?: boolean;
+ /**
+ * Callback to mutate the URL, needed for Azure.
+ * @internal
+ */
+ onURL?: (url: URL) => void;
+ },
+    client?: Pick<OpenAI, 'apiKey' | 'baseURL'>,
+ ) {
+ super();
+
+ const dangerouslyAllowBrowser =
+ props.dangerouslyAllowBrowser ??
+ (client as any)?._options?.dangerouslyAllowBrowser ??
+ (client?.apiKey.startsWith('ek_') ? true : null);
+
+ if (!dangerouslyAllowBrowser && isRunningInBrowser()) {
+ throw new OpenAIError(
+ "It looks like you're running in a browser-like environment.\n\nThis is disabled by default, as it risks exposing your secret API credentials to attackers.\n\nYou can avoid this error by creating an ephemeral session token:\nhttps://platform.openai.com/docs/api-reference/realtime-sessions\n",
+ );
+ }
+
+ client ??= new OpenAI({ dangerouslyAllowBrowser });
+
+ this.url = buildRealtimeURL(client, props.model);
+ props.onURL?.(this.url);
+
+ // @ts-ignore
+ this.socket = new WebSocket(this.url.toString(), [
+ 'realtime',
+ ...(isAzure(client) ? [] : [`openai-insecure-api-key.${client.apiKey}`]),
+ ]);
+
+ this.socket.addEventListener('message', (websocketEvent: MessageEvent) => {
+ const event = (() => {
+ try {
+ return JSON.parse(websocketEvent.data.toString()) as RealtimeServerEvent;
+ } catch (err) {
+ this._onError(null, 'could not parse websocket event', err);
+ return null;
+ }
+ })();
+
+ if (event) {
+ this._emit('event', event);
+
+ if (event.type === 'error') {
+ this._onError(event);
+ } else {
+ // @ts-expect-error TS isn't smart enough to get the relationship right here
+ this._emit(event.type, event);
+ }
+ }
+ });
+
+ this.socket.addEventListener('error', (event: any) => {
+ this._onError(null, event.message, null);
+ });
+
+ if (isAzure(client)) {
+ if (this.url.searchParams.get('Authorization') !== null) {
+        this.url.searchParams.set('Authorization', '<REDACTED>');
+ } else {
+        this.url.searchParams.set('api-key', '<REDACTED>');
+ }
+ }
+ }
+
+ static async azure(
+    client: Pick<AzureOpenAI, '_getAzureADToken' | 'apiVersion' | 'apiKey' | 'baseURL' | 'deploymentName'>,
+ options: { deploymentName?: string; dangerouslyAllowBrowser?: boolean } = {},
+  ): Promise<OpenAIRealtimeWebSocket> {
+ const token = await client._getAzureADToken();
+ function onURL(url: URL) {
+ if (client.apiKey !== '') {
+ url.searchParams.set('api-key', client.apiKey);
+ } else {
+ if (token) {
+ url.searchParams.set('Authorization', `Bearer ${token}`);
+ } else {
+ throw new Error('AzureOpenAI is not instantiated correctly. No API key or token provided.');
+ }
+ }
+ }
+ const deploymentName = options.deploymentName ?? client.deploymentName;
+ if (!deploymentName) {
+ throw new Error('No deployment name provided');
+ }
+ const { dangerouslyAllowBrowser } = options;
+ return new OpenAIRealtimeWebSocket(
+ {
+ model: deploymentName,
+ onURL,
+ ...(dangerouslyAllowBrowser ? { dangerouslyAllowBrowser } : {}),
+ },
+ client,
+ );
+ }
+
+ send(event: RealtimeClientEvent) {
+ try {
+ this.socket.send(JSON.stringify(event));
+ } catch (err) {
+ this._onError(null, 'could not send data', err);
+ }
+ }
+
+ close(props?: { code: number; reason: string }) {
+ try {
+ this.socket.close(props?.code ?? 1000, props?.reason ?? 'OK');
+ } catch (err) {
+ this._onError(null, 'could not close the connection', err);
+ }
+ }
+}
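A hedged browser-usage sketch: the constructor above relaxes the browser check automatically for keys beginning with `ek_` (ephemeral client secrets); the key value and model are placeholders:

```ts
import OpenAI from 'openai';
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

// `ek_...` stands in for an ephemeral client secret minted server-side via
// client.realtime.clientSecrets.create(); never ship a standard API key to the browser.
const client = new OpenAI({ apiKey: 'ek_...', dangerouslyAllowBrowser: true });
const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' }, client);

rt.on('error', (err) => console.error(err));
```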
diff --git a/src/realtime/ws.ts b/src/realtime/ws.ts
new file mode 100644
index 000000000..5226d6601
--- /dev/null
+++ b/src/realtime/ws.ts
@@ -0,0 +1,95 @@
+import * as WS from 'ws';
+import { AzureOpenAI, OpenAI } from '../index';
+import type { RealtimeClientEvent, RealtimeServerEvent } from '../resources/realtime/realtime';
+import { OpenAIRealtimeEmitter, buildRealtimeURL, isAzure } from './internal-base';
+
+export class OpenAIRealtimeWS extends OpenAIRealtimeEmitter {
+ url: URL;
+ socket: WS.WebSocket;
+
+ constructor(
+ props: { model: string; options?: WS.ClientOptions | undefined },
+    client?: Pick<OpenAI, 'apiKey' | 'baseURL'>,
+ ) {
+ super();
+ client ??= new OpenAI();
+
+ this.url = buildRealtimeURL(client, props.model);
+ this.socket = new WS.WebSocket(this.url, {
+ ...props.options,
+ headers: {
+ ...props.options?.headers,
+ ...(isAzure(client) ? {} : { Authorization: `Bearer ${client.apiKey}` }),
+ },
+ });
+
+ this.socket.on('message', (wsEvent) => {
+ const event = (() => {
+ try {
+ return JSON.parse(wsEvent.toString()) as RealtimeServerEvent;
+ } catch (err) {
+ this._onError(null, 'could not parse websocket event', err);
+ return null;
+ }
+ })();
+
+ if (event) {
+ this._emit('event', event);
+
+ if (event.type === 'error') {
+ this._onError(event);
+ } else {
+ // @ts-expect-error TS isn't smart enough to get the relationship right here
+ this._emit(event.type, event);
+ }
+ }
+ });
+
+ this.socket.on('error', (err) => {
+ this._onError(null, err.message, err);
+ });
+ }
+
+ static async azure(
+    client: Pick<AzureOpenAI, '_getAzureADToken' | 'apiVersion' | 'apiKey' | 'baseURL' | 'deploymentName'>,
+ options: { deploymentName?: string; options?: WS.ClientOptions | undefined } = {},
+  ): Promise<OpenAIRealtimeWS> {
+ const deploymentName = options.deploymentName ?? client.deploymentName;
+ if (!deploymentName) {
+ throw new Error('No deployment name provided');
+ }
+ return new OpenAIRealtimeWS(
+ { model: deploymentName, options: { headers: await getAzureHeaders(client) } },
+ client,
+ );
+ }
+
+ send(event: RealtimeClientEvent) {
+ try {
+ this.socket.send(JSON.stringify(event));
+ } catch (err) {
+ this._onError(null, 'could not send data', err);
+ }
+ }
+
+ close(props?: { code: number; reason: string }) {
+ try {
+ this.socket.close(props?.code ?? 1000, props?.reason ?? 'OK');
+ } catch (err) {
+ this._onError(null, 'could not close the connection', err);
+ }
+ }
+}
+
+async function getAzureHeaders(client: Pick<AzureOpenAI, 'apiKey' | '_getAzureADToken'>) {
+ if (client.apiKey !== '') {
+ return { 'api-key': client.apiKey };
+ } else {
+ const token = await client._getAzureADToken();
+ if (token) {
+ return { Authorization: `Bearer ${token}` };
+ } else {
+ throw new Error('AzureOpenAI is not instantiated correctly. No API key or token provided.');
+ }
+ }
+}
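A hedged Azure sketch using the static helper above; the credential setup follows the `examples/azure/realtime/ws.ts` pattern, and the deployment name and API version are placeholders:

```ts
import { DefaultAzureCredential, getBearerTokenProvider } from '@azure/identity';
import { AzureOpenAI } from 'openai';
import { OpenAIRealtimeWS } from 'openai/realtime/ws';

async function main() {
  const cred = new DefaultAzureCredential();
  const scope = 'https://cognitiveservices.azure.com/.default';
  const client = new AzureOpenAI({
    azureADTokenProvider: getBearerTokenProvider(cred, scope),
    apiVersion: '2024-10-01-preview', // placeholder API version
  });
  // Resolves the deployment from options or the client and wires the auth headers.
  const rt = await OpenAIRealtimeWS.azure(client, { deploymentName: 'gpt-realtime' });
  rt.on('error', (err) => console.error(err));
}

main();
```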
diff --git a/src/resources/audio/speech.ts b/src/resources/audio/speech.ts
index f533a558b..e68e806e0 100644
--- a/src/resources/audio/speech.ts
+++ b/src/resources/audio/speech.ts
@@ -51,7 +51,18 @@ export interface SpeechCreateParams {
* `verse`. Previews of the voices are available in the
* [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
*/
- voice: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+ voice:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
/**
* Control the voice of your generated audio with additional instructions. Does not
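A small hedged example exercising one of the newly listed voices; whether a given TTS model accepts `marin`/`cedar` server-side is an assumption:

```ts
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  const speech = await client.audio.speech.create({
    model: 'gpt-4o-mini-tts',
    voice: 'marin', // newly added to the voice union above
    input: 'Hello from the realtime release!',
  });
  console.log('audio bytes:', (await speech.arrayBuffer()).byteLength);
}

main();
```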
diff --git a/src/resources/beta/realtime/realtime.ts b/src/resources/beta/realtime/realtime.ts
index 4635c6762..b7fe85dc0 100644
--- a/src/resources/beta/realtime/realtime.ts
+++ b/src/resources/beta/realtime/realtime.ts
@@ -17,6 +17,9 @@ import {
TranscriptionSessions,
} from './transcription-sessions';
+/**
+ * @deprecated Realtime has now launched and is generally available. The old beta API is now deprecated.
+ */
export class Realtime extends APIResource {
sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client);
transcriptionSessions: TranscriptionSessionsAPI.TranscriptionSessions =
diff --git a/src/resources/chat/completions/completions.ts b/src/resources/chat/completions/completions.ts
index a71e574e9..17269f25b 100644
--- a/src/resources/chat/completions/completions.ts
+++ b/src/resources/chat/completions/completions.ts
@@ -489,7 +489,18 @@ export interface ChatCompletionAudioParam {
* The voice the model uses to respond. Supported voices are `alloy`, `ash`,
* `ballad`, `coral`, `echo`, `fable`, `nova`, `onyx`, `sage`, and `shimmer`.
*/
- voice: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+ voice:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
}
/**
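Similarly for Chat Completions audio output, a hedged sketch using the expanded union; the model name and server-side acceptance of `cedar` are assumptions:

```ts
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  const completion = await client.chat.completions.create({
    model: 'gpt-4o-audio-preview',
    modalities: ['text', 'audio'],
    audio: { voice: 'cedar', format: 'wav' }, // `cedar` newly allowed by the type above
    messages: [{ role: 'user', content: 'Say hello.' }],
  });
  console.log(completion.choices[0]?.message.audio?.id);
}

main();
```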
diff --git a/src/resources/index.ts b/src/resources/index.ts
index 129b1cbd0..fbbc0e3bb 100644
--- a/src/resources/index.ts
+++ b/src/resources/index.ts
@@ -95,6 +95,7 @@ export {
type ModerationCreateResponse,
type ModerationCreateParams,
} from './moderations';
+export { Realtime } from './realtime/realtime';
export { Responses } from './responses/responses';
export { Uploads, type Upload, type UploadCreateParams, type UploadCompleteParams } from './uploads/uploads';
export {
diff --git a/src/resources/realtime.ts b/src/resources/realtime.ts
new file mode 100644
index 000000000..1c5df27d9
--- /dev/null
+++ b/src/resources/realtime.ts
@@ -0,0 +1,3 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+export * from './realtime/index';
diff --git a/src/resources/realtime/client-secrets.ts b/src/resources/realtime/client-secrets.ts
new file mode 100644
index 000000000..c48fe8243
--- /dev/null
+++ b/src/resources/realtime/client-secrets.ts
@@ -0,0 +1,470 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import { APIResource } from '../../core/resource';
+import * as RealtimeAPI from './realtime';
+import { APIPromise } from '../../core/api-promise';
+import { RequestOptions } from '../../internal/request-options';
+
+export class ClientSecrets extends APIResource {
+ /**
+ * Create a Realtime session and client secret for either realtime or
+ * transcription.
+ */
+  create(body: ClientSecretCreateParams, options?: RequestOptions): APIPromise<ClientSecretCreateResponse> {
+ return this._client.post('/realtime/client_secrets', { body, ...options });
+ }
+}
+
+/**
+ * A Realtime session configuration object.
+ */
+export interface RealtimeSessionCreateResponse {
+ /**
+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+ */
+ id?: string;
+
+ /**
+ * Configuration for input and output audio for the session.
+ */
+ audio?: RealtimeSessionCreateResponse.Audio;
+
+ /**
+ * Expiration timestamp for the session, in seconds since epoch.
+ */
+ expires_at?: number;
+
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format, (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_output_tokens?: number | 'inf';
+
+ /**
+ * The Realtime model used for this session.
+ */
+ model?: string;
+
+ /**
+ * The object type. Always `realtime.session`.
+ */
+ object?: string;
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ output_modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+  tools?: Array<RealtimeSessionCreateResponse.Tool>;
+
+ /**
+ * Configuration options for tracing. Set to null to disable tracing. Once tracing
+ * is enabled for a session, the configuration cannot be modified.
+ *
+ * `auto` will create a trace for the session with default values for the workflow
+ * name, group id, and metadata.
+ */
+ tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration;
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: RealtimeSessionCreateResponse.TurnDetection;
+}
+
+export namespace RealtimeSessionCreateResponse {
+ /**
+ * Configuration for input and output audio for the session.
+ */
+ export interface Audio {
+ input?: Audio.Input;
+
+ output?: Audio.Output;
+ }
+
+ export namespace Audio {
+ export interface Input {
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ format?: string;
+
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ noise_reduction?: Input.NoiseReduction;
+
+ /**
+ * Configuration for input audio transcription.
+ */
+ transcription?: Input.Transcription;
+
+ /**
+ * Configuration for turn detection.
+ */
+ turn_detection?: Input.TurnDetection;
+ }
+
+ export namespace Input {
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ export interface NoiseReduction {
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration for input audio transcription.
+ */
+ export interface Transcription {
+ /**
+ * The language of the input audio.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription.
+ */
+ model?: string;
+
+ /**
+ * Optional text to guide the model's style or continue a previous audio segment.
+ */
+ prompt?: string;
+ }
+
+ /**
+ * Configuration for turn detection.
+ */
+ export interface TurnDetection {
+ prefix_padding_ms?: number;
+
+ silence_duration_ms?: number;
+
+ threshold?: number;
+
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+ }
+
+ export interface Output {
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ format?: string;
+
+ speed?: number;
+
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+ }
+ }
+
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Granular configuration for tracing.
+ */
+ export interface TracingConfiguration {
+ /**
+ * The group id to attach to this trace to enable filtering and grouping in the
+ * traces dashboard.
+ */
+ group_id?: string;
+
+ /**
+ * The arbitrary metadata to attach to this trace to enable filtering in the traces
+ * dashboard.
+ */
+ metadata?: unknown;
+
+ /**
+ * The name of the workflow to attach to this trace. This is used to name the trace
+ * in the traces dashboard.
+ */
+ workflow_name?: string;
+ }
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+}
+
+/**
+ * Response from creating a session and client secret for the Realtime API.
+ */
+export interface ClientSecretCreateResponse {
+ /**
+ * Expiration timestamp for the client secret, in seconds since epoch.
+ */
+ expires_at: number;
+
+ /**
+ * The session configuration for either a realtime or transcription session.
+ */
+ session:
+ | RealtimeSessionCreateResponse
+ | ClientSecretCreateResponse.RealtimeTranscriptionSessionCreateResponse;
+
+ /**
+ * The generated client secret value.
+ */
+ value: string;
+}
+
+export namespace ClientSecretCreateResponse {
+ /**
+ * A Realtime transcription session configuration object.
+ */
+ export interface RealtimeTranscriptionSessionCreateResponse {
+ /**
+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+ */
+ id?: string;
+
+ /**
+ * Configuration for input audio for the session.
+ */
+ audio?: RealtimeTranscriptionSessionCreateResponse.Audio;
+
+ /**
+ * Expiration timestamp for the session, in seconds since epoch.
+ */
+ expires_at?: number;
+
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
+
+ /**
+ * The object type. Always `realtime.transcription_session`.
+ */
+ object?: string;
+ }
+
+ export namespace RealtimeTranscriptionSessionCreateResponse {
+ /**
+ * Configuration for input audio for the session.
+ */
+ export interface Audio {
+ input?: Audio.Input;
+ }
+
+ export namespace Audio {
+ export interface Input {
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ format?: string;
+
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ noise_reduction?: Input.NoiseReduction;
+
+ /**
+ * Configuration of the transcription model.
+ */
+ transcription?: Input.Transcription;
+
+ /**
+ * Configuration for turn detection.
+ */
+ turn_detection?: Input.TurnDetection;
+ }
+
+ export namespace Input {
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ export interface NoiseReduction {
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration of the transcription model.
+ */
+ export interface Transcription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription. Can be `gpt-4o-transcribe`,
+ * `gpt-4o-mini-transcribe`, or `whisper-1`.
+ */
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. The
+ * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ * should match the audio language.
+ */
+ prompt?: string;
+ }
+
+ /**
+ * Configuration for turn detection.
+ */
+ export interface TurnDetection {
+ prefix_padding_ms?: number;
+
+ silence_duration_ms?: number;
+
+ threshold?: number;
+
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+ }
+ }
+ }
+}
+
+export interface ClientSecretCreateParams {
+ /**
+ * Configuration for the ephemeral token expiration.
+ */
+ expires_after?: ClientSecretCreateParams.ExpiresAfter;
+
+ /**
+ * Session configuration to use for the client secret. Choose either a realtime
+ * session or a transcription session.
+ */
+ session?: RealtimeAPI.RealtimeSessionCreateRequest | RealtimeAPI.RealtimeTranscriptionSessionCreateRequest;
+}
+
+export namespace ClientSecretCreateParams {
+ /**
+ * Configuration for the ephemeral token expiration.
+ */
+ export interface ExpiresAfter {
+ /**
+ * The anchor point for the ephemeral token expiration. Only `created_at` is
+ * currently supported.
+ */
+ anchor?: 'created_at';
+
+ /**
+ * The number of seconds from the anchor point to the expiration. Select a value
+ * between `10` and `7200`.
+ */
+ seconds?: number;
+ }
+}
+
+export declare namespace ClientSecrets {
+ export {
+ type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse,
+ type ClientSecretCreateResponse as ClientSecretCreateResponse,
+ type ClientSecretCreateParams as ClientSecretCreateParams,
+ };
+}
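A hedged sketch tying the params and response types above together (`expires_after` bounds as documented; all values illustrative):

```ts
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  const created = await client.realtime.clientSecrets.create({
    expires_after: { anchor: 'created_at', seconds: 600 }, // 10 minutes, within 10..7200
    session: { type: 'realtime', model: 'gpt-realtime' },
  });
  // `created.session` is either a realtime or a transcription session config.
  console.log(created.value, created.expires_at, created.session);
}

main();
```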
diff --git a/src/resources/realtime/index.ts b/src/resources/realtime/index.ts
new file mode 100644
index 000000000..a6c5db35e
--- /dev/null
+++ b/src/resources/realtime/index.ts
@@ -0,0 +1,9 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+export {
+ ClientSecrets,
+ type RealtimeSessionCreateResponse,
+ type ClientSecretCreateResponse,
+ type ClientSecretCreateParams,
+} from './client-secrets';
+export { Realtime } from './realtime';
diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts
new file mode 100644
index 000000000..562b2d739
--- /dev/null
+++ b/src/resources/realtime/realtime.ts
@@ -0,0 +1,4351 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import { APIResource } from '../../core/resource';
+import * as RealtimeAPI from './realtime';
+import * as Shared from '../shared';
+import * as ClientSecretsAPI from './client-secrets';
+import {
+ ClientSecretCreateParams,
+ ClientSecretCreateResponse,
+ ClientSecrets,
+ RealtimeSessionCreateResponse,
+} from './client-secrets';
+import * as ResponsesAPI from '../responses/responses';
+
+export class Realtime extends APIResource {
+ clientSecrets: ClientSecretsAPI.ClientSecrets = new ClientSecretsAPI.ClientSecrets(this._client);
+}
+
+/**
+ * Returned when a conversation is created. Emitted right after session creation.
+ */
+export interface ConversationCreatedEvent {
+ /**
+ * The conversation resource.
+ */
+ conversation: ConversationCreatedEvent.Conversation;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The event type, must be `conversation.created`.
+ */
+ type: 'conversation.created';
+}
+
+export namespace ConversationCreatedEvent {
+ /**
+ * The conversation resource.
+ */
+ export interface Conversation {
+ /**
+ * The unique ID of the conversation.
+ */
+ id?: string;
+
+ /**
+ * The object type, must be `realtime.conversation`.
+ */
+ object?: 'realtime.conversation';
+ }
+}
+
+/**
+ * A single item within a Realtime conversation.
+ */
+export type ConversationItem =
+ | RealtimeConversationItemSystemMessage
+ | RealtimeConversationItemUserMessage
+ | RealtimeConversationItemAssistantMessage
+ | RealtimeConversationItemFunctionCall
+ | RealtimeConversationItemFunctionCallOutput
+ | RealtimeMcpApprovalResponse
+ | RealtimeMcpListTools
+ | RealtimeMcpToolCall
+ | RealtimeMcpApprovalRequest;
+
+/**
+ * Returned when a conversation item is added.
+ */
+export interface ConversationItemAdded {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The event type, must be `conversation.item.added`.
+ */
+ type: 'conversation.item.added';
+
+ /**
+ * The ID of the item that precedes this one, if any. This is used to maintain
+ * ordering when items are inserted.
+ */
+ previous_item_id?: string | null;
+}
+
+/**
+ * Add a new Item to the Conversation's context, including messages, function
+ * calls, and function call responses. This event can be used both to populate a
+ * "history" of the conversation and to add new items mid-stream, but has the
+ * current limitation that it cannot populate assistant audio messages.
+ *
+ * If successful, the server will respond with a `conversation.item.created` event,
+ * otherwise an `error` event will be sent.
+ */
+export interface ConversationItemCreateEvent {
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The event type, must be `conversation.item.create`.
+ */
+ type: 'conversation.item.create';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+
+ /**
+ * The ID of the preceding item after which the new item will be inserted. If not
+ * set, the new item will be appended to the end of the conversation. If set to
+ * `root`, the new item will be added to the beginning of the conversation. If set
+ * to an existing ID, it allows an item to be inserted mid-conversation. If the ID
+ * cannot be found, an error will be returned and the item will not be added.
+ */
+ previous_item_id?: string;
+}
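A hedged client-side sketch of emitting this event over a realtime connection; the item content is illustrative:

```ts
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });

rt.socket.addEventListener('open', () => {
  rt.send({
    type: 'conversation.item.create',
    item: {
      type: 'message',
      role: 'user',
      content: [{ type: 'input_text', text: 'What can you do?' }],
    },
  });
  // Ask the model to respond to the newly added item.
  rt.send({ type: 'response.create' });
});

rt.on('conversation.item.created', (event) => console.log('created:', event.item));
```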
+
+/**
+ * Returned when a conversation item is created. There are several scenarios that
+ * produce this event:
+ *
+ * - The server is generating a Response, which if successful will produce either
+ * one or two Items, which will be of type `message` (role `assistant`) or type
+ * `function_call`.
+ * - The input audio buffer has been committed, either by the client or the server
+ * (in `server_vad` mode). The server will take the content of the input audio
+ * buffer and add it to a new user message Item.
+ * - The client has sent a `conversation.item.create` event to add a new Item to
+ * the Conversation.
+ */
+export interface ConversationItemCreatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The event type, must be `conversation.item.created`.
+ */
+ type: 'conversation.item.created';
+
+ /**
+ * The ID of the preceding item in the Conversation context, allows the client to
+ * understand the order of the conversation. Can be `null` if the item has no
+ * predecessor.
+ */
+ previous_item_id?: string | null;
+}
+
+/**
+ * Send this event when you want to remove any item from the conversation history.
+ * The server will respond with a `conversation.item.deleted` event, unless the
+ * item does not exist in the conversation history, in which case the server will
+ * respond with an error.
+ */
+export interface ConversationItemDeleteEvent {
+ /**
+ * The ID of the item to delete.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.delete`.
+ */
+ type: 'conversation.item.delete';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Returned when an item in the conversation is deleted by the client with a
+ * `conversation.item.delete` event. This event is used to synchronize the server's
+ * understanding of the conversation history with the client's view.
+ */
+export interface ConversationItemDeletedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item that was deleted.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.deleted`.
+ */
+ type: 'conversation.item.deleted';
+}
+
+/**
+ * Returned when a conversation item is finalized.
+ */
+export interface ConversationItemDone {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The event type, must be `conversation.item.done`.
+ */
+ type: 'conversation.item.done';
+
+ /**
+ * The ID of the item that precedes this one, if any. This is used to maintain
+ * ordering when items are inserted.
+ */
+ previous_item_id?: string | null;
+}
+
+/**
+ * This event is the output of audio transcription for user audio written to the
+ * user audio buffer. Transcription begins when the input audio buffer is committed
+ * by the client or server (in `server_vad` mode). Transcription runs
+ * asynchronously with Response creation, so this event may come before or after
+ * the Response events.
+ *
+ * Realtime API models accept audio natively, and thus input transcription is a
+ * separate process run on a separate ASR (Automatic Speech Recognition) model. The
+ * transcript may diverge somewhat from the model's interpretation, and should be
+ * treated as a rough guide.
+ */
+export interface ConversationItemInputAudioTranscriptionCompletedEvent {
+ /**
+ * The index of the content part containing the audio.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item containing the audio.
+ */
+ item_id: string;
+
+ /**
+ * The transcribed text.
+ */
+ transcript: string;
+
+ /**
+ * The event type, must be `conversation.item.input_audio_transcription.completed`.
+ */
+ type: 'conversation.item.input_audio_transcription.completed';
+
+ /**
+ * Usage statistics for the transcription.
+ */
+ usage:
+ | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageTokens
+ | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageDuration;
+
+ /**
+ * The log probabilities of the transcription.
+ */
+  logprobs?: Array<LogProbProperties> | null;
+}
+
+export namespace ConversationItemInputAudioTranscriptionCompletedEvent {
+ /**
+ * Usage statistics for models billed by token usage.
+ */
+ export interface TranscriptTextUsageTokens {
+ /**
+ * Number of input tokens billed for this request.
+ */
+ input_tokens: number;
+
+ /**
+ * Number of output tokens generated.
+ */
+ output_tokens: number;
+
+ /**
+ * Total number of tokens used (input + output).
+ */
+ total_tokens: number;
+
+ /**
+ * The type of the usage object. Always `tokens` for this variant.
+ */
+ type: 'tokens';
+
+ /**
+ * Details about the input tokens billed for this request.
+ */
+ input_token_details?: TranscriptTextUsageTokens.InputTokenDetails;
+ }
+
+ export namespace TranscriptTextUsageTokens {
+ /**
+ * Details about the input tokens billed for this request.
+ */
+ export interface InputTokenDetails {
+ /**
+ * Number of audio tokens billed for this request.
+ */
+ audio_tokens?: number;
+
+ /**
+ * Number of text tokens billed for this request.
+ */
+ text_tokens?: number;
+ }
+ }
+
+ /**
+ * Usage statistics for models billed by audio input duration.
+ */
+ export interface TranscriptTextUsageDuration {
+ /**
+ * Duration of the input audio in seconds.
+ */
+ seconds: number;
+
+ /**
+ * The type of the usage object. Always `duration` for this variant.
+ */
+ type: 'duration';
+ }
+}
+
+/**
+ * Returned when the text value of an input audio transcription content part is
+ * updated.
+ */
+export interface ConversationItemInputAudioTranscriptionDeltaEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.input_audio_transcription.delta`.
+ */
+ type: 'conversation.item.input_audio_transcription.delta';
+
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index?: number;
+
+ /**
+ * The text delta.
+ */
+ delta?: string;
+
+ /**
+ * The log probabilities of the transcription.
+ */
+  logprobs?: Array<LogProbProperties> | null;
+}
+
+/**
+ * Returned when input audio transcription is configured, and a transcription
+ * request for a user message failed. These events are separate from other `error`
+ * events so that the client can identify the related Item.
+ */
+export interface ConversationItemInputAudioTranscriptionFailedEvent {
+ /**
+ * The index of the content part containing the audio.
+ */
+ content_index: number;
+
+ /**
+ * Details of the transcription error.
+ */
+ error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.input_audio_transcription.failed`.
+ */
+ type: 'conversation.item.input_audio_transcription.failed';
+}
+
+export namespace ConversationItemInputAudioTranscriptionFailedEvent {
+ /**
+ * Details of the transcription error.
+ */
+ export interface Error {
+ /**
+ * Error code, if any.
+ */
+ code?: string;
+
+ /**
+ * A human-readable error message.
+ */
+ message?: string;
+
+ /**
+ * Parameter related to the error, if any.
+ */
+ param?: string;
+
+ /**
+ * The type of error.
+ */
+ type?: string;
+ }
+}
+
+/**
+ * Returned when an input audio transcription segment is identified for an item.
+ */
+export interface ConversationItemInputAudioTranscriptionSegment {
+ /**
+ * The segment identifier.
+ */
+ id: string;
+
+ /**
+ * The index of the input audio content part within the item.
+ */
+ content_index: number;
+
+ /**
+ * End time of the segment in seconds.
+ */
+ end: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item containing the input audio content.
+ */
+ item_id: string;
+
+ /**
+ * The detected speaker label for this segment.
+ */
+ speaker: string;
+
+ /**
+ * Start time of the segment in seconds.
+ */
+ start: number;
+
+ /**
+ * The text for this segment.
+ */
+ text: string;
+
+ /**
+ * The event type, must be `conversation.item.input_audio_transcription.segment`.
+ */
+ type: 'conversation.item.input_audio_transcription.segment';
+}
+
+/**
+ * Send this event when you want to retrieve the server's representation of a
+ * specific item in the conversation history. This is useful, for example, to
+ * inspect user audio after noise cancellation and VAD. The server will respond
+ * with a `conversation.item.retrieved` event, unless the item does not exist in
+ * the conversation history, in which case the server will respond with an error.
+ */
+export interface ConversationItemRetrieveEvent {
+ /**
+ * The ID of the item to retrieve.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.retrieve`.
+ */
+ type: 'conversation.item.retrieve';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Send this event to truncate a previous assistant message’s audio. The server
+ * will produce audio faster than realtime, so this event is useful when the user
+ * interrupts to truncate audio that has already been sent to the client but not
+ * yet played. This will synchronize the server's understanding of the audio with
+ * the client's playback.
+ *
+ * Truncating audio will delete the server-side text transcript to ensure there is
+ * no text in the context that hasn't been heard by the user.
+ *
+ * If successful, the server will respond with a `conversation.item.truncated`
+ * event.
+ */
+export interface ConversationItemTruncateEvent {
+ /**
+ * Inclusive duration up to which audio is truncated, in milliseconds. If the
+ * audio_end_ms is greater than the actual audio duration, the server will respond
+ * with an error.
+ */
+ audio_end_ms: number;
+
+ /**
+ * The index of the content part to truncate. Set this to 0.
+ */
+ content_index: number;
+
+ /**
+ * The ID of the assistant message item to truncate. Only assistant message items
+ * can be truncated.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.truncate`.
+ */
+ type: 'conversation.item.truncate';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
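A hedged sketch of truncating already-played assistant audio after a user interruption; the item id and timestamp are placeholders:

```ts
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });

function onUserInterrupted(assistantItemId: string, playedMs: number) {
  rt.send({
    type: 'conversation.item.truncate',
    item_id: assistantItemId, // the assistant message whose audio is being cut off
    content_index: 0,
    audio_end_ms: playedMs, // how much audio the user actually heard
  });
}

rt.on('conversation.item.truncated', (event) =>
  console.log('truncated at', event.audio_end_ms, 'ms'),
);
```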
+
+/**
+ * Returned when an earlier assistant audio message item is truncated by the client
+ * with a `conversation.item.truncate` event. This event is used to synchronize the
+ * server's understanding of the audio with the client's playback.
+ *
+ * This action will truncate the audio and remove the server-side text transcript
+ * to ensure there is no text in the context that hasn't been heard by the user.
+ */
+export interface ConversationItemTruncatedEvent {
+ /**
+ * The duration up to which the audio was truncated, in milliseconds.
+ */
+ audio_end_ms: number;
+
+ /**
+ * The index of the content part that was truncated.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the assistant message item that was truncated.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.truncated`.
+ */
+ type: 'conversation.item.truncated';
+}
+
+/**
+ * The item to add to the conversation.
+ */
+export interface ConversationItemWithReference {
+ /**
+ * For an item of type (`message` | `function_call` | `function_call_output`) this
+ * field allows the client to assign the unique ID of the item. It is not required
+ * because the server will generate one if not provided.
+ *
+ * For an item of type `item_reference`, this field is required and is a reference
+ * to any item that has previously existed in the conversation.
+ */
+ id?: string;
+
+ /**
+ * The arguments of the function call (for `function_call` items).
+ */
+ arguments?: string;
+
+ /**
+ * The ID of the function call (for `function_call` and `function_call_output`
+ * items). If passed on a `function_call_output` item, the server will check that a
+ * `function_call` item with the same ID exists in the conversation history.
+ */
+ call_id?: string;
+
+ /**
+ * The content of the message, applicable for `message` items.
+ *
+ * - Message items of role `system` support only `input_text` content
+ * - Message items of role `user` support `input_text` and `input_audio` content
+ * - Message items of role `assistant` support `text` content.
+ */
+ content?: Array<ConversationItemWithReference.Content>;
+
+ /**
+ * The name of the function being called (for `function_call` items).
+ */
+ name?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The output of the function call (for `function_call_output` items).
+ */
+ output?: string;
+
+ /**
+ * The role of the message sender (`user`, `assistant`, `system`), only applicable
+ * for `message` items.
+ */
+ role?: 'user' | 'assistant' | 'system';
+
+ /**
+ * The status of the item (`completed`, `incomplete`, `in_progress`). These have no
+ * effect on the conversation, but are accepted for consistency with the
+ * `conversation.item.created` event.
+ */
+ status?: 'completed' | 'incomplete' | 'in_progress';
+
+ /**
+ * The type of the item (`message`, `function_call`, `function_call_output`,
+ * `item_reference`).
+ */
+ type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference';
+}
+
+export namespace ConversationItemWithReference {
+ export interface Content {
+ /**
+ * ID of a previous conversation item to reference (for `item_reference` content
+ * types in `response.create` events). These can reference both client and server
+ * created items.
+ */
+ id?: string;
+
+ /**
+ * Base64-encoded audio bytes, used for `input_audio` content type.
+ */
+ audio?: string;
+
+ /**
+ * The text content, used for `input_text` and `text` content types.
+ */
+ text?: string;
+
+ /**
+ * The transcript of the audio, used for `input_audio` content type.
+ */
+ transcript?: string;
+
+ /**
+ * The content type (`input_text`, `input_audio`, `item_reference`, `text`).
+ */
+ type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
+ }
+}
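+
+// Illustrative sketch (not part of the generated SDK): an `item_reference` entry
+// that points a `response.create` request at an item already in the conversation
+// history. The ID is a hypothetical placeholder.
+//
+//   const reference: ConversationItemWithReference = {
+//     type: 'item_reference',
+//     id: 'item_abc123',
+//   };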
+
+/**
+ * Send this event to append audio bytes to the input audio buffer. The audio
+ * buffer is temporary storage you can write to and later commit. In Server VAD
+ * mode, the audio buffer is used to detect speech and the server will decide when
+ * to commit. When Server VAD is disabled, you must commit the audio buffer
+ * manually.
+ *
+ * The client may choose how much audio to place in each event, up to a maximum of
+ * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD
+ * to be more responsive. Unlike most other client events, the server will not send
+ * a confirmation response to this event.
+ */
+export interface InputAudioBufferAppendEvent {
+ /**
+ * Base64-encoded audio bytes. This must be in the format specified by the
+ * `input_audio_format` field in the session configuration.
+ */
+ audio: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.append`.
+ */
+ type: 'input_audio_buffer.append';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
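+
+// Illustrative sketch (not part of the generated SDK): streaming one chunk of
+// microphone audio into the input buffer. Assumes a connected `WebSocket` named
+// `ws` and a hypothetical Node.js `Buffer` of PCM16 audio named `chunk`; the
+// bytes must be base64-encoded before sending.
+//
+//   const append: InputAudioBufferAppendEvent = {
+//     type: 'input_audio_buffer.append',
+//     audio: chunk.toString('base64'),
+//   };
+//   ws.send(JSON.stringify(append));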
+
+/**
+ * Send this event to clear the audio bytes in the buffer. The server will respond
+ * with an `input_audio_buffer.cleared` event.
+ */
+export interface InputAudioBufferClearEvent {
+ /**
+ * The event type, must be `input_audio_buffer.clear`.
+ */
+ type: 'input_audio_buffer.clear';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Returned when the input audio buffer is cleared by the client with an
+ * `input_audio_buffer.clear` event.
+ */
+export interface InputAudioBufferClearedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.cleared`.
+ */
+ type: 'input_audio_buffer.cleared';
+}
+
+/**
+ * Send this event to commit the user input audio buffer, which will create a new
+ * user message item in the conversation. This event will produce an error if the
+ * input audio buffer is empty. When in Server VAD mode, the client does not need
+ * to send this event, the server will commit the audio buffer automatically.
+ *
+ * Committing the input audio buffer will trigger input audio transcription (if
+ * enabled in session configuration), but it will not create a response from the
+ * model. The server will respond with an `input_audio_buffer.committed` event.
+ */
+export interface InputAudioBufferCommitEvent {
+ /**
+ * The event type, must be `input_audio_buffer.commit`.
+ */
+ type: 'input_audio_buffer.commit';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
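+
+// Illustrative sketch (not part of the generated SDK): committing the buffer
+// manually when Server VAD is disabled, e.g. for push-to-talk. Assumes a
+// connected `WebSocket` named `ws`.
+//
+//   const commit: InputAudioBufferCommitEvent = { type: 'input_audio_buffer.commit' };
+//   ws.send(JSON.stringify(commit));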
+
+/**
+ * Returned when an input audio buffer is committed, either by the client or
+ * automatically in server VAD mode. The `item_id` property is the ID of the user
+ * message item that will be created, thus a `conversation.item.created` event will
+ * also be sent to the client.
+ */
+export interface InputAudioBufferCommittedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item that will be created.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.committed`.
+ */
+ type: 'input_audio_buffer.committed';
+
+ /**
+ * The ID of the preceding item after which the new item will be inserted. Can be
+ * `null` if the item has no predecessor.
+ */
+ previous_item_id?: string | null;
+}
+
+/**
+ * Sent by the server when in `server_vad` mode to indicate that speech has been
+ * detected in the audio buffer. This can happen any time audio is added to the
+ * buffer (unless speech is already detected). The client may want to use this
+ * event to interrupt audio playback or provide visual feedback to the user.
+ *
+ * The client should expect to receive an `input_audio_buffer.speech_stopped` event
+ * when speech stops. The `item_id` property is the ID of the user message item
+ * that will be created when speech stops and will also be included in the
+ * `input_audio_buffer.speech_stopped` event (unless the client manually commits
+ * the audio buffer during VAD activation).
+ */
+export interface InputAudioBufferSpeechStartedEvent {
+ /**
+ * Milliseconds from the start of all audio written to the buffer during the
+ * session when speech was first detected. This will correspond to the beginning of
+ * audio sent to the model, and thus includes the `prefix_padding_ms` configured in
+ * the Session.
+ */
+ audio_start_ms: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item that will be created when speech stops.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.speech_started`.
+ */
+ type: 'input_audio_buffer.speech_started';
+}
+
+/**
+ * Returned in `server_vad` mode when the server detects the end of speech in the
+ * audio buffer. The server will also send a `conversation.item.created` event
+ * with the user message item that is created from the audio buffer.
+ */
+export interface InputAudioBufferSpeechStoppedEvent {
+ /**
+ * Milliseconds since the session started when speech stopped. This will correspond
+ * to the end of audio sent to the model, and thus includes the
+ * `min_silence_duration_ms` configured in the Session.
+ */
+ audio_end_ms: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item that will be created.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.speech_stopped`.
+ */
+ type: 'input_audio_buffer.speech_stopped';
+}
+
+/**
+ * Returned when the server VAD timeout is triggered for the input audio buffer.
+ */
+export interface InputAudioBufferTimeoutTriggered {
+ /**
+ * Millisecond offset where speech ended within the buffered audio.
+ */
+ audio_end_ms: number;
+
+ /**
+ * Millisecond offset where speech started within the buffered audio.
+ */
+ audio_start_ms: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item associated with this segment.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.timeout_triggered`.
+ */
+ type: 'input_audio_buffer.timeout_triggered';
+}
+
+/**
+ * A log probability object.
+ */
+export interface LogProbProperties {
+ /**
+ * The token that was used to generate the log probability.
+ */
+ token: string;
+
+ /**
+ * The bytes that were used to generate the log probability.
+ */
+ bytes: Array<number>;
+
+ /**
+ * The log probability of the token.
+ */
+ logprob: number;
+}
+
+/**
+ * Returned when listing MCP tools has completed for an item.
+ */
+export interface McpListToolsCompleted {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP list tools item.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `mcp_list_tools.completed`.
+ */
+ type: 'mcp_list_tools.completed';
+}
+
+/**
+ * Returned when listing MCP tools has failed for an item.
+ */
+export interface McpListToolsFailed {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP list tools item.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `mcp_list_tools.failed`.
+ */
+ type: 'mcp_list_tools.failed';
+}
+
+/**
+ * Returned when listing MCP tools is in progress for an item.
+ */
+export interface McpListToolsInProgress {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP list tools item.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `mcp_list_tools.in_progress`.
+ */
+ type: 'mcp_list_tools.in_progress';
+}
+
+/**
+ * **WebRTC Only:** Emit to cut off the current audio response. This will trigger
+ * the server to stop generating audio and emit a `output_audio_buffer.cleared`
+ * event. This event should be preceded by a `response.cancel` client event to stop
+ * the generation of the current response.
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc).
+ */
+export interface OutputAudioBufferClearEvent {
+ /**
+ * The event type, must be `output_audio_buffer.clear`.
+ */
+ type: 'output_audio_buffer.clear';
+
+ /**
+ * The unique ID of the client event used for error handling.
+ */
+ event_id?: string;
+}
+
+/**
+ * Emitted at the beginning of a Response to indicate the updated rate limits. When
+ * a Response is created, some tokens will be "reserved" for the output tokens; the
+ * rate limits shown here reflect that reservation, which is then adjusted
+ * accordingly once the Response is completed.
+ */
+export interface RateLimitsUpdatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * List of rate limit information.
+ */
+ rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;
+
+ /**
+ * The event type, must be `rate_limits.updated`.
+ */
+ type: 'rate_limits.updated';
+}
+
+export namespace RateLimitsUpdatedEvent {
+ export interface RateLimit {
+ /**
+ * The maximum allowed value for the rate limit.
+ */
+ limit?: number;
+
+ /**
+ * The name of the rate limit (`requests`, `tokens`).
+ */
+ name?: 'requests' | 'tokens';
+
+ /**
+ * The remaining value before the limit is reached.
+ */
+ remaining?: number;
+
+ /**
+ * Seconds until the rate limit resets.
+ */
+ reset_seconds?: number;
+ }
+}
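+
+// Illustrative sketch (not part of the generated SDK): logging the rate limits
+// reported at the start of each Response, e.g. to drive client-side backoff.
+//
+//   function logRateLimits(event: RateLimitsUpdatedEvent): void {
+//     for (const limit of event.rate_limits) {
+//       console.log(`${limit.name}: ${limit.remaining}/${limit.limit}, resets in ${limit.reset_seconds}s`);
+//     }
+//   }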
+
+/**
+ * Configuration for input and output audio.
+ */
+export interface RealtimeAudioConfig {
+ input?: RealtimeAudioConfig.Input;
+
+ output?: RealtimeAudioConfig.Output;
+}
+
+export namespace RealtimeAudioConfig {
+ export interface Input {
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ * (mono), and little-endian byte order.
+ */
+ format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ noise_reduction?: Input.NoiseReduction;
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription; these offer additional guidance to the transcription service.
+ */
+ transcription?: Input.Transcription;
+
+ /**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+ turn_detection?: Input.TurnDetection;
+ }
+
+ export namespace Input {
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ export interface NoiseReduction {
+ /**
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
+ * headphones, `far_field` is for far-field microphones such as laptop or
+ * conference room microphones.
+ */
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription; these offer additional guidance to the transcription service.
+ */
+ export interface Transcription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription. Current options are `whisper-1`,
+ * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and
+ * `gpt-4o-transcribe-diarize`.
+ */
+ model?:
+ | 'whisper-1'
+ | 'gpt-4o-transcribe-latest'
+ | 'gpt-4o-mini-transcribe'
+ | 'gpt-4o-transcribe'
+ | 'gpt-4o-transcribe-diarize';
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. For `whisper-1`, the
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ * "expect words related to technology".
+ */
+ prompt?: string;
+ }
+
+ /**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+ export interface TurnDetection {
+ /**
+ * Whether or not to automatically generate a response when a VAD stop event
+ * occurs.
+ */
+ create_response?: boolean;
+
+ /**
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ * will wait longer for the user to continue speaking, `high` will respond more
+ * quickly. `auto` is the default and is equivalent to `medium`.
+ */
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
+
+ /**
+ * Optional idle timeout after which turn detection will auto-timeout when no
+ * additional audio is received.
+ */
+ idle_timeout_ms?: number | null;
+
+ /**
+ * Whether or not to automatically interrupt any ongoing response with output to
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ * occurs.
+ */
+ interrupt_response?: boolean;
+
+ /**
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
+ * detected speech (in milliseconds). Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
+ * more quickly, but may jump in on short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
+ * model, and thus might perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection.
+ */
+ type?: 'server_vad' | 'semantic_vad';
+ }
+ }
+
+ export interface Output {
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ * For `pcm16`, output audio is sampled at a rate of 24kHz.
+ */
+ format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
+ * minimum speed. 1.5 is the maximum speed. This value can only be changed in
+ * between model turns, not while a response is in progress.
+ */
+ speed?: number;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
+ * and `cedar`.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+ }
+}
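+
+// Illustrative sketch (not part of the generated SDK): an audio configuration
+// using near-field noise reduction and Semantic VAD on input, and the `marin`
+// voice on output. Values here are arbitrary examples, not recommended defaults.
+//
+//   const audio: RealtimeAudioConfig = {
+//     input: {
+//       format: 'pcm16',
+//       noise_reduction: { type: 'near_field' },
+//       turn_detection: { type: 'semantic_vad', eagerness: 'low', create_response: true },
+//     },
+//     output: { format: 'pcm16', voice: 'marin', speed: 1.0 },
+//   };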
+
+/**
+ * A realtime client event.
+ */
+export type RealtimeClientEvent =
+ | ConversationItemCreateEvent
+ | ConversationItemDeleteEvent
+ | ConversationItemRetrieveEvent
+ | ConversationItemTruncateEvent
+ | InputAudioBufferAppendEvent
+ | InputAudioBufferClearEvent
+ | OutputAudioBufferClearEvent
+ | InputAudioBufferCommitEvent
+ | ResponseCancelEvent
+ | ResponseCreateEvent
+ | SessionUpdateEvent
+ | TranscriptionSessionUpdate;
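+
+// Illustrative sketch (not part of the generated SDK): a small helper that
+// serializes any client event onto a raw `WebSocket` connected to a Realtime
+// session, keeping every outbound message typed against this union.
+//
+//   function sendClientEvent(ws: WebSocket, event: RealtimeClientEvent): void {
+//     ws.send(JSON.stringify(event));
+//   }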
+
+/**
+ * Configuration options for the generated client secret.
+ */
+export interface RealtimeClientSecretConfig {
+ /**
+ * Configuration for the ephemeral token expiration.
+ */
+ expires_after?: RealtimeClientSecretConfig.ExpiresAfter;
+}
+
+export namespace RealtimeClientSecretConfig {
+ /**
+ * Configuration for the ephemeral token expiration.
+ */
+ export interface ExpiresAfter {
+ /**
+ * The anchor point for the ephemeral token expiration. Only `created_at` is
+ * currently supported.
+ */
+ anchor: 'created_at';
+
+ /**
+ * The number of seconds from the anchor point to the expiration. Select a value
+ * between `10` and `7200`.
+ */
+ seconds?: number;
+ }
+}
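+
+// Illustrative sketch (not part of the generated SDK): requesting an ephemeral
+// client secret that expires ten minutes after it is created (any value between
+// 10 and 7200 seconds is accepted).
+//
+//   const clientSecret: RealtimeClientSecretConfig = {
+//     expires_after: { anchor: 'created_at', seconds: 600 },
+//   };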
+
+/**
+ * An assistant message item in a Realtime conversation.
+ */
+export interface RealtimeConversationItemAssistantMessage {
+ /**
+ * The content of the message.
+ */
+ content: Array<RealtimeConversationItemAssistantMessage.Content>;
+
+ /**
+ * The role of the message sender. Always `assistant`.
+ */
+ role: 'assistant';
+
+ /**
+ * The type of the item. Always `message`.
+ */
+ type: 'message';
+
+ /**
+ * The unique ID of the item.
+ */
+ id?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The status of the item. Has no effect on the conversation.
+ */
+ status?: 'completed' | 'incomplete' | 'in_progress';
+}
+
+export namespace RealtimeConversationItemAssistantMessage {
+ export interface Content {
+ /**
+ * The text content.
+ */
+ text?: string;
+
+ /**
+ * The content type. Always `text` for assistant messages.
+ */
+ type?: 'text';
+ }
+}
+
+/**
+ * A function call item in a Realtime conversation.
+ */
+export interface RealtimeConversationItemFunctionCall {
+ /**
+ * The arguments of the function call.
+ */
+ arguments: string;
+
+ /**
+ * The name of the function being called.
+ */
+ name: string;
+
+ /**
+ * The type of the item. Always `function_call`.
+ */
+ type: 'function_call';
+
+ /**
+ * The unique ID of the item.
+ */
+ id?: string;
+
+ /**
+ * The ID of the function call.
+ */
+ call_id?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The status of the item. Has no effect on the conversation.
+ */
+ status?: 'completed' | 'incomplete' | 'in_progress';
+}
+
+/**
+ * A function call output item in a Realtime conversation.
+ */
+export interface RealtimeConversationItemFunctionCallOutput {
+ /**
+ * The ID of the function call this output is for.
+ */
+ call_id: string;
+
+ /**
+ * The output of the function call.
+ */
+ output: string;
+
+ /**
+ * The type of the item. Always `function_call_output`.
+ */
+ type: 'function_call_output';
+
+ /**
+ * The unique ID of the item.
+ */
+ id?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The status of the item. Has no effect on the conversation.
+ */
+ status?: 'completed' | 'incomplete' | 'in_progress';
+}
+
+/**
+ * A system message item in a Realtime conversation.
+ */
+export interface RealtimeConversationItemSystemMessage {
+ /**
+ * The content of the message.
+ */
+ content: Array<RealtimeConversationItemSystemMessage.Content>;
+
+ /**
+ * The role of the message sender. Always `system`.
+ */
+ role: 'system';
+
+ /**
+ * The type of the item. Always `message`.
+ */
+ type: 'message';
+
+ /**
+ * The unique ID of the item.
+ */
+ id?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The status of the item. Has no effect on the conversation.
+ */
+ status?: 'completed' | 'incomplete' | 'in_progress';
+}
+
+export namespace RealtimeConversationItemSystemMessage {
+ export interface Content {
+ /**
+ * The text content.
+ */
+ text?: string;
+
+ /**
+ * The content type. Always `input_text` for system messages.
+ */
+ type?: 'input_text';
+ }
+}
+
+/**
+ * A user message item in a Realtime conversation.
+ */
+export interface RealtimeConversationItemUserMessage {
+ /**
+ * The content of the message.
+ */
+ content: Array<RealtimeConversationItemUserMessage.Content>;
+
+ /**
+ * The role of the message sender. Always `user`.
+ */
+ role: 'user';
+
+ /**
+ * The type of the item. Always `message`.
+ */
+ type: 'message';
+
+ /**
+ * The unique ID of the item.
+ */
+ id?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The status of the item. Has no effect on the conversation.
+ */
+ status?: 'completed' | 'incomplete' | 'in_progress';
+}
+
+export namespace RealtimeConversationItemUserMessage {
+ export interface Content {
+ /**
+ * Base64-encoded audio bytes (for `input_audio`).
+ */
+ audio?: string;
+
+ /**
+ * The text content (for `input_text`).
+ */
+ text?: string;
+
+ /**
+ * Transcript of the audio (for `input_audio`).
+ */
+ transcript?: string;
+
+ /**
+ * The content type (`input_text` or `input_audio`).
+ */
+ type?: 'input_text' | 'input_audio';
+ }
+}
+
+/**
+ * Details of the error.
+ */
+export interface RealtimeError {
+ /**
+ * A human-readable error message.
+ */
+ message: string;
+
+ /**
+ * The type of error (e.g., "invalid_request_error", "server_error").
+ */
+ type: string;
+
+ /**
+ * Error code, if any.
+ */
+ code?: string | null;
+
+ /**
+ * The event_id of the client event that caused the error, if applicable.
+ */
+ event_id?: string | null;
+
+ /**
+ * Parameter related to the error, if any.
+ */
+ param?: string | null;
+}
+
+/**
+ * Returned when an error occurs, which could be a client problem or a server
+ * problem. Most errors are recoverable and the session will stay open; we
+ * recommend that implementors monitor and log error messages by default.
+ */
+export interface RealtimeErrorEvent {
+ /**
+ * Details of the error.
+ */
+ error: RealtimeError;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The event type, must be `error`.
+ */
+ type: 'error';
+}
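+
+// Illustrative sketch (not part of the generated SDK): the kind of default error
+// logging recommended above. The `event_id` on the error points back at the
+// client event that caused it, when applicable.
+//
+//   function logRealtimeError(event: RealtimeErrorEvent): void {
+//     const { type, code, message, event_id } = event.error;
+//     console.error(`Realtime API error (${type}${code ? `/${code}` : ''}): ${message}`, {
+//       caused_by_client_event: event_id,
+//     });
+//   }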
+
+/**
+ * A Realtime item requesting human approval of a tool invocation.
+ */
+export interface RealtimeMcpApprovalRequest {
+ /**
+ * The unique ID of the approval request.
+ */
+ id: string;
+
+ /**
+ * A JSON string of arguments for the tool.
+ */
+ arguments: string;
+
+ /**
+ * The name of the tool to run.
+ */
+ name: string;
+
+ /**
+ * The label of the MCP server making the request.
+ */
+ server_label: string;
+
+ /**
+ * The type of the item. Always `mcp_approval_request`.
+ */
+ type: 'mcp_approval_request';
+}
+
+/**
+ * A Realtime item responding to an MCP approval request.
+ */
+export interface RealtimeMcpApprovalResponse {
+ /**
+ * The unique ID of the approval response.
+ */
+ id: string;
+
+ /**
+ * The ID of the approval request being answered.
+ */
+ approval_request_id: string;
+
+ /**
+ * Whether the request was approved.
+ */
+ approve: boolean;
+
+ /**
+ * The type of the item. Always `mcp_approval_response`.
+ */
+ type: 'mcp_approval_response';
+
+ /**
+ * Optional reason for the decision.
+ */
+ reason?: string | null;
+}
+
+/**
+ * A Realtime item listing tools available on an MCP server.
+ */
+export interface RealtimeMcpListTools {
+ /**
+ * The label of the MCP server.
+ */
+ server_label: string;
+
+ /**
+ * The tools available on the server.
+ */
+ tools: Array<RealtimeMcpListTools.Tool>;
+
+ /**
+ * The type of the item. Always `mcp_list_tools`.
+ */
+ type: 'mcp_list_tools';
+
+ /**
+ * The unique ID of the list.
+ */
+ id?: string;
+}
+
+export namespace RealtimeMcpListTools {
+ /**
+ * A tool available on an MCP server.
+ */
+ export interface Tool {
+ /**
+ * The JSON schema describing the tool's input.
+ */
+ input_schema: unknown;
+
+ /**
+ * The name of the tool.
+ */
+ name: string;
+
+ /**
+ * Additional annotations about the tool.
+ */
+ annotations?: unknown | null;
+
+ /**
+ * The description of the tool.
+ */
+ description?: string | null;
+ }
+}
+
+export interface RealtimeMcpProtocolError {
+ code: number;
+
+ message: string;
+
+ type: 'protocol_error';
+}
+
+/**
+ * A Realtime item representing an invocation of a tool on an MCP server.
+ */
+export interface RealtimeMcpToolCall {
+ /**
+ * The unique ID of the tool call.
+ */
+ id: string;
+
+ /**
+ * A JSON string of the arguments passed to the tool.
+ */
+ arguments: string;
+
+ /**
+ * The name of the tool that was run.
+ */
+ name: string;
+
+ /**
+ * The label of the MCP server running the tool.
+ */
+ server_label: string;
+
+ /**
+ * The type of the item. Always `mcp_tool_call`.
+ */
+ type: 'mcp_tool_call';
+
+ /**
+ * The ID of an associated approval request, if any.
+ */
+ approval_request_id?: string | null;
+
+ /**
+ * The error from the tool call, if any.
+ */
+ error?: RealtimeMcpProtocolError | RealtimeMcpToolExecutionError | RealtimeMcphttpError | null;
+
+ /**
+ * The output from the tool call.
+ */
+ output?: string | null;
+}
+
+export interface RealtimeMcpToolExecutionError {
+ message: string;
+
+ type: 'tool_execution_error';
+}
+
+export interface RealtimeMcphttpError {
+ code: number;
+
+ message: string;
+
+ type: 'http_error';
+}
+
+/**
+ * The response resource.
+ */
+export interface RealtimeResponse {
+ /**
+ * The unique ID of the response.
+ */
+ id?: string;
+
+ /**
+ * Which conversation the response is added to, determined by the `conversation`
+ * field in the `response.create` event. If `auto`, the response will be added to
+ * the default conversation and the value of `conversation_id` will be an id like
+ * `conv_1234`. If `none`, the response will not be added to any conversation and
+ * the value of `conversation_id` will be `null`. If responses are being triggered
+ * by server VAD, the response will be added to the default conversation, thus the
+ * `conversation_id` will be an id like `conv_1234`.
+ */
+ conversation_id?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls, that was used in this response.
+ */
+ max_output_tokens?: number | 'inf';
+
+ /**
+ * Set of 16 key-value pairs that can be attached to an object. This can be useful
+ * for storing additional information about the object in a structured format, and
+ * querying for objects via API or the dashboard.
+ *
+ * Keys are strings with a maximum length of 64 characters. Values are strings with
+ * a maximum length of 512 characters.
+ */
+ metadata?: Shared.Metadata | null;
+
+ /**
+ * The set of modalities the model used to respond. If there are multiple
+ * modalities, the model will pick one, for example if `modalities` is
+ * `["text", "audio"]`, the model could be responding in either text or audio.
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The object type, must be `realtime.response`.
+ */
+ object?: 'realtime.response';
+
+ /**
+ * The list of output items generated by the response.
+ */
+ output?: Array<ConversationItem>;
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * The final status of the response (`completed`, `cancelled`, `failed`,
+ * `incomplete`, or `in_progress`).
+ */
+ status?: 'completed' | 'cancelled' | 'failed' | 'incomplete' | 'in_progress';
+
+ /**
+ * Additional details about the status.
+ */
+ status_details?: RealtimeResponseStatus;
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * Usage statistics for the Response; this will correspond to billing. A Realtime
+ * API session will maintain a conversation context and append new Items to the
+ * Conversation, thus output from previous turns (text and audio tokens) will
+ * become the input for later turns.
+ */
+ usage?: RealtimeResponseUsage;
+
+ /**
+ * The voice the model used to respond. Current voice options are `alloy`, `ash`,
+ * `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+}
+
+/**
+ * Additional details about the status.
+ */
+export interface RealtimeResponseStatus {
+ /**
+ * A description of the error that caused the response to fail, populated when the
+ * `status` is `failed`.
+ */
+ error?: RealtimeResponseStatus.Error;
+
+ /**
+ * The reason the Response did not complete. For a `cancelled` Response, one of
+ * `turn_detected` (the server VAD detected a new start of speech) or
+ * `client_cancelled` (the client sent a cancel event). For an `incomplete`
+ * Response, one of `max_output_tokens` or `content_filter` (the server-side safety
+ * filter activated and cut off the response).
+ */
+ reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';
+
+ /**
+ * The type of error that caused the response to fail, corresponding with the
+ * `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
+ */
+ type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';
+}
+
+export namespace RealtimeResponseStatus {
+ /**
+ * A description of the error that caused the response to fail, populated when the
+ * `status` is `failed`.
+ */
+ export interface Error {
+ /**
+ * Error code, if any.
+ */
+ code?: string;
+
+ /**
+ * The type of error.
+ */
+ type?: string;
+ }
+}
+
+/**
+ * Usage statistics for the Response; this will correspond to billing. A Realtime
+ * API session will maintain a conversation context and append new Items to the
+ * Conversation, thus output from previous turns (text and audio tokens) will
+ * become the input for later turns.
+ */
+export interface RealtimeResponseUsage {
+ /**
+ * Details about the input tokens used in the Response.
+ */
+ input_token_details?: RealtimeResponseUsageInputTokenDetails;
+
+ /**
+ * The number of input tokens used in the Response, including text and audio
+ * tokens.
+ */
+ input_tokens?: number;
+
+ /**
+ * Details about the output tokens used in the Response.
+ */
+ output_token_details?: RealtimeResponseUsageOutputTokenDetails;
+
+ /**
+ * The number of output tokens sent in the Response, including text and audio
+ * tokens.
+ */
+ output_tokens?: number;
+
+ /**
+ * The total number of tokens in the Response including input and output text and
+ * audio tokens.
+ */
+ total_tokens?: number;
+}
+
+/**
+ * Details about the input tokens used in the Response.
+ */
+export interface RealtimeResponseUsageInputTokenDetails {
+ /**
+ * The number of audio tokens used in the Response.
+ */
+ audio_tokens?: number;
+
+ /**
+ * The number of cached tokens used in the Response.
+ */
+ cached_tokens?: number;
+
+ /**
+ * The number of text tokens used in the Response.
+ */
+ text_tokens?: number;
+}
+
+/**
+ * Details about the output tokens used in the Response.
+ */
+export interface RealtimeResponseUsageOutputTokenDetails {
+ /**
+ * The number of audio tokens used in the Response.
+ */
+ audio_tokens?: number;
+
+ /**
+ * The number of text tokens used in the Response.
+ */
+ text_tokens?: number;
+}
+
+/**
+ * A realtime server event.
+ */
+export type RealtimeServerEvent =
+ | ConversationCreatedEvent
+ | ConversationItemCreatedEvent
+ | ConversationItemDeletedEvent
+ | ConversationItemInputAudioTranscriptionCompletedEvent
+ | ConversationItemInputAudioTranscriptionDeltaEvent
+ | ConversationItemInputAudioTranscriptionFailedEvent
+ | RealtimeServerEvent.ConversationItemRetrieved
+ | ConversationItemTruncatedEvent
+ | RealtimeErrorEvent
+ | InputAudioBufferClearedEvent
+ | InputAudioBufferCommittedEvent
+ | InputAudioBufferSpeechStartedEvent
+ | InputAudioBufferSpeechStoppedEvent
+ | RateLimitsUpdatedEvent
+ | ResponseAudioDeltaEvent
+ | ResponseAudioDoneEvent
+ | ResponseAudioTranscriptDeltaEvent
+ | ResponseAudioTranscriptDoneEvent
+ | ResponseContentPartAddedEvent
+ | ResponseContentPartDoneEvent
+ | ResponseCreatedEvent
+ | ResponseDoneEvent
+ | ResponseFunctionCallArgumentsDeltaEvent
+ | ResponseFunctionCallArgumentsDoneEvent
+ | ResponseOutputItemAddedEvent
+ | ResponseOutputItemDoneEvent
+ | ResponseTextDeltaEvent
+ | ResponseTextDoneEvent
+ | SessionCreatedEvent
+ | SessionUpdatedEvent
+ | TranscriptionSessionUpdatedEvent
+ | TranscriptionSessionCreated
+ | RealtimeServerEvent.OutputAudioBufferStarted
+ | RealtimeServerEvent.OutputAudioBufferStopped
+ | RealtimeServerEvent.OutputAudioBufferCleared
+ | ConversationItemAdded
+ | ConversationItemDone
+ | InputAudioBufferTimeoutTriggered
+ | ConversationItemInputAudioTranscriptionSegment
+ | McpListToolsInProgress
+ | McpListToolsCompleted
+ | McpListToolsFailed
+ | ResponseMcpCallArgumentsDelta
+ | ResponseMcpCallArgumentsDone
+ | ResponseMcpCallInProgress
+ | ResponseMcpCallCompleted
+ | ResponseMcpCallFailed;
+
+export namespace RealtimeServerEvent {
+ /**
+ * Returned when a conversation item is retrieved with
+ * `conversation.item.retrieve`.
+ */
+ export interface ConversationItemRetrieved {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: RealtimeAPI.ConversationItem;
+
+ /**
+ * The event type, must be `conversation.item.retrieved`.
+ */
+ type: 'conversation.item.retrieved';
+ }
+
+ /**
+ * **WebRTC Only:** Emitted when the server begins streaming audio to the client.
+ * This event is emitted after an audio content part has been added
+ * (`response.content_part.added`) to the response.
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferStarted {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The unique ID of the response that produced the audio.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `output_audio_buffer.started`.
+ */
+ type: 'output_audio_buffer.started';
+ }
+
+ /**
+ * **WebRTC Only:** Emitted when the output audio buffer has been completely
+ * drained on the server, and no more audio is forthcoming. This event is emitted
+ * after the full response data has been sent to the client (`response.done`).
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferStopped {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The unique ID of the response that produced the audio.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `output_audio_buffer.stopped`.
+ */
+ type: 'output_audio_buffer.stopped';
+ }
+
+ /**
+ * **WebRTC Only:** Emitted when the output audio buffer is cleared. This happens
+ * either in VAD mode when the user has interrupted
+ * (`input_audio_buffer.speech_started`), or when the client has emitted the
+ * `output_audio_buffer.clear` event to manually cut off the current audio
+ * response.
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferCleared {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The unique ID of the response that produced the audio.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `output_audio_buffer.cleared`.
+ */
+ type: 'output_audio_buffer.cleared';
+ }
+}
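+
+// Illustrative sketch (not part of the generated SDK): dispatching on the server
+// event union after parsing an incoming WebSocket message. The `type` literal on
+// each member lets TypeScript narrow the payload inside every `case`.
+//
+//   function onServerMessage(raw: string): void {
+//     const event = JSON.parse(raw) as RealtimeServerEvent;
+//     switch (event.type) {
+//       case 'input_audio_buffer.speech_started':
+//         // the user started talking: a good moment to pause local playback
+//         break;
+//       case 'rate_limits.updated':
+//         console.log(event.rate_limits);
+//         break;
+//       case 'error':
+//         console.error(event.error.message);
+//         break;
+//     }
+//   }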
+
+/**
+ * Realtime session object.
+ */
+export interface RealtimeSession {
+ /**
+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+ */
+ id?: string;
+
+ /**
+ * Expiration timestamp for the session, in seconds since epoch.
+ */
+ expires_at?: number;
+
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'> | null;
+
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ * (mono), and little-endian byte order.
+ */
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ input_audio_noise_reduction?: RealtimeSession.InputAudioNoiseReduction;
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription; these offer additional guidance to the transcription service.
+ */
+ input_audio_transcription?: RealtimeSession.InputAudioTranscription | null;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_response_output_tokens?: number | 'inf';
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The Realtime model used for this session.
+ */
+ model?:
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-realtime-preview-2025-06-03'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+ /**
+ * The object type. Always `realtime.session`.
+ */
+ object?: 'realtime.session';
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ * For `pcm16`, output audio is sampled at a rate of 24kHz.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Reference to a prompt template and its variables.
+ * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
+ */
+ prompt?: ResponsesAPI.ResponsePrompt | null;
+
+ /**
+ * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
+ * minimum speed. 1.5 is the maximum speed. This value can only be changed in
+ * between model turns, not while a response is in progress.
+ */
+ speed?: number;
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
+ * temperature of 0.8 is highly recommended for best performance.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+ tools?: Array<RealtimeSession.Tool>;
+
+ /**
+ * Configuration options for tracing. Set to null to disable tracing. Once tracing
+ * is enabled for a session, the configuration cannot be modified.
+ *
+ * `auto` will create a trace for the session with default values for the workflow
+ * name, group id, and metadata.
+ */
+ tracing?: 'auto' | RealtimeSession.TracingConfiguration | null;
+
+ /**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+ turn_detection?: RealtimeSession.TurnDetection | null;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+}
+
+export namespace RealtimeSession {
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ export interface InputAudioNoiseReduction {
+ /**
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
+ * headphones, `far_field` is for far-field microphones such as laptop or
+ * conference room microphones.
+ */
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+ * and should be treated as guidance of input audio content rather than precisely
+ * what the model heard. The client can optionally set the language and prompt for
+ * transcription; these offer additional guidance to the transcription service.
+ */
+ export interface InputAudioTranscription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
+ */
+ model?: string;
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. For `whisper-1`, the
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ * "expect words related to technology".
+ */
+ prompt?: string;
+ }
+
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Granular configuration for tracing.
+ */
+ export interface TracingConfiguration {
+ /**
+ * The group id to attach to this trace to enable filtering and grouping in the
+ * traces dashboard.
+ */
+ group_id?: string;
+
+ /**
+ * The arbitrary metadata to attach to this trace to enable filtering in the traces
+ * dashboard.
+ */
+ metadata?: unknown;
+
+ /**
+ * The name of the workflow to attach to this trace. This is used to name the trace
+ * in the traces dashboard.
+ */
+ workflow_name?: string;
+ }
+
+ /**
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ * set to `null` to turn off, in which case the client must manually trigger model
+ * response. Server VAD means that the model will detect the start and end of
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically
+ * sets a timeout based on this probability. For example, if user audio trails off
+ * with "uhhm", the model will score a low probability of turn end and wait longer
+ * for the user to continue speaking. This can be useful for more natural
+ * conversations, but may have a higher latency.
+ */
+ export interface TurnDetection {
+ /**
+ * Whether or not to automatically generate a response when a VAD stop event
+ * occurs.
+ */
+ create_response?: boolean;
+
+ /**
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ * will wait longer for the user to continue speaking, `high` will respond more
+ * quickly. `auto` is the default and is equivalent to `medium`.
+ */
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
+
+ /**
+ * Optional idle timeout after which turn detection will auto-timeout when no
+ * additional audio is received.
+ */
+ idle_timeout_ms?: number | null;
+
+ /**
+ * Whether or not to automatically interrupt any ongoing response with output to
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ * occurs.
+ */
+ interrupt_response?: boolean;
+
+ /**
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
+ * detected speech (in milliseconds). Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
+ * more quickly, but may jump in on short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
+ * model, and thus might perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection.
+ */
+ type?: 'server_vad' | 'semantic_vad';
+ }
+}
+
+/**
+ * Realtime session object configuration.
+ */
+export interface RealtimeSessionCreateRequest {
+ /**
+ * The Realtime model used for this session.
+ */
+ model:
+ | (string & {})
+ | 'gpt-4o-realtime'
+ | 'gpt-4o-mini-realtime'
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-realtime-preview-2025-06-03'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+ /**
+ * The type of session to create. Always `realtime` for the Realtime API.
+ */
+ type: 'realtime';
+
+ /**
+ * Configuration for input and output audio.
+ */
+ audio?: RealtimeAudioConfig;
+
+ /**
+ * Configuration options for the generated client secret.
+ */
+ client_secret?: RealtimeClientSecretConfig;
+
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_output_tokens?: number | 'inf';
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ output_modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * Reference to a prompt template and its variables.
+ * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
+ */
+ prompt?: ResponsesAPI.ResponsePrompt | null;
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
+ * temperature of 0.8 is highly recommended for best performance.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Provide one of the string modes or force a specific
+ * function/MCP tool.
+ */
+ tool_choice?: RealtimeToolChoiceConfig;
+
+ /**
+ * Tools available to the model.
+ */
+ tools?: RealtimeToolsConfig;
+
+ /**
+ * Configuration options for tracing. Set to null to disable tracing. Once tracing
+ * is enabled for a session, the configuration cannot be modified.
+ *
+ * `auto` will create a trace for the session with default values for the workflow
+ * name, group id, and metadata.
+ */
+ tracing?: RealtimeTracingConfig | null;
+
+ /**
+ * Controls how the realtime conversation is truncated prior to model inference.
+ * The default is `auto`. When set to `retention_ratio`, the server retains a
+ * fraction of the conversation tokens prior to the instructions.
+ */
+ truncation?: RealtimeTruncation;
+}
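+
+// Illustrative sketch (not part of the generated SDK): a minimal session
+// configuration. The model name and instructions are arbitrary examples; any
+// string is accepted for `model` via the `(string & {})` escape hatch.
+//
+//   const session: RealtimeSessionCreateRequest = {
+//     type: 'realtime',
+//     model: 'gpt-realtime',
+//     output_modalities: ['audio'],
+//     instructions: 'You are a concise voice assistant.',
+//     audio: {
+//       input: { turn_detection: { type: 'server_vad', silence_duration_ms: 400 } },
+//       output: { voice: 'marin' },
+//     },
+//   };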
+
+/**
+ * How the model chooses tools. Provide one of the string modes or force a specific
+ * function/MCP tool.
+ */
+export type RealtimeToolChoiceConfig =
+ | ResponsesAPI.ToolChoiceOptions
+ | ResponsesAPI.ToolChoiceFunction
+ | ResponsesAPI.ToolChoiceMcp;
+
+/**
+ * Tools available to the model.
+ */
+export type RealtimeToolsConfig = Array<RealtimeToolsConfigUnion>;
+
+/**
+ * Give the model access to additional tools via remote Model Context Protocol
+ * (MCP) servers.
+ * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
+ */
+export type RealtimeToolsConfigUnion = RealtimeToolsConfigUnion.Function | RealtimeToolsConfigUnion.Mcp;
+
+export namespace RealtimeToolsConfigUnion {
+ export interface Function {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Give the model access to additional tools via remote Model Context Protocol
+ * (MCP) servers.
+ * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
+ */
+ export interface Mcp {
+ /**
+ * A label for this MCP server, used to identify it in tool calls.
+ */
+ server_label: string;
+
+ /**
+ * The type of the MCP tool. Always `mcp`.
+ */
+ type: 'mcp';
+
+ /**
+ * List of allowed tool names or a filter object.
+ */
+    allowed_tools?: Array<string> | Mcp.McpToolFilter | null;
+
+ /**
+ * An OAuth access token that can be used with a remote MCP server, either with a
+ * custom MCP server URL or a service connector. Your application must handle the
+ * OAuth authorization flow and provide the token here.
+ */
+ authorization?: string;
+
+ /**
+ * Identifier for service connectors, like those available in ChatGPT. One of
+ * `server_url` or `connector_id` must be provided. Learn more about service
+ * connectors
+ * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
+ *
+ * Currently supported `connector_id` values are:
+ *
+ * - Dropbox: `connector_dropbox`
+ * - Gmail: `connector_gmail`
+ * - Google Calendar: `connector_googlecalendar`
+ * - Google Drive: `connector_googledrive`
+ * - Microsoft Teams: `connector_microsoftteams`
+ * - Outlook Calendar: `connector_outlookcalendar`
+ * - Outlook Email: `connector_outlookemail`
+ * - SharePoint: `connector_sharepoint`
+ */
+ connector_id?:
+ | 'connector_dropbox'
+ | 'connector_gmail'
+ | 'connector_googlecalendar'
+ | 'connector_googledrive'
+ | 'connector_microsoftteams'
+ | 'connector_outlookcalendar'
+ | 'connector_outlookemail'
+ | 'connector_sharepoint';
+
+ /**
+ * Optional HTTP headers to send to the MCP server. Use for authentication or other
+ * purposes.
+ */
+ headers?: { [key: string]: string } | null;
+
+ /**
+ * Specify which of the MCP server's tools require approval.
+ */
+ require_approval?: Mcp.McpToolApprovalFilter | 'always' | 'never' | null;
+
+ /**
+ * Optional description of the MCP server, used to provide more context.
+ */
+ server_description?: string;
+
+ /**
+ * The URL for the MCP server. One of `server_url` or `connector_id` must be
+ * provided.
+ */
+ server_url?: string;
+ }
+
+ export namespace Mcp {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface McpToolFilter {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+      tool_names?: Array<string>;
+ }
+
+ /**
+ * Specify which of the MCP server's tools require approval. Can be `always`,
+ * `never`, or a filter object associated with tools that require approval.
+ */
+ export interface McpToolApprovalFilter {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ always?: McpToolApprovalFilter.Always;
+
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ never?: McpToolApprovalFilter.Never;
+ }
+
+ export namespace McpToolApprovalFilter {
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface Always {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+        tool_names?: Array<string>;
+ }
+
+ /**
+ * A filter object to specify which tools are allowed.
+ */
+ export interface Never {
+ /**
+ * Indicates whether or not a tool modifies data or is read-only. If an MCP server
+ * is
+ * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
+ * it will match this filter.
+ */
+ read_only?: boolean;
+
+ /**
+ * List of allowed tool names.
+ */
+        tool_names?: Array<string>;
+ }
+ }
+ }
+}
+
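A hedged sketch of a `tools` array combining the two `RealtimeToolsConfigUnion` shapes above: a plain function tool and an MCP connector. The server label, connector choice, environment variable name, and JSON schema are illustrative assumptions.

```ts
const tools = [
  {
    type: 'function',
    name: 'get_weather', // hypothetical function name
    description: 'Look up the current weather for a city.',
    parameters: {
      type: 'object',
      properties: { city: { type: 'string' } },
      required: ['city'],
    },
  },
  {
    type: 'mcp',
    server_label: 'drive', // illustrative label
    connector_id: 'connector_googledrive',
    authorization: process.env['DRIVE_OAUTH_TOKEN'], // your OAuth token (assumed env var)
    allowed_tools: { read_only: true }, // only read-only tools from this server
    require_approval: 'never',
  },
] as const;
```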
+/**
+ * Configuration options for tracing. Set to null to disable tracing. Once tracing
+ * is enabled for a session, the configuration cannot be modified.
+ *
+ * `auto` will create a trace for the session with default values for the workflow
+ * name, group id, and metadata.
+ */
+export type RealtimeTracingConfig = 'auto' | RealtimeTracingConfig.TracingConfiguration;
+
+export namespace RealtimeTracingConfig {
+ /**
+ * Granular configuration for tracing.
+ */
+ export interface TracingConfiguration {
+ /**
+ * The group id to attach to this trace to enable filtering and grouping in the
+ * traces dashboard.
+ */
+ group_id?: string;
+
+ /**
+ * The arbitrary metadata to attach to this trace to enable filtering in the traces
+ * dashboard.
+ */
+ metadata?: unknown;
+
+ /**
+ * The name of the workflow to attach to this trace. This is used to name the trace
+ * in the traces dashboard.
+ */
+ workflow_name?: string;
+ }
+}
+
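Tracing is either the string `'auto'` or a granular configuration object; a small sketch with illustrative names:

```ts
// Enable tracing with server defaults...
const tracingAuto = 'auto' as const;

// ...or provide workflow/group identifiers and metadata for filtering in the
// traces dashboard (all values illustrative).
const tracingDetailed = {
  workflow_name: 'voice-support-agent',
  group_id: 'customer-1234',
  metadata: { environment: 'staging' },
} as const;
```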
+/**
+ * Realtime transcription session object configuration.
+ */
+export interface RealtimeTranscriptionSessionCreateRequest {
+ /**
+ * ID of the model to use. The options are `gpt-4o-transcribe`,
+ * `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+ * Whisper V2 model).
+ */
+ model: (string & {}) | 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe';
+
+ /**
+ * The type of session to create. Always `transcription` for transcription
+ * sessions.
+ */
+ type: 'transcription';
+
+ /**
+   * The set of items to include in the transcription. Currently available items are:
+ *
+ * - `item.input_audio_transcription.logprobs`
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
+
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+ * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+ * (mono), and little-endian byte order.
+ */
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ input_audio_noise_reduction?: RealtimeTranscriptionSessionCreateRequest.InputAudioNoiseReduction;
+
+ /**
+ * Configuration for input audio transcription. The client can optionally set the
+   * language and prompt for transcription; these offer additional guidance to the
+ * transcription service.
+ */
+ input_audio_transcription?: RealtimeTranscriptionSessionCreateRequest.InputAudioTranscription;
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: RealtimeTranscriptionSessionCreateRequest.TurnDetection;
+}
+
+export namespace RealtimeTranscriptionSessionCreateRequest {
+ /**
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
+ * off. Noise reduction filters audio added to the input audio buffer before it is
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
+ * detection accuracy (reducing false positives) and model performance by improving
+ * perception of the input audio.
+ */
+ export interface InputAudioNoiseReduction {
+ /**
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
+ * headphones, `far_field` is for far-field microphones such as laptop or
+ * conference room microphones.
+ */
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration for input audio transcription. The client can optionally set the
+   * language and prompt for transcription; these offer additional guidance to the
+ * transcription service.
+ */
+ export interface InputAudioTranscription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
+ */
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. For `whisper-1`, the
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+ * "expect words related to technology".
+ */
+ prompt?: string;
+ }
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+     * Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection. Only `server_vad` is currently supported for
+ * transcription sessions.
+ */
+ type?: 'server_vad';
+ }
+}
+
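Putting the transcription request type above together, a sketch of a transcription session configuration (the model choice, language, prompt, and VAD thresholds are illustrative):

```ts
const transcriptionSession = {
  type: 'transcription',
  model: 'gpt-4o-transcribe',
  include: ['item.input_audio_transcription.logprobs'],
  input_audio_format: 'pcm16',
  input_audio_noise_reduction: { type: 'near_field' },
  input_audio_transcription: { language: 'en', prompt: 'expect words related to technology' },
  turn_detection: {
    type: 'server_vad',
    threshold: 0.5,
    prefix_padding_ms: 300,
    silence_duration_ms: 500,
  },
} as const;
```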
+/**
+ * Controls how the realtime conversation is truncated prior to model inference.
+ * The default is `auto`. When set to `retention_ratio`, the server retains a
+ * fraction of the conversation tokens prior to the instructions.
+ */
+export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncation.RetentionRatioTruncation;
+
+export namespace RealtimeTruncation {
+ /**
+ * Retain a fraction of the conversation tokens.
+ */
+ export interface RetentionRatioTruncation {
+ /**
+ * Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0).
+ */
+ retention_ratio: number;
+
+ /**
+ * Use retention ratio truncation.
+ */
+ type: 'retention_ratio';
+
+ /**
+ * Optional cap on tokens allowed after the instructions.
+ */
+ post_instructions_token_limit?: number | null;
+ }
+}
+
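Truncation is either one of the string modes or a retention-ratio object; a sketch of the latter (the ratio and token cap are illustrative):

```ts
// Keep roughly three quarters of the pre-instruction conversation tokens and cap
// post-instruction context at 4096 tokens before each inference.
const truncation = {
  type: 'retention_ratio',
  retention_ratio: 0.75,
  post_instructions_token_limit: 4096,
} as const;
```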
+/**
+ * Returned when the model-generated audio is updated.
+ */
+export interface ResponseAudioDeltaEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * Base64-encoded audio data delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_audio.delta`.
+ */
+ type: 'response.output_audio.delta';
+}
+
+/**
+ * Returned when the model-generated audio is done. Also emitted when a Response is
+ * interrupted, incomplete, or cancelled.
+ */
+export interface ResponseAudioDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_audio.done`.
+ */
+ type: 'response.output_audio.done';
+}
+
+/**
+ * Returned when the model-generated transcription of audio output is updated.
+ */
+export interface ResponseAudioTranscriptDeltaEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The transcript delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_audio_transcript.delta`.
+ */
+ type: 'response.output_audio_transcript.delta';
+}
+
+/**
+ * Returned when the model-generated transcription of audio output is done
+ * streaming. Also emitted when a Response is interrupted, incomplete, or
+ * cancelled.
+ */
+export interface ResponseAudioTranscriptDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The final transcript of the audio.
+ */
+ transcript: string;
+
+ /**
+ * The event type, must be `response.output_audio_transcript.done`.
+ */
+ type: 'response.output_audio_transcript.done';
+}
+
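The events above stream model audio and its transcript incrementally. Below is a minimal handler sketch that switches on the event `type`; the import path is assumed from this file's location, and the playback hook is a placeholder.

```ts
import type {
  ResponseAudioDeltaEvent,
  ResponseAudioTranscriptDeltaEvent,
  ResponseAudioTranscriptDoneEvent,
} from 'openai/resources/realtime/realtime'; // path assumed

type AudioStreamEvent =
  | ResponseAudioDeltaEvent
  | ResponseAudioTranscriptDeltaEvent
  | ResponseAudioTranscriptDoneEvent;

function handleAudioEvent(event: AudioStreamEvent): void {
  switch (event.type) {
    case 'response.output_audio.delta':
      // Base64-encoded audio chunk; decode and hand it to your playback pipeline.
      enqueueAudio(Buffer.from(event.delta, 'base64'));
      break;
    case 'response.output_audio_transcript.delta':
      process.stdout.write(event.delta);
      break;
    case 'response.output_audio_transcript.done':
      console.log(`\n[final transcript] ${event.transcript}`);
      break;
  }
}

// Placeholder playback hook; replace with a real audio sink.
function enqueueAudio(chunk: Buffer): void {
  void chunk;
}
```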
+/**
+ * Send this event to cancel an in-progress response. The server will respond with
+ * a `response.done` event with a status of `response.status=cancelled`. If there
+ * is no response to cancel, the server will respond with an error.
+ */
+export interface ResponseCancelEvent {
+ /**
+ * The event type, must be `response.cancel`.
+ */
+ type: 'response.cancel';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+
+ /**
+ * A specific response ID to cancel - if not provided, will cancel an in-progress
+ * response in the default conversation.
+ */
+ response_id?: string;
+}
+
+/**
+ * Returned when a new content part is added to an assistant message item during
+ * response generation.
+ */
+export interface ResponseContentPartAddedEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item to which the content part was added.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The content part that was added.
+ */
+ part: ResponseContentPartAddedEvent.Part;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.content_part.added`.
+ */
+ type: 'response.content_part.added';
+}
+
+export namespace ResponseContentPartAddedEvent {
+ /**
+ * The content part that was added.
+ */
+ export interface Part {
+ /**
+ * Base64-encoded audio data (if type is "audio").
+ */
+ audio?: string;
+
+ /**
+ * The text content (if type is "text").
+ */
+ text?: string;
+
+ /**
+ * The transcript of the audio (if type is "audio").
+ */
+ transcript?: string;
+
+ /**
+ * The content type ("text", "audio").
+ */
+ type?: 'text' | 'audio';
+ }
+}
+
+/**
+ * Returned when a content part is done streaming in an assistant message item.
+ * Also emitted when a Response is interrupted, incomplete, or cancelled.
+ */
+export interface ResponseContentPartDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The content part that is done.
+ */
+ part: ResponseContentPartDoneEvent.Part;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.content_part.done`.
+ */
+ type: 'response.content_part.done';
+}
+
+export namespace ResponseContentPartDoneEvent {
+ /**
+ * The content part that is done.
+ */
+ export interface Part {
+ /**
+ * Base64-encoded audio data (if type is "audio").
+ */
+ audio?: string;
+
+ /**
+ * The text content (if type is "text").
+ */
+ text?: string;
+
+ /**
+ * The transcript of the audio (if type is "audio").
+ */
+ transcript?: string;
+
+ /**
+ * The content type ("text", "audio").
+ */
+ type?: 'text' | 'audio';
+ }
+}
+
+/**
+ * This event instructs the server to create a Response, which means triggering
+ * model inference. When in Server VAD mode, the server will create Responses
+ * automatically.
+ *
+ * A Response will include at least one Item, and may have two, in which case the
+ * second will be a function call. These Items will be appended to the conversation
+ * history.
+ *
+ * The server will respond with a `response.created` event, events for Items and
+ * content created, and finally a `response.done` event to indicate the Response is
+ * complete.
+ *
+ * The `response.create` event includes inference configuration like
+ * `instructions`, and `temperature`. These fields will override the Session's
+ * configuration for this Response only.
+ */
+export interface ResponseCreateEvent {
+ /**
+ * The event type, must be `response.create`.
+ */
+ type: 'response.create';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+
+ /**
+ * Create a new Realtime response with these parameters
+ */
+ response?: ResponseCreateEvent.Response;
+}
+
+export namespace ResponseCreateEvent {
+ /**
+ * Create a new Realtime response with these parameters
+ */
+ export interface Response {
+ /**
+ * Controls which conversation the response is added to. Currently supports `auto`
+ * and `none`, with `auto` as the default value. The `auto` value means that the
+ * contents of the response will be added to the default conversation. Set this to
+     * `none` to create an out-of-band response that will not add items to the
+     * default conversation.
+ */
+ conversation?: (string & {}) | 'auto' | 'none';
+
+ /**
+ * Input items to include in the prompt for the model. Using this field creates a
+ * new context for this Response instead of using the default conversation. An
+ * empty array `[]` will clear the context for this Response. Note that this can
+ * include references to items from the default conversation.
+ */
+    input?: Array<ConversationItemWithReference>;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format, (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_output_tokens?: number | 'inf';
+
+ /**
+ * Set of 16 key-value pairs that can be attached to an object. This can be useful
+ * for storing additional information about the object in a structured format, and
+ * querying for objects via API or the dashboard.
+ *
+ * Keys are strings with a maximum length of 64 characters. Values are strings with
+ * a maximum length of 512 characters.
+ */
+ metadata?: Shared.Metadata | null;
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Reference to a prompt template and its variables.
+ * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
+ */
+ prompt?: ResponsesAPI.ResponsePrompt | null;
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Provide one of the string modes or force a specific
+ * function/MCP tool.
+ */
+ tool_choice?:
+ | ResponsesAPI.ToolChoiceOptions
+ | ResponsesAPI.ToolChoiceFunction
+ | ResponsesAPI.ToolChoiceMcp;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+    tools?: Array<Response.Tool>;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?:
+ | (string & {})
+ | 'alloy'
+ | 'ash'
+ | 'ballad'
+ | 'coral'
+ | 'echo'
+ | 'sage'
+ | 'shimmer'
+ | 'verse'
+ | 'marin'
+ | 'cedar';
+ }
+
+ export namespace Response {
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+ }
+}
+
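For example, an out-of-band `response.create` as described above: the result is not written back to the default conversation, and the instructions, modalities, and metadata override the session configuration for this response only. A matching `response.cancel` with the returned `response_id` would stop it early. All values are illustrative.

```ts
const responseCreate = {
  type: 'response.create',
  event_id: 'evt_client_001', // optional client-generated id
  response: {
    conversation: 'none', // out-of-band: do not add items to the default conversation
    modalities: ['text'],
    instructions: 'Summarize the conversation so far in one sentence.',
    metadata: { purpose: 'summary' },
    temperature: 0.7,
  },
} as const;

// Send it however your client transports JSON events, e.g.:
// ws.send(JSON.stringify(responseCreate));
```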
+/**
+ * Returned when a new Response is created. The first event of response creation,
+ * where the response is in an initial state of `in_progress`.
+ */
+export interface ResponseCreatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The response resource.
+ */
+ response: RealtimeResponse;
+
+ /**
+ * The event type, must be `response.created`.
+ */
+ type: 'response.created';
+}
+
+/**
+ * Returned when a Response is done streaming. Always emitted, no matter the final
+ * state. The Response object included in the `response.done` event will include
+ * all output Items in the Response but will omit the raw audio data.
+ */
+export interface ResponseDoneEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The response resource.
+ */
+ response: RealtimeResponse;
+
+ /**
+ * The event type, must be `response.done`.
+ */
+ type: 'response.done';
+}
+
+/**
+ * Returned when the model-generated function call arguments are updated.
+ */
+export interface ResponseFunctionCallArgumentsDeltaEvent {
+ /**
+ * The ID of the function call.
+ */
+ call_id: string;
+
+ /**
+ * The arguments delta as a JSON string.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the function call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.function_call_arguments.delta`.
+ */
+ type: 'response.function_call_arguments.delta';
+}
+
+/**
+ * Returned when the model-generated function call arguments are done streaming.
+ * Also emitted when a Response is interrupted, incomplete, or cancelled.
+ */
+export interface ResponseFunctionCallArgumentsDoneEvent {
+ /**
+ * The final arguments as a JSON string.
+ */
+ arguments: string;
+
+ /**
+ * The ID of the function call.
+ */
+ call_id: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the function call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.function_call_arguments.done`.
+ */
+ type: 'response.function_call_arguments.done';
+}
+
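Function-call arguments stream as deltas and arrive complete in the `done` event; a sketch of acting on the final payload (the import path is assumed, and the tool dispatch is left to your code):

```ts
import type { ResponseFunctionCallArgumentsDoneEvent } from 'openai/resources/realtime/realtime'; // path assumed

function onFunctionCallDone(event: ResponseFunctionCallArgumentsDoneEvent): void {
  // `arguments` is the complete JSON string accumulated from the delta events.
  const args = JSON.parse(event.arguments) as Record<string, unknown>;
  console.log(`function call ${event.call_id} finished with`, args);
  // Run the matching tool here, then return its result to the model as a
  // function_call_output item via a `conversation.item.create` event.
}
```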
+/**
+ * Returned when MCP tool call arguments are updated during response generation.
+ */
+export interface ResponseMcpCallArgumentsDelta {
+ /**
+ * The JSON-encoded arguments delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP tool call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.mcp_call_arguments.delta`.
+ */
+ type: 'response.mcp_call_arguments.delta';
+
+ /**
+ * If present, indicates the delta text was obfuscated.
+ */
+ obfuscation?: string | null;
+}
+
+/**
+ * Returned when MCP tool call arguments are finalized during response generation.
+ */
+export interface ResponseMcpCallArgumentsDone {
+ /**
+ * The final JSON-encoded arguments string.
+ */
+ arguments: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP tool call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.mcp_call_arguments.done`.
+ */
+ type: 'response.mcp_call_arguments.done';
+}
+
+/**
+ * Returned when an MCP tool call has completed successfully.
+ */
+export interface ResponseMcpCallCompleted {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP tool call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The event type, must be `response.mcp_call.completed`.
+ */
+ type: 'response.mcp_call.completed';
+}
+
+/**
+ * Returned when an MCP tool call has failed.
+ */
+export interface ResponseMcpCallFailed {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP tool call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The event type, must be `response.mcp_call.failed`.
+ */
+ type: 'response.mcp_call.failed';
+}
+
+/**
+ * Returned when an MCP tool call has started and is in progress.
+ */
+export interface ResponseMcpCallInProgress {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the MCP tool call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The event type, must be `response.mcp_call.in_progress`.
+ */
+ type: 'response.mcp_call.in_progress';
+}
+
+/**
+ * Returned when a new Item is created during Response generation.
+ */
+export interface ResponseOutputItemAddedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The index of the output item in the Response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the Response to which the item belongs.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_item.added`.
+ */
+ type: 'response.output_item.added';
+}
+
+/**
+ * Returned when an Item is done streaming. Also emitted when a Response is
+ * interrupted, incomplete, or cancelled.
+ */
+export interface ResponseOutputItemDoneEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A single item within a Realtime conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The index of the output item in the Response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the Response to which the item belongs.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_item.done`.
+ */
+ type: 'response.output_item.done';
+}
+
+/**
+ * Returned when the text value of an "output_text" content part is updated.
+ */
+export interface ResponseTextDeltaEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The text delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_text.delta`.
+ */
+ type: 'response.output_text.delta';
+}
+
+/**
+ * Returned when the text value of an "output_text" content part is done streaming.
+ * Also emitted when a Response is interrupted, incomplete, or cancelled.
+ */
+export interface ResponseTextDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The final text content.
+ */
+ text: string;
+
+ /**
+ * The event type, must be `response.output_text.done`.
+ */
+ type: 'response.output_text.done';
+}
+
+/**
+ * Returned when a Session is created. Emitted automatically as the first server
+ * event when a new connection is established. This event will contain the default
+ * Session configuration.
+ */
+export interface SessionCreatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * Realtime session object.
+ */
+ session: RealtimeSession;
+
+ /**
+ * The event type, must be `session.created`.
+ */
+ type: 'session.created';
+}
+
+/**
+ * Send this event to update the session’s default configuration. The client may
+ * send this event at any time to update any field, except for `voice`. However,
+ * note that once a session has been initialized with a particular `model`, it
+ * can’t be changed to another model using `session.update`.
+ *
+ * When the server receives a `session.update`, it will respond with a
+ * `session.updated` event showing the full, effective configuration. Only the
+ * fields that are present are updated. To clear a field like `instructions`, pass
+ * an empty string.
+ */
+export interface SessionUpdateEvent {
+ /**
+ * Realtime session object configuration.
+ */
+ session: RealtimeSessionCreateRequest;
+
+ /**
+ * The event type, must be `session.update`.
+ */
+ type: 'session.update';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
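Because only the fields present in a `session.update` are applied, clearing a field means sending it explicitly; a sketch that clears `instructions` while leaving everything else untouched (the `type: 'realtime'` discriminator is assumed as above):

```ts
const sessionUpdate = {
  type: 'session.update',
  event_id: 'evt_client_002',
  session: {
    type: 'realtime', // assumed session discriminator
    instructions: '', // an empty string clears previously set instructions
  },
} as const;

// ws.send(JSON.stringify(sessionUpdate));
```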
+/**
+ * Returned when a session is updated with a `session.update` event, unless there
+ * is an error.
+ */
+export interface SessionUpdatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * Realtime session object.
+ */
+ session: RealtimeSession;
+
+ /**
+ * The event type, must be `session.updated`.
+ */
+ type: 'session.updated';
+}
+
+/**
+ * Returned when a transcription session is created.
+ */
+export interface TranscriptionSessionCreated {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A Realtime transcription session configuration object.
+ */
+ session: TranscriptionSessionCreated.Session;
+
+ /**
+ * The event type, must be `transcription_session.created`.
+ */
+ type: 'transcription_session.created';
+}
+
+export namespace TranscriptionSessionCreated {
+ /**
+ * A Realtime transcription session configuration object.
+ */
+ export interface Session {
+ /**
+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+ */
+ id?: string;
+
+ /**
+ * Configuration for input audio for the session.
+ */
+ audio?: Session.Audio;
+
+ /**
+ * Expiration timestamp for the session, in seconds since epoch.
+ */
+ expires_at?: number;
+
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
+
+ /**
+ * The object type. Always `realtime.transcription_session`.
+ */
+ object?: string;
+ }
+
+ export namespace Session {
+ /**
+ * Configuration for input audio for the session.
+ */
+ export interface Audio {
+ input?: Audio.Input;
+ }
+
+ export namespace Audio {
+ export interface Input {
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ format?: string;
+
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ noise_reduction?: Input.NoiseReduction;
+
+ /**
+ * Configuration of the transcription model.
+ */
+ transcription?: Input.Transcription;
+
+ /**
+ * Configuration for turn detection.
+ */
+ turn_detection?: Input.TurnDetection;
+ }
+
+ export namespace Input {
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ export interface NoiseReduction {
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration of the transcription model.
+ */
+ export interface Transcription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription. Can be `gpt-4o-transcribe`,
+ * `gpt-4o-mini-transcribe`, or `whisper-1`.
+ */
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. The
+ * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ * should match the audio language.
+ */
+ prompt?: string;
+ }
+
+ /**
+ * Configuration for turn detection.
+ */
+ export interface TurnDetection {
+ prefix_padding_ms?: number;
+
+ silence_duration_ms?: number;
+
+ threshold?: number;
+
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+ }
+ }
+ }
+}
+
+/**
+ * Send this event to update a transcription session.
+ */
+export interface TranscriptionSessionUpdate {
+ /**
+ * Realtime transcription session object configuration.
+ */
+ session: RealtimeTranscriptionSessionCreateRequest;
+
+ /**
+ * The event type, must be `transcription_session.update`.
+ */
+ type: 'transcription_session.update';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Returned when a transcription session is updated with a
+ * `transcription_session.update` event, unless there is an error.
+ */
+export interface TranscriptionSessionUpdatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * A Realtime transcription session configuration object.
+ */
+ session: TranscriptionSessionUpdatedEvent.Session;
+
+ /**
+ * The event type, must be `transcription_session.updated`.
+ */
+ type: 'transcription_session.updated';
+}
+
+export namespace TranscriptionSessionUpdatedEvent {
+ /**
+ * A Realtime transcription session configuration object.
+ */
+ export interface Session {
+ /**
+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
+ */
+ id?: string;
+
+ /**
+ * Configuration for input audio for the session.
+ */
+ audio?: Session.Audio;
+
+ /**
+ * Expiration timestamp for the session, in seconds since epoch.
+ */
+ expires_at?: number;
+
+ /**
+ * Additional fields to include in server outputs.
+ *
+ * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
+ * transcription.
+ */
+ include?: Array<'item.input_audio_transcription.logprobs'>;
+
+ /**
+ * The object type. Always `realtime.transcription_session`.
+ */
+ object?: string;
+ }
+
+ export namespace Session {
+ /**
+ * Configuration for input audio for the session.
+ */
+ export interface Audio {
+ input?: Audio.Input;
+ }
+
+ export namespace Audio {
+ export interface Input {
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ format?: string;
+
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ noise_reduction?: Input.NoiseReduction;
+
+ /**
+ * Configuration of the transcription model.
+ */
+ transcription?: Input.Transcription;
+
+ /**
+ * Configuration for turn detection.
+ */
+ turn_detection?: Input.TurnDetection;
+ }
+
+ export namespace Input {
+ /**
+ * Configuration for input audio noise reduction.
+ */
+ export interface NoiseReduction {
+ type?: 'near_field' | 'far_field';
+ }
+
+ /**
+ * Configuration of the transcription model.
+ */
+ export interface Transcription {
+ /**
+ * The language of the input audio. Supplying the input language in
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+ * format will improve accuracy and latency.
+ */
+ language?: string;
+
+ /**
+ * The model to use for transcription. Can be `gpt-4o-transcribe`,
+ * `gpt-4o-mini-transcribe`, or `whisper-1`.
+ */
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+
+ /**
+ * An optional text to guide the model's style or continue a previous audio
+ * segment. The
+ * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+ * should match the audio language.
+ */
+ prompt?: string;
+ }
+
+ /**
+ * Configuration for turn detection.
+ */
+ export interface TurnDetection {
+ prefix_padding_ms?: number;
+
+ silence_duration_ms?: number;
+
+ threshold?: number;
+
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+ }
+ }
+ }
+}
+
+Realtime.ClientSecrets = ClientSecrets;
+
+export declare namespace Realtime {
+ export {
+ type ConversationCreatedEvent as ConversationCreatedEvent,
+ type ConversationItem as ConversationItem,
+ type ConversationItemAdded as ConversationItemAdded,
+ type ConversationItemCreateEvent as ConversationItemCreateEvent,
+ type ConversationItemCreatedEvent as ConversationItemCreatedEvent,
+ type ConversationItemDeleteEvent as ConversationItemDeleteEvent,
+ type ConversationItemDeletedEvent as ConversationItemDeletedEvent,
+ type ConversationItemDone as ConversationItemDone,
+ type ConversationItemInputAudioTranscriptionCompletedEvent as ConversationItemInputAudioTranscriptionCompletedEvent,
+ type ConversationItemInputAudioTranscriptionDeltaEvent as ConversationItemInputAudioTranscriptionDeltaEvent,
+ type ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent,
+ type ConversationItemInputAudioTranscriptionSegment as ConversationItemInputAudioTranscriptionSegment,
+ type ConversationItemRetrieveEvent as ConversationItemRetrieveEvent,
+ type ConversationItemTruncateEvent as ConversationItemTruncateEvent,
+ type ConversationItemTruncatedEvent as ConversationItemTruncatedEvent,
+ type ConversationItemWithReference as ConversationItemWithReference,
+ type InputAudioBufferAppendEvent as InputAudioBufferAppendEvent,
+ type InputAudioBufferClearEvent as InputAudioBufferClearEvent,
+ type InputAudioBufferClearedEvent as InputAudioBufferClearedEvent,
+ type InputAudioBufferCommitEvent as InputAudioBufferCommitEvent,
+ type InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent,
+ type InputAudioBufferSpeechStartedEvent as InputAudioBufferSpeechStartedEvent,
+ type InputAudioBufferSpeechStoppedEvent as InputAudioBufferSpeechStoppedEvent,
+ type InputAudioBufferTimeoutTriggered as InputAudioBufferTimeoutTriggered,
+ type LogProbProperties as LogProbProperties,
+ type McpListToolsCompleted as McpListToolsCompleted,
+ type McpListToolsFailed as McpListToolsFailed,
+ type McpListToolsInProgress as McpListToolsInProgress,
+ type OutputAudioBufferClearEvent as OutputAudioBufferClearEvent,
+ type RateLimitsUpdatedEvent as RateLimitsUpdatedEvent,
+ type RealtimeAudioConfig as RealtimeAudioConfig,
+ type RealtimeClientEvent as RealtimeClientEvent,
+ type RealtimeClientSecretConfig as RealtimeClientSecretConfig,
+ type RealtimeConversationItemAssistantMessage as RealtimeConversationItemAssistantMessage,
+ type RealtimeConversationItemFunctionCall as RealtimeConversationItemFunctionCall,
+ type RealtimeConversationItemFunctionCallOutput as RealtimeConversationItemFunctionCallOutput,
+ type RealtimeConversationItemSystemMessage as RealtimeConversationItemSystemMessage,
+ type RealtimeConversationItemUserMessage as RealtimeConversationItemUserMessage,
+ type RealtimeError as RealtimeError,
+ type RealtimeErrorEvent as RealtimeErrorEvent,
+ type RealtimeMcpApprovalRequest as RealtimeMcpApprovalRequest,
+ type RealtimeMcpApprovalResponse as RealtimeMcpApprovalResponse,
+ type RealtimeMcpListTools as RealtimeMcpListTools,
+ type RealtimeMcpProtocolError as RealtimeMcpProtocolError,
+ type RealtimeMcpToolCall as RealtimeMcpToolCall,
+ type RealtimeMcpToolExecutionError as RealtimeMcpToolExecutionError,
+ type RealtimeMcphttpError as RealtimeMcphttpError,
+ type RealtimeResponse as RealtimeResponse,
+ type RealtimeResponseStatus as RealtimeResponseStatus,
+ type RealtimeResponseUsage as RealtimeResponseUsage,
+ type RealtimeResponseUsageInputTokenDetails as RealtimeResponseUsageInputTokenDetails,
+ type RealtimeResponseUsageOutputTokenDetails as RealtimeResponseUsageOutputTokenDetails,
+ type RealtimeServerEvent as RealtimeServerEvent,
+ type RealtimeSession as RealtimeSession,
+ type RealtimeSessionCreateRequest as RealtimeSessionCreateRequest,
+ type RealtimeToolChoiceConfig as RealtimeToolChoiceConfig,
+ type RealtimeToolsConfig as RealtimeToolsConfig,
+ type RealtimeToolsConfigUnion as RealtimeToolsConfigUnion,
+ type RealtimeTracingConfig as RealtimeTracingConfig,
+ type RealtimeTranscriptionSessionCreateRequest as RealtimeTranscriptionSessionCreateRequest,
+ type RealtimeTruncation as RealtimeTruncation,
+ type ResponseAudioDeltaEvent as ResponseAudioDeltaEvent,
+ type ResponseAudioDoneEvent as ResponseAudioDoneEvent,
+ type ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent,
+ type ResponseAudioTranscriptDoneEvent as ResponseAudioTranscriptDoneEvent,
+ type ResponseCancelEvent as ResponseCancelEvent,
+ type ResponseContentPartAddedEvent as ResponseContentPartAddedEvent,
+ type ResponseContentPartDoneEvent as ResponseContentPartDoneEvent,
+ type ResponseCreateEvent as ResponseCreateEvent,
+ type ResponseCreatedEvent as ResponseCreatedEvent,
+ type ResponseDoneEvent as ResponseDoneEvent,
+ type ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent,
+ type ResponseFunctionCallArgumentsDoneEvent as ResponseFunctionCallArgumentsDoneEvent,
+ type ResponseMcpCallArgumentsDelta as ResponseMcpCallArgumentsDelta,
+ type ResponseMcpCallArgumentsDone as ResponseMcpCallArgumentsDone,
+ type ResponseMcpCallCompleted as ResponseMcpCallCompleted,
+ type ResponseMcpCallFailed as ResponseMcpCallFailed,
+ type ResponseMcpCallInProgress as ResponseMcpCallInProgress,
+ type ResponseOutputItemAddedEvent as ResponseOutputItemAddedEvent,
+ type ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent,
+ type ResponseTextDeltaEvent as ResponseTextDeltaEvent,
+ type ResponseTextDoneEvent as ResponseTextDoneEvent,
+ type SessionCreatedEvent as SessionCreatedEvent,
+ type SessionUpdateEvent as SessionUpdateEvent,
+ type SessionUpdatedEvent as SessionUpdatedEvent,
+ type TranscriptionSessionCreated as TranscriptionSessionCreated,
+ type TranscriptionSessionUpdate as TranscriptionSessionUpdate,
+ type TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent,
+ };
+
+ export {
+ ClientSecrets as ClientSecrets,
+ type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse,
+ type ClientSecretCreateResponse as ClientSecretCreateResponse,
+ type ClientSecretCreateParams as ClientSecretCreateParams,
+ };
+}
diff --git a/src/resources/responses/responses.ts b/src/resources/responses/responses.ts
index 5512b0e11..5a8f1a446 100644
--- a/src/resources/responses/responses.ts
+++ b/src/resources/responses/responses.ts
@@ -463,7 +463,7 @@ export interface Response {
* An array of tools the model may call while generating a response. You can
* specify which tool to use by setting the `tool_choice` parameter.
*
- * The two categories of tools you can provide the model are:
+ * We support the following categories of tools:
*
* - **Built-in tools**: Tools that are provided by OpenAI that extend the model's
* capabilities, like
@@ -471,6 +471,9 @@ export interface Response {
* [file search](https://platform.openai.com/docs/guides/tools-file-search).
* Learn more about
* [built-in tools](https://platform.openai.com/docs/guides/tools).
+ * - **MCP Tools**: Integrations with third-party systems via custom MCP servers or
+ * predefined connectors such as Google Drive and Notion. Learn more about
+ * [MCP Tools](https://platform.openai.com/docs/guides/tools-connectors-mcp).
* - **Function calls (custom tools)**: Functions that are defined by you, enabling
* the model to call your own code with strongly typed arguments and outputs.
* Learn more about
@@ -4654,89 +4657,15 @@ export type Tool =
| FunctionTool
| FileSearchTool
| ComputerTool
- | Tool.WebSearchTool
+ | WebSearchTool
| Tool.Mcp
| Tool.CodeInterpreter
| Tool.ImageGeneration
| Tool.LocalShell
| CustomTool
- | WebSearchTool;
+ | WebSearchPreviewTool;
export namespace Tool {
- /**
- * Search the Internet for sources related to the prompt. Learn more about the
- * [web search tool](https://platform.openai.com/docs/guides/tools-web-search).
- */
- export interface WebSearchTool {
- /**
- * The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.
- */
- type: 'web_search' | 'web_search_2025_08_26';
-
- /**
- * Filters for the search.
- */
- filters?: WebSearchTool.Filters | null;
-
- /**
- * High level guidance for the amount of context window space to use for the
- * search. One of `low`, `medium`, or `high`. `medium` is the default.
- */
- search_context_size?: 'low' | 'medium' | 'high';
-
- /**
- * The approximate location of the user.
- */
- user_location?: WebSearchTool.UserLocation | null;
- }
-
- export namespace WebSearchTool {
- /**
- * Filters for the search.
- */
- export interface Filters {
- /**
- * Allowed domains for the search. If not provided, all domains are allowed.
- * Subdomains of the provided domains are allowed as well.
- *
- * Example: `["pubmed.ncbi.nlm.nih.gov"]`
- */
-      allowed_domains?: Array<string> | null;
- }
-
- /**
- * The approximate location of the user.
- */
- export interface UserLocation {
- /**
- * Free text input for the city of the user, e.g. `San Francisco`.
- */
- city?: string | null;
-
- /**
- * The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
- * the user, e.g. `US`.
- */
- country?: string | null;
-
- /**
- * Free text input for the region of the user, e.g. `California`.
- */
- region?: string | null;
-
- /**
- * The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
- * user, e.g. `America/Los_Angeles`.
- */
- timezone?: string | null;
-
- /**
- * The type of location approximation. Always `approximate`.
- */
- type?: 'approximate';
- }
- }
-
/**
* Give the model access to additional tools via remote Model Context Protocol
* (MCP) servers.
@@ -5151,7 +5080,7 @@ export interface ToolChoiceTypes {
* about the
* [web search tool](https://platform.openai.com/docs/guides/tools-web-search).
*/
-export interface WebSearchTool {
+export interface WebSearchPreviewTool {
/**
* The type of the web search tool. One of `web_search_preview` or
* `web_search_preview_2025_03_11`.
@@ -5167,10 +5096,10 @@ export interface WebSearchTool {
/**
* The user's location.
*/
- user_location?: WebSearchTool.UserLocation | null;
+ user_location?: WebSearchPreviewTool.UserLocation | null;
}
-export namespace WebSearchTool {
+export namespace WebSearchPreviewTool {
/**
* The user's location.
*/
@@ -5204,6 +5133,80 @@ export namespace WebSearchTool {
}
}
+/**
+ * Search the Internet for sources related to the prompt. Learn more about the
+ * [web search tool](https://platform.openai.com/docs/guides/tools-web-search).
+ */
+export interface WebSearchTool {
+ /**
+ * The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.
+ */
+ type: 'web_search' | 'web_search_2025_08_26';
+
+ /**
+ * Filters for the search.
+ */
+ filters?: WebSearchTool.Filters | null;
+
+ /**
+ * High level guidance for the amount of context window space to use for the
+ * search. One of `low`, `medium`, or `high`. `medium` is the default.
+ */
+ search_context_size?: 'low' | 'medium' | 'high';
+
+ /**
+ * The approximate location of the user.
+ */
+ user_location?: WebSearchTool.UserLocation | null;
+}
+
+export namespace WebSearchTool {
+ /**
+ * Filters for the search.
+ */
+ export interface Filters {
+ /**
+ * Allowed domains for the search. If not provided, all domains are allowed.
+ * Subdomains of the provided domains are allowed as well.
+ *
+ * Example: `["pubmed.ncbi.nlm.nih.gov"]`
+ */
+    allowed_domains?: Array<string> | null;
+ }
+
+ /**
+ * The approximate location of the user.
+ */
+ export interface UserLocation {
+ /**
+ * Free text input for the city of the user, e.g. `San Francisco`.
+ */
+ city?: string | null;
+
+ /**
+ * The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
+ * the user, e.g. `US`.
+ */
+ country?: string | null;
+
+ /**
+ * Free text input for the region of the user, e.g. `California`.
+ */
+ region?: string | null;
+
+ /**
+ * The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
+ * user, e.g. `America/Los_Angeles`.
+ */
+ timezone?: string | null;
+
+ /**
+ * The type of location approximation. Always `approximate`.
+ */
+ type?: 'approximate';
+ }
+}
+
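With this split, `web_search` is the current top-level tool type while the `web_search_preview` variants live on the renamed `WebSearchPreviewTool`. A hedged usage sketch with the Responses API follows; the model name and domain filter are illustrative.

```ts
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  const response = await client.responses.create({
    model: 'gpt-4.1', // illustrative model choice
    input: 'Summarize the latest guidance on sleep from the WHO.',
    tools: [
      {
        type: 'web_search',
        search_context_size: 'medium',
        filters: { allowed_domains: ['who.int'] },
        user_location: { type: 'approximate', country: 'US' },
      },
    ],
  });

  console.log(response.output_text);
}

main();
```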
export type ResponseCreateParams = ResponseCreateParamsNonStreaming | ResponseCreateParamsStreaming;
export interface ResponseCreateParamsBase {
@@ -5410,7 +5413,7 @@ export interface ResponseCreateParamsBase {
* An array of tools the model may call while generating a response. You can
* specify which tool to use by setting the `tool_choice` parameter.
*
- * The two categories of tools you can provide the model are:
+ * We support the following categories of tools:
*
* - **Built-in tools**: Tools that are provided by OpenAI that extend the model's
* capabilities, like
@@ -5418,6 +5421,9 @@ export interface ResponseCreateParamsBase {
* [file search](https://platform.openai.com/docs/guides/tools-file-search).
* Learn more about
* [built-in tools](https://platform.openai.com/docs/guides/tools).
+ * - **MCP Tools**: Integrations with third-party systems via custom MCP servers or
+ * predefined connectors such as Google Drive and Notion. Learn more about
+ * [MCP Tools](https://platform.openai.com/docs/guides/tools-connectors-mcp).
* - **Function calls (custom tools)**: Functions that are defined by you, enabling
* the model to call your own code with strongly typed arguments and outputs.
* Learn more about
@@ -5673,6 +5679,7 @@ export declare namespace Responses {
type ToolChoiceMcp as ToolChoiceMcp,
type ToolChoiceOptions as ToolChoiceOptions,
type ToolChoiceTypes as ToolChoiceTypes,
+ type WebSearchPreviewTool as WebSearchPreviewTool,
type WebSearchTool as WebSearchTool,
type ResponseCreateParams as ResponseCreateParams,
type ResponseCreateParamsNonStreaming as ResponseCreateParamsNonStreaming,
diff --git a/src/resources/webhooks.ts b/src/resources/webhooks.ts
index fa337478b..7449d0830 100644
--- a/src/resources/webhooks.ts
+++ b/src/resources/webhooks.ts
@@ -559,6 +559,70 @@ export namespace FineTuningJobSucceededWebhookEvent {
}
}
+/**
+ * Sent when the Realtime API receives an incoming SIP call.
+ */
+export interface RealtimeCallIncomingWebhookEvent {
+ /**
+ * The unique ID of the event.
+ */
+ id: string;
+
+ /**
+   * The Unix timestamp (in seconds) of when the event was created.
+ */
+ created_at: number;
+
+ /**
+ * Event data payload.
+ */
+ data: RealtimeCallIncomingWebhookEvent.Data;
+
+ /**
+ * The type of the event. Always `realtime.call.incoming`.
+ */
+ type: 'realtime.call.incoming';
+
+ /**
+ * The object of the event. Always `event`.
+ */
+ object?: 'event';
+}
+
+export namespace RealtimeCallIncomingWebhookEvent {
+ /**
+ * Event data payload.
+ */
+ export interface Data {
+ /**
+ * The unique ID of this call.
+ */
+ call_id: string;
+
+ /**
+ * Headers from the SIP Invite.
+ */
+    sip_headers: Array<Data.SipHeader>;
+ }
+
+ export namespace Data {
+ /**
+ * A header from the SIP Invite.
+ */
+ export interface SipHeader {
+ /**
+ * Name of the SIP Header.
+ */
+ name: string;
+
+ /**
+ * Value of the SIP Header.
+ */
+ value: string;
+ }
+ }
+}
+
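A sketch of narrowing the unwrapped webhook union to this new event; signature verification and unwrapping are assumed to happen upstream, the import path is assumed, and the header lookup is illustrative.

```ts
import type { UnwrapWebhookEvent } from 'openai/resources/webhooks'; // path assumed

function handleWebhook(event: UnwrapWebhookEvent): void {
  if (event.type === 'realtime.call.incoming') {
    const from = event.data.sip_headers.find((h) => h.name === 'From')?.value;
    console.log(`Incoming SIP call ${event.data.call_id} from ${from ?? 'unknown caller'}`);
    // Accept, reject, or route the call from here.
  }
}
```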
/**
* Sent when a background response has been cancelled.
*/
@@ -741,6 +805,7 @@ export type UnwrapWebhookEvent =
| FineTuningJobCancelledWebhookEvent
| FineTuningJobFailedWebhookEvent
| FineTuningJobSucceededWebhookEvent
+ | RealtimeCallIncomingWebhookEvent
| ResponseCancelledWebhookEvent
| ResponseCompletedWebhookEvent
| ResponseFailedWebhookEvent
@@ -758,6 +823,7 @@ export declare namespace Webhooks {
type FineTuningJobCancelledWebhookEvent as FineTuningJobCancelledWebhookEvent,
type FineTuningJobFailedWebhookEvent as FineTuningJobFailedWebhookEvent,
type FineTuningJobSucceededWebhookEvent as FineTuningJobSucceededWebhookEvent,
+ type RealtimeCallIncomingWebhookEvent as RealtimeCallIncomingWebhookEvent,
type ResponseCancelledWebhookEvent as ResponseCancelledWebhookEvent,
type ResponseCompletedWebhookEvent as ResponseCompletedWebhookEvent,
type ResponseFailedWebhookEvent as ResponseFailedWebhookEvent,
diff --git a/src/version.ts b/src/version.ts
index cf8aa5418..02ab094c5 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1 +1 @@
-export const VERSION = '5.16.0'; // x-release-please-version
+export const VERSION = '5.17.0'; // x-release-please-version
diff --git a/tests/api-resources/beta/realtime/transcription-sessions.test.ts b/tests/api-resources/beta/realtime/transcription-sessions.test.ts
deleted file mode 100644
index 2c7cbbb15..000000000
--- a/tests/api-resources/beta/realtime/transcription-sessions.test.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-import OpenAI from 'openai';
-
-const client = new OpenAI({
- apiKey: 'My API Key',
- baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010',
-});
-
-describe('resource transcriptionSessions', () => {
- test('create', async () => {
- const responsePromise = client.beta.realtime.transcriptionSessions.create({});
- const rawResponse = await responsePromise.asResponse();
- expect(rawResponse).toBeInstanceOf(Response);
- const response = await responsePromise;
- expect(response).not.toBeInstanceOf(Response);
- const dataAndResponse = await responsePromise.withResponse();
- expect(dataAndResponse.data).toBe(response);
- expect(dataAndResponse.response).toBe(rawResponse);
- });
-});
diff --git a/tests/api-resources/beta/realtime/sessions.test.ts b/tests/api-resources/realtime/client-secrets.test.ts
similarity index 86%
rename from tests/api-resources/beta/realtime/sessions.test.ts
rename to tests/api-resources/realtime/client-secrets.test.ts
index 1a75a532c..105cdfe7f 100644
--- a/tests/api-resources/beta/realtime/sessions.test.ts
+++ b/tests/api-resources/realtime/client-secrets.test.ts
@@ -7,9 +7,9 @@ const client = new OpenAI({
baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010',
});
-describe('resource sessions', () => {
+describe('resource clientSecrets', () => {
test('create', async () => {
- const responsePromise = client.beta.realtime.sessions.create({});
+ const responsePromise = client.realtime.clientSecrets.create({});
const rawResponse = await responsePromise.asResponse();
expect(rawResponse).toBeInstanceOf(Response);
const response = await responsePromise;