diff --git a/.release-please-manifest.json b/.release-please-manifest.json index a0c8c3e19..7443ba284 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "5.16.0" + ".": "5.17.0" } diff --git a/.stats.yml b/.stats.yml index 5ad90ac5a..ebe81d146 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ -configured_endpoints: 119 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-8517ffa1004e31ca2523d617629e64be6fe4f13403ddfd9db5b3be002656cbde.yml -openapi_spec_hash: b64dd8c8b23082a7aa2a3e5c5fffd8bd -config_hash: fe0ea26680ac2075a6cd66416aefe7db +configured_endpoints: 118 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-356b4364203ff36d7724074cd04f6e684253bfcc3c9d969122d730aa7bc51b46.yml +openapi_spec_hash: 4ab8e96f52699bc3d2b0c4432aa92af8 +config_hash: b854932c0ea24b400bdd64e4376936bd diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dd01aa0c..c358929fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## 5.17.0 (2025-09-02) + +Full Changelog: [v5.16.0...v5.17.0](https://github.com/openai/openai-node/compare/v5.16.0...v5.17.0) + +### Features + +* **api:** realtime API updates ([e817255](https://github.com/openai/openai-node/commit/e817255e6ff9e3ad6bd08b001644c335e0459537)) + + +### Chores + +* **internal:** update global Error reference ([e566ff3](https://github.com/openai/openai-node/commit/e566ff321642a100756224b75a67d44e262e5bea)) + ## 5.16.0 (2025-08-26) Full Changelog: [v5.15.0...v5.16.0](https://github.com/openai/openai-node/compare/v5.15.0...v5.16.0) diff --git a/README.md b/README.md index 9864a4829..351d9c0dc 100644 --- a/README.md +++ b/README.md @@ -264,14 +264,14 @@ const { data: stream, request_id } = await openai.chat.completions .withResponse(); ``` -## Realtime API Beta +## Realtime API The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a `WebSocket` connection. ```ts -import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket'; +import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket'; -const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' }); +const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' }); rt.on('response.text.delta', (event) => process.stdout.write(event.delta)); ``` @@ -401,14 +401,14 @@ while (page.hasNextPage()) { } ``` -## Realtime API Beta +## Realtime API The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a `WebSocket` connection. 
```ts -import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket'; +import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket'; -const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' }); +const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' }); rt.on('response.text.delta', (event) => process.stdout.write(event.delta)); ``` diff --git a/api.md b/api.md index d133b6162..e8a4c861d 100644 --- a/api.md +++ b/api.md @@ -381,6 +381,7 @@ Types: - FineTuningJobCancelledWebhookEvent - FineTuningJobFailedWebhookEvent - FineTuningJobSucceededWebhookEvent +- RealtimeCallIncomingWebhookEvent - ResponseCancelledWebhookEvent - ResponseCompletedWebhookEvent - ResponseFailedWebhookEvent @@ -751,6 +752,7 @@ Types: - ToolChoiceMcp - ToolChoiceOptions - ToolChoiceTypes +- WebSearchPreviewTool - WebSearchTool Methods: @@ -770,6 +772,110 @@ Methods: - client.responses.inputItems.list(responseID, { ...params }) -> ResponseItemsPage +# Realtime + +Types: + +- ConversationCreatedEvent +- ConversationItem +- ConversationItemAdded +- ConversationItemCreateEvent +- ConversationItemCreatedEvent +- ConversationItemDeleteEvent +- ConversationItemDeletedEvent +- ConversationItemDone +- ConversationItemInputAudioTranscriptionCompletedEvent +- ConversationItemInputAudioTranscriptionDeltaEvent +- ConversationItemInputAudioTranscriptionFailedEvent +- ConversationItemInputAudioTranscriptionSegment +- ConversationItemRetrieveEvent +- ConversationItemTruncateEvent +- ConversationItemTruncatedEvent +- ConversationItemWithReference +- InputAudioBufferAppendEvent +- InputAudioBufferClearEvent +- InputAudioBufferClearedEvent +- InputAudioBufferCommitEvent +- InputAudioBufferCommittedEvent +- InputAudioBufferSpeechStartedEvent +- InputAudioBufferSpeechStoppedEvent +- InputAudioBufferTimeoutTriggered +- LogProbProperties +- McpListToolsCompleted +- McpListToolsFailed +- McpListToolsInProgress +- OutputAudioBufferClearEvent +- RateLimitsUpdatedEvent +- RealtimeAudioConfig +- RealtimeClientEvent +- RealtimeClientSecretConfig +- RealtimeConversationItemAssistantMessage +- RealtimeConversationItemFunctionCall +- RealtimeConversationItemFunctionCallOutput +- RealtimeConversationItemSystemMessage +- RealtimeConversationItemUserMessage +- RealtimeError +- RealtimeErrorEvent +- RealtimeMcpApprovalRequest +- RealtimeMcpApprovalResponse +- RealtimeMcpListTools +- RealtimeMcpProtocolError +- RealtimeMcpToolCall +- RealtimeMcpToolExecutionError +- RealtimeMcphttpError +- RealtimeResponse +- RealtimeResponseStatus +- RealtimeResponseUsage +- RealtimeResponseUsageInputTokenDetails +- RealtimeResponseUsageOutputTokenDetails +- RealtimeServerEvent +- RealtimeSession +- RealtimeSessionCreateRequest +- RealtimeToolChoiceConfig +- RealtimeToolsConfig +- RealtimeToolsConfigUnion +- RealtimeTracingConfig +- RealtimeTranscriptionSessionCreateRequest +- RealtimeTruncation +- ResponseAudioDeltaEvent +- ResponseAudioDoneEvent +- ResponseAudioTranscriptDeltaEvent +- ResponseAudioTranscriptDoneEvent +- ResponseCancelEvent +- ResponseContentPartAddedEvent +- ResponseContentPartDoneEvent +- ResponseCreateEvent +- ResponseCreatedEvent +- ResponseDoneEvent +- ResponseFunctionCallArgumentsDeltaEvent +- ResponseFunctionCallArgumentsDoneEvent +- ResponseMcpCallArgumentsDelta +- ResponseMcpCallArgumentsDone +- ResponseMcpCallCompleted +- ResponseMcpCallFailed +- ResponseMcpCallInProgress +- ResponseOutputItemAddedEvent +- ResponseOutputItemDoneEvent +- ResponseTextDeltaEvent +- ResponseTextDoneEvent +- 
SessionCreatedEvent +- SessionUpdateEvent +- SessionUpdatedEvent +- TranscriptionSessionCreated +- TranscriptionSessionUpdate +- TranscriptionSessionUpdatedEvent + +## ClientSecrets + +Types: + +- RealtimeSessionCreateResponse +- ClientSecretCreateResponse + +Methods: + +- client.realtime.clientSecrets.create({ ...params }) -> ClientSecretCreateResponse + # Conversations Types: diff --git a/examples/azure/realtime/websocket.ts b/examples/azure/realtime/websocket.ts index 91fe3b7b9..146f7f94e 100644 --- a/examples/azure/realtime/websocket.ts +++ b/examples/azure/realtime/websocket.ts @@ -1,4 +1,4 @@ -import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket'; +import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket'; import { AzureOpenAI } from 'openai'; import { DefaultAzureCredential, getBearerTokenProvider } from '@azure/identity'; import 'dotenv/config'; @@ -21,8 +21,9 @@ async function main() { rt.send({ type: 'session.update', session: { - modalities: ['text'], + output_modalities: ['text'], model: 'gpt-4o-realtime-preview', + type: 'realtime', }, }); @@ -49,8 +50,8 @@ async function main() { console.log(); }); - rt.on('response.text.delta', (event) => process.stdout.write(event.delta)); - rt.on('response.text.done', () => console.log()); + rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta)); + rt.on('response.output_text.done', () => console.log()); rt.on('response.done', () => rt.close()); diff --git a/examples/azure/realtime/ws.ts b/examples/azure/realtime/ws.ts index 8b22aeef0..83f8c6297 100644 --- a/examples/azure/realtime/ws.ts +++ b/examples/azure/realtime/ws.ts @@ -1,5 +1,5 @@ import { DefaultAzureCredential, getBearerTokenProvider } from '@azure/identity'; -import { OpenAIRealtimeWS } from 'openai/beta/realtime/ws'; +import { OpenAIRealtimeWS } from 'openai/realtime/ws'; import { AzureOpenAI } from 'openai'; import 'dotenv/config'; @@ -21,8 +21,9 @@ async function main() { rt.send({ type: 'session.update', session: { - modalities: ['text'], + output_modalities: ['text'], model: 'gpt-4o-realtime-preview', + type: 'realtime', }, }); @@ -49,8 +50,8 @@ async function main() { console.log(); }); - rt.on('response.text.delta', (event) => process.stdout.write(event.delta)); - rt.on('response.text.done', () => console.log()); + rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta)); + rt.on('response.output_text.done', () => console.log()); rt.on('response.done', () => rt.close()); diff --git a/examples/realtime/websocket.ts b/examples/realtime/websocket.ts index 6fb4740af..bf61db9ac 100644 --- a/examples/realtime/websocket.ts +++ b/examples/realtime/websocket.ts @@ -1,7 +1,7 @@ -import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket'; +import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket'; async function main() { - const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' }); + const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' }); // access the underlying `ws.WebSocket` instance rt.socket.addEventListener('open', () => { @@ -9,8 +9,9 @@ async function main() { rt.send({ type: 'session.update', session: { - modalities: ['text'], + output_modalities: ['text'], model: 'gpt-4o-realtime-preview', + type: 'realtime', }, }); @@ -37,8 +38,8 @@ async function main() { console.log(); }); - rt.on('response.text.delta', (event) => process.stdout.write(event.delta)); - rt.on('response.text.done', () => console.log()); + 
rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta)); + rt.on('response.output_text.done', () => console.log()); rt.on('response.done', () => rt.close()); diff --git a/examples/realtime/ws.ts b/examples/realtime/ws.ts index 6cc950b76..ba22e262a 100644 --- a/examples/realtime/ws.ts +++ b/examples/realtime/ws.ts @@ -1,7 +1,7 @@ -import { OpenAIRealtimeWS } from 'openai/beta/realtime/ws'; +import { OpenAIRealtimeWS } from 'openai/realtime/ws'; async function main() { - const rt = new OpenAIRealtimeWS({ model: 'gpt-4o-realtime-preview-2024-12-17' }); + const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' }); // access the underlying `ws.WebSocket` instance rt.socket.on('open', () => { @@ -9,8 +9,9 @@ async function main() { rt.send({ type: 'session.update', session: { - modalities: ['text'], + output_modalities: ['text'], model: 'gpt-4o-realtime-preview', + type: 'realtime', }, }); @@ -37,8 +38,8 @@ async function main() { console.log(); }); - rt.on('response.text.delta', (event) => process.stdout.write(event.delta)); - rt.on('response.text.done', () => console.log()); + rt.on('response.output_text.delta', (event) => process.stdout.write(event.delta)); + rt.on('response.output_text.done', () => console.log()); rt.on('response.done', () => rt.close()); diff --git a/jsr.json b/jsr.json index 2996d8f66..cf46e84e3 100644 --- a/jsr.json +++ b/jsr.json @@ -1,6 +1,6 @@ { "name": "@openai/openai", - "version": "5.16.0", + "version": "5.17.0", "exports": { ".": "./index.ts", "./helpers/zod": "./helpers/zod.ts", diff --git a/jsr.json.orig b/jsr.json.orig index 3e7c40d5f..30eac2430 100644 --- a/jsr.json.orig +++ b/jsr.json.orig @@ -5,6 +5,7 @@ ".": "./index.ts", "./helpers/zod": "./helpers/zod.ts", "./beta/realtime/websocket": "./beta/realtime/websocket.ts" + "./realtime/websocket": "./realtime/websocket.ts" }, "imports": { "zod": "npm:zod@3" diff --git a/package.json b/package.json index b3a4f4685..ccff023c1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "openai", - "version": "5.16.0", + "version": "5.17.0", "description": "The official TypeScript library for the OpenAI API", "author": "OpenAI ", "types": "dist/index.d.ts", diff --git a/realtime.md b/realtime.md index 9842ad453..1f47600e4 100644 --- a/realtime.md +++ b/realtime.md @@ -1,4 +1,4 @@ -## Realtime API beta +## Realtime API The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a `WebSocket` connection. 
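This patch also adds a `client.realtime.clientSecrets.create()` method (see the `ClientSecrets` section of `api.md` above), but none of the updated examples exercise it. Below is a minimal sketch of how it might be called, assuming the `session` payload mirrors the `session.update` bodies used in the updated examples; the field values are illustrative only, taken from the `ClientSecretCreateParams` shape introduced in this diff.

```ts
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  // Mint an ephemeral client secret that a browser client can use
  // instead of a long-lived API key (shape per ClientSecretCreateParams).
  const secret = await client.realtime.clientSecrets.create({
    expires_after: { anchor: 'created_at', seconds: 600 },
    session: {
      type: 'realtime',
      model: 'gpt-realtime',
      output_modalities: ['text'],
    },
  });

  console.log(secret.value, 'expires at', secret.expires_at);
}

main();
```

If the returned `value` is an `ek_`-prefixed ephemeral key (the prefix the new `OpenAIRealtimeWebSocket` constructor checks for in this diff), passing it as the API key enables `dangerouslyAllowBrowser` automatically for a browser-side realtime connection.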
@@ -10,9 +10,9 @@ Basic text based example with `ws`: ```ts // requires `yarn add ws @types/ws` -import { OpenAIRealtimeWS } from 'openai/beta/realtime/ws'; +import { OpenAIRealtimeWS } from 'openai/realtime/ws'; -const rt = new OpenAIRealtimeWS({ model: 'gpt-4o-realtime-preview-2024-12-17' }); +const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' }); // access the underlying `ws.WebSocket` instance rt.socket.on('open', () => { @@ -59,9 +59,9 @@ rt.socket.on('close', () => console.log('\nConnection closed!')); To use the web API `WebSocket` implementation, replace `OpenAIRealtimeWS` with `OpenAIRealtimeWebSocket` and adjust any `rt.socket` access: ```ts -import { OpenAIRealtimeWebSocket } from 'openai/beta/realtime/websocket'; +import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket'; -const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-4o-realtime-preview-2024-12-17' }); +const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' }); // ... rt.socket.addEventListener('open', () => { // ... @@ -77,7 +77,7 @@ When an error is encountered, either on the client side or returned from the ser It is **highly recommended** that you register an `error` event listener and handle errors appropriately as typically the underlying connection is still usable. ```ts -const rt = new OpenAIRealtimeWS({ model: 'gpt-4o-realtime-preview-2024-12-17' }); +const rt = new OpenAIRealtimeWS({ model: 'gpt-realtime' }); rt.on('error', (err) => { // in a real world scenario this should be logged somewhere as you // likely want to continue processing events regardless of any errors diff --git a/scripts/detect-breaking-changes b/scripts/detect-breaking-changes index 9f5a00452..85607de43 100755 --- a/scripts/detect-breaking-changes +++ b/scripts/detect-breaking-changes @@ -44,6 +44,8 @@ TEST_PATHS=( tests/api-resources/uploads/parts.test.ts tests/api-resources/responses/responses.test.ts tests/api-resources/responses/input-items.test.ts + tests/api-resources/realtime/realtime.test.ts + tests/api-resources/realtime/client-secrets.test.ts tests/api-resources/conversations/conversations.test.ts tests/api-resources/conversations/items.test.ts tests/api-resources/evals/evals.test.ts diff --git a/src/client.ts b/src/client.ts index 78e29664a..a853d2890 100644 --- a/src/client.ts +++ b/src/client.ts @@ -121,6 +121,7 @@ import { } from './resources/evals/evals'; import { FineTuning } from './resources/fine-tuning/fine-tuning'; import { Graders } from './resources/graders/graders'; +import { Realtime } from './resources/realtime/realtime'; import { Responses } from './resources/responses/responses'; import { Upload, @@ -574,7 +575,7 @@ export class OpenAI { const response = await this.fetchWithTimeout(url, req, timeout, controller).catch(castToError); const headersTime = Date.now(); - if (response instanceof Error) { + if (response instanceof globalThis.Error) { const retryMessage = `retrying, ${retriesRemaining} attempts remaining`; if (options.signal?.aborted) { throw new Errors.APIUserAbortError(); @@ -962,6 +963,7 @@ export class OpenAI { batches: API.Batches = new API.Batches(this); uploads: API.Uploads = new API.Uploads(this); responses: API.Responses = new API.Responses(this); + realtime: API.Realtime = new API.Realtime(this); conversations: API.Conversations = new API.Conversations(this); evals: API.Evals = new API.Evals(this); containers: API.Containers = new API.Containers(this); @@ -983,6 +985,7 @@ OpenAI.Beta = Beta; OpenAI.Batches = Batches; OpenAI.Uploads = UploadsAPIUploads; OpenAI.Responses = 
Responses; +OpenAI.Realtime = Realtime; OpenAI.Conversations = Conversations; OpenAI.Evals = Evals; OpenAI.Containers = Containers; @@ -1165,6 +1168,8 @@ export declare namespace OpenAI { export { Responses as Responses }; + export { Realtime as Realtime }; + export { Conversations as Conversations }; export { diff --git a/src/realtime/index.ts b/src/realtime/index.ts new file mode 100644 index 000000000..75f0f3088 --- /dev/null +++ b/src/realtime/index.ts @@ -0,0 +1 @@ +export { OpenAIRealtimeError } from './internal-base'; diff --git a/src/realtime/internal-base.ts b/src/realtime/internal-base.ts new file mode 100644 index 000000000..92cc1d1c6 --- /dev/null +++ b/src/realtime/internal-base.ts @@ -0,0 +1,98 @@ +import { + RealtimeClientEvent, + RealtimeServerEvent, + RealtimeErrorEvent, + RealtimeError, +} from '../resources/realtime/realtime'; +import { EventEmitter } from '../lib/EventEmitter'; +import { OpenAIError } from '../error'; +import OpenAI, { AzureOpenAI } from '../index'; + +export class OpenAIRealtimeError extends OpenAIError { + /** + * The error data that the API sent back in an `error` event. + */ + error?: RealtimeError | undefined; + + /** + * The unique ID of the server event. + */ + event_id?: string | undefined; + + constructor(message: string, event: RealtimeErrorEvent | null) { + super(message); + + this.error = event?.error; + this.event_id = event?.event_id; + } +} + +type Simplify = { [KeyType in keyof T]: T[KeyType] } & {}; + +type RealtimeEvents = Simplify< + { + event: (event: RealtimeServerEvent) => void; + error: (error: OpenAIRealtimeError) => void; + } & { + [EventType in Exclude]: ( + event: Extract, + ) => unknown; + } +>; + +export abstract class OpenAIRealtimeEmitter extends EventEmitter { + /** + * Send an event to the API. + */ + abstract send(event: RealtimeClientEvent): void; + + /** + * Close the websocket connection. + */ + abstract close(props?: { code: number; reason: string }): void; + + protected _onError(event: null, message: string, cause: any): void; + protected _onError(event: RealtimeErrorEvent, message?: string | undefined): void; + protected _onError(event: RealtimeErrorEvent | null, message?: string | undefined, cause?: any): void { + message = + event?.error ? + `${event.error.message} code=${event.error.code} param=${event.error.param} type=${event.error.type} event_id=${event.error.event_id}` + : message ?? 'unknown error'; + + if (!this._hasListener('error')) { + const error = new OpenAIRealtimeError( + message + + `\n\nTo resolve these unhandled rejection errors you should bind an \`error\` callback, e.g. \`rt.on('error', (error) => ...)\` `, + event, + ); + // @ts-ignore + error.cause = cause; + Promise.reject(error); + return; + } + + const error = new OpenAIRealtimeError(message, event); + // @ts-ignore + error.cause = cause; + + this._emit('error', error); + } +} + +export function isAzure(client: Pick): client is AzureOpenAI { + return client instanceof AzureOpenAI; +} + +export function buildRealtimeURL(client: Pick, model: string): URL { + const path = '/realtime'; + const baseURL = client.baseURL; + const url = new URL(baseURL + (baseURL.endsWith('/') ? 
path.slice(1) : path)); + url.protocol = 'wss'; + if (isAzure(client)) { + url.searchParams.set('api-version', client.apiVersion); + url.searchParams.set('deployment', model); + } else { + url.searchParams.set('model', model); + } + return url; +} diff --git a/src/realtime/websocket.ts b/src/realtime/websocket.ts new file mode 100644 index 000000000..c83b2cf05 --- /dev/null +++ b/src/realtime/websocket.ts @@ -0,0 +1,142 @@ +import { AzureOpenAI, OpenAI } from '../index'; +import { OpenAIError } from '../error'; +import type { RealtimeClientEvent, RealtimeServerEvent } from '../resources/realtime/realtime'; +import { OpenAIRealtimeEmitter, buildRealtimeURL, isAzure } from './internal-base'; +import { isRunningInBrowser } from '../internal/detect-platform'; + +interface MessageEvent { + data: string; +} + +type _WebSocket = + typeof globalThis extends ( + { + WebSocket: infer ws extends abstract new (...args: any) => any; + } + ) ? + // @ts-ignore + InstanceType + : any; + +export class OpenAIRealtimeWebSocket extends OpenAIRealtimeEmitter { + url: URL; + socket: _WebSocket; + + constructor( + props: { + model: string; + dangerouslyAllowBrowser?: boolean; + /** + * Callback to mutate the URL, needed for Azure. + * @internal + */ + onURL?: (url: URL) => void; + }, + client?: Pick, + ) { + super(); + + const dangerouslyAllowBrowser = + props.dangerouslyAllowBrowser ?? + (client as any)?._options?.dangerouslyAllowBrowser ?? + (client?.apiKey.startsWith('ek_') ? true : null); + + if (!dangerouslyAllowBrowser && isRunningInBrowser()) { + throw new OpenAIError( + "It looks like you're running in a browser-like environment.\n\nThis is disabled by default, as it risks exposing your secret API credentials to attackers.\n\nYou can avoid this error by creating an ephemeral session token:\nhttps://platform.openai.com/docs/api-reference/realtime-sessions\n", + ); + } + + client ??= new OpenAI({ dangerouslyAllowBrowser }); + + this.url = buildRealtimeURL(client, props.model); + props.onURL?.(this.url); + + // @ts-ignore + this.socket = new WebSocket(this.url.toString(), [ + 'realtime', + ...(isAzure(client) ? [] : [`openai-insecure-api-key.${client.apiKey}`]), + ]); + + this.socket.addEventListener('message', (websocketEvent: MessageEvent) => { + const event = (() => { + try { + return JSON.parse(websocketEvent.data.toString()) as RealtimeServerEvent; + } catch (err) { + this._onError(null, 'could not parse websocket event', err); + return null; + } + })(); + + if (event) { + this._emit('event', event); + + if (event.type === 'error') { + this._onError(event); + } else { + // @ts-expect-error TS isn't smart enough to get the relationship right here + this._emit(event.type, event); + } + } + }); + + this.socket.addEventListener('error', (event: any) => { + this._onError(null, event.message, null); + }); + + if (isAzure(client)) { + if (this.url.searchParams.get('Authorization') !== null) { + this.url.searchParams.set('Authorization', ''); + } else { + this.url.searchParams.set('api-key', ''); + } + } + } + + static async azure( + client: Pick, + options: { deploymentName?: string; dangerouslyAllowBrowser?: boolean } = {}, + ): Promise { + const token = await client._getAzureADToken(); + function onURL(url: URL) { + if (client.apiKey !== '') { + url.searchParams.set('api-key', client.apiKey); + } else { + if (token) { + url.searchParams.set('Authorization', `Bearer ${token}`); + } else { + throw new Error('AzureOpenAI is not instantiated correctly. 
No API key or token provided.'); + } + } + } + const deploymentName = options.deploymentName ?? client.deploymentName; + if (!deploymentName) { + throw new Error('No deployment name provided'); + } + const { dangerouslyAllowBrowser } = options; + return new OpenAIRealtimeWebSocket( + { + model: deploymentName, + onURL, + ...(dangerouslyAllowBrowser ? { dangerouslyAllowBrowser } : {}), + }, + client, + ); + } + + send(event: RealtimeClientEvent) { + try { + this.socket.send(JSON.stringify(event)); + } catch (err) { + this._onError(null, 'could not send data', err); + } + } + + close(props?: { code: number; reason: string }) { + try { + this.socket.close(props?.code ?? 1000, props?.reason ?? 'OK'); + } catch (err) { + this._onError(null, 'could not close the connection', err); + } + } +} diff --git a/src/realtime/ws.ts b/src/realtime/ws.ts new file mode 100644 index 000000000..5226d6601 --- /dev/null +++ b/src/realtime/ws.ts @@ -0,0 +1,95 @@ +import * as WS from 'ws'; +import { AzureOpenAI, OpenAI } from '../index'; +import type { RealtimeClientEvent, RealtimeServerEvent } from '../resources/realtime/realtime'; +import { OpenAIRealtimeEmitter, buildRealtimeURL, isAzure } from './internal-base'; + +export class OpenAIRealtimeWS extends OpenAIRealtimeEmitter { + url: URL; + socket: WS.WebSocket; + + constructor( + props: { model: string; options?: WS.ClientOptions | undefined }, + client?: Pick, + ) { + super(); + client ??= new OpenAI(); + + this.url = buildRealtimeURL(client, props.model); + this.socket = new WS.WebSocket(this.url, { + ...props.options, + headers: { + ...props.options?.headers, + ...(isAzure(client) ? {} : { Authorization: `Bearer ${client.apiKey}` }), + }, + }); + + this.socket.on('message', (wsEvent) => { + const event = (() => { + try { + return JSON.parse(wsEvent.toString()) as RealtimeServerEvent; + } catch (err) { + this._onError(null, 'could not parse websocket event', err); + return null; + } + })(); + + if (event) { + this._emit('event', event); + + if (event.type === 'error') { + this._onError(event); + } else { + // @ts-expect-error TS isn't smart enough to get the relationship right here + this._emit(event.type, event); + } + } + }); + + this.socket.on('error', (err) => { + this._onError(null, err.message, err); + }); + } + + static async azure( + client: Pick, + options: { deploymentName?: string; options?: WS.ClientOptions | undefined } = {}, + ): Promise { + const deploymentName = options.deploymentName ?? client.deploymentName; + if (!deploymentName) { + throw new Error('No deployment name provided'); + } + return new OpenAIRealtimeWS( + { model: deploymentName, options: { headers: await getAzureHeaders(client) } }, + client, + ); + } + + send(event: RealtimeClientEvent) { + try { + this.socket.send(JSON.stringify(event)); + } catch (err) { + this._onError(null, 'could not send data', err); + } + } + + close(props?: { code: number; reason: string }) { + try { + this.socket.close(props?.code ?? 1000, props?.reason ?? 'OK'); + } catch (err) { + this._onError(null, 'could not close the connection', err); + } + } +} + +async function getAzureHeaders(client: Pick) { + if (client.apiKey !== '') { + return { 'api-key': client.apiKey }; + } else { + const token = await client._getAzureADToken(); + if (token) { + return { Authorization: `Bearer ${token}` }; + } else { + throw new Error('AzureOpenAI is not instantiated correctly. 
No API key or token provided.'); + } + } +} diff --git a/src/resources/audio/speech.ts b/src/resources/audio/speech.ts index f533a558b..e68e806e0 100644 --- a/src/resources/audio/speech.ts +++ b/src/resources/audio/speech.ts @@ -51,7 +51,18 @@ export interface SpeechCreateParams { * `verse`. Previews of the voices are available in the * [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options). */ - voice: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; + voice: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; /** * Control the voice of your generated audio with additional instructions. Does not diff --git a/src/resources/beta/realtime/realtime.ts b/src/resources/beta/realtime/realtime.ts index 4635c6762..b7fe85dc0 100644 --- a/src/resources/beta/realtime/realtime.ts +++ b/src/resources/beta/realtime/realtime.ts @@ -17,6 +17,9 @@ import { TranscriptionSessions, } from './transcription-sessions'; +/** + * @deprecated Realtime has now launched and is generally available. The old beta API is now deprecated. + */ export class Realtime extends APIResource { sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client); transcriptionSessions: TranscriptionSessionsAPI.TranscriptionSessions = diff --git a/src/resources/chat/completions/completions.ts b/src/resources/chat/completions/completions.ts index a71e574e9..17269f25b 100644 --- a/src/resources/chat/completions/completions.ts +++ b/src/resources/chat/completions/completions.ts @@ -489,7 +489,18 @@ export interface ChatCompletionAudioParam { * The voice the model uses to respond. Supported voices are `alloy`, `ash`, * `ballad`, `coral`, `echo`, `fable`, `nova`, `onyx`, `sage`, and `shimmer`. */ - voice: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; + voice: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; } /** diff --git a/src/resources/index.ts b/src/resources/index.ts index 129b1cbd0..fbbc0e3bb 100644 --- a/src/resources/index.ts +++ b/src/resources/index.ts @@ -95,6 +95,7 @@ export { type ModerationCreateResponse, type ModerationCreateParams, } from './moderations'; +export { Realtime } from './realtime/realtime'; export { Responses } from './responses/responses'; export { Uploads, type Upload, type UploadCreateParams, type UploadCompleteParams } from './uploads/uploads'; export { diff --git a/src/resources/realtime.ts b/src/resources/realtime.ts new file mode 100644 index 000000000..1c5df27d9 --- /dev/null +++ b/src/resources/realtime.ts @@ -0,0 +1,3 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +export * from './realtime/index'; diff --git a/src/resources/realtime/client-secrets.ts b/src/resources/realtime/client-secrets.ts new file mode 100644 index 000000000..c48fe8243 --- /dev/null +++ b/src/resources/realtime/client-secrets.ts @@ -0,0 +1,470 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +import { APIResource } from '../../core/resource'; +import * as RealtimeAPI from './realtime'; +import { APIPromise } from '../../core/api-promise'; +import { RequestOptions } from '../../internal/request-options'; + +export class ClientSecrets extends APIResource { + /** + * Create a Realtime session and client secret for either realtime or + * transcription. + */ + create(body: ClientSecretCreateParams, options?: RequestOptions): APIPromise { + return this._client.post('/realtime/client_secrets', { body, ...options }); + } +} + +/** + * A Realtime session configuration object. + */ +export interface RealtimeSessionCreateResponse { + /** + * Unique identifier for the session that looks like `sess_1234567890abcdef`. + */ + id?: string; + + /** + * Configuration for input and output audio for the session. + */ + audio?: RealtimeSessionCreateResponse.Audio; + + /** + * Expiration timestamp for the session, in seconds since epoch. + */ + expires_at?: number; + + /** + * Additional fields to include in server outputs. + * + * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. + */ + include?: Array<'item.input_audio_transcription.logprobs'>; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_output_tokens?: number | 'inf'; + + /** + * The Realtime model used for this session. + */ + model?: string; + + /** + * The object type. Always `realtime.session`. + */ + object?: string; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + output_modalities?: Array<'text' | 'audio'>; + + /** + * How the model chooses tools. Options are `auto`, `none`, `required`, or specify + * a function. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array; + + /** + * Configuration options for tracing. Set to null to disable tracing. Once tracing + * is enabled for a session, the configuration cannot be modified. + * + * `auto` will create a trace for the session with default values for the workflow + * name, group id, and metadata. + */ + tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration; + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. 
+ */ + turn_detection?: RealtimeSessionCreateResponse.TurnDetection; +} + +export namespace RealtimeSessionCreateResponse { + /** + * Configuration for input and output audio for the session. + */ + export interface Audio { + input?: Audio.Input; + + output?: Audio.Output; + } + + export namespace Audio { + export interface Input { + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + format?: string; + + /** + * Configuration for input audio noise reduction. + */ + noise_reduction?: Input.NoiseReduction; + + /** + * Configuration for input audio transcription. + */ + transcription?: Input.Transcription; + + /** + * Configuration for turn detection. + */ + turn_detection?: Input.TurnDetection; + } + + export namespace Input { + /** + * Configuration for input audio noise reduction. + */ + export interface NoiseReduction { + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration for input audio transcription. + */ + export interface Transcription { + /** + * The language of the input audio. + */ + language?: string; + + /** + * The model to use for transcription. + */ + model?: string; + + /** + * Optional text to guide the model's style or continue a previous audio segment. + */ + prompt?: string; + } + + /** + * Configuration for turn detection. + */ + export interface TurnDetection { + prefix_padding_ms?: number; + + silence_duration_ms?: number; + + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } + } + + export interface Output { + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + format?: string; + + speed?: number; + + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; + } + } + + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Granular configuration for tracing. + */ + export interface TracingConfiguration { + /** + * The group id to attach to this trace to enable filtering and grouping in the + * traces dashboard. + */ + group_id?: string; + + /** + * The arbitrary metadata to attach to this trace to enable filtering in the traces + * dashboard. + */ + metadata?: unknown; + + /** + * The name of the workflow to attach to this trace. This is used to name the trace + * in the traces dashboard. + */ + workflow_name?: string; + } + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. 
+ */ + silence_duration_ms?: number; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } +} + +/** + * Response from creating a session and client secret for the Realtime API. + */ +export interface ClientSecretCreateResponse { + /** + * Expiration timestamp for the client secret, in seconds since epoch. + */ + expires_at: number; + + /** + * The session configuration for either a realtime or transcription session. + */ + session: + | RealtimeSessionCreateResponse + | ClientSecretCreateResponse.RealtimeTranscriptionSessionCreateResponse; + + /** + * The generated client secret value. + */ + value: string; +} + +export namespace ClientSecretCreateResponse { + /** + * A Realtime transcription session configuration object. + */ + export interface RealtimeTranscriptionSessionCreateResponse { + /** + * Unique identifier for the session that looks like `sess_1234567890abcdef`. + */ + id?: string; + + /** + * Configuration for input audio for the session. + */ + audio?: RealtimeTranscriptionSessionCreateResponse.Audio; + + /** + * Expiration timestamp for the session, in seconds since epoch. + */ + expires_at?: number; + + /** + * Additional fields to include in server outputs. + * + * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. + */ + include?: Array<'item.input_audio_transcription.logprobs'>; + + /** + * The object type. Always `realtime.transcription_session`. + */ + object?: string; + } + + export namespace RealtimeTranscriptionSessionCreateResponse { + /** + * Configuration for input audio for the session. + */ + export interface Audio { + input?: Audio.Input; + } + + export namespace Audio { + export interface Input { + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + format?: string; + + /** + * Configuration for input audio noise reduction. + */ + noise_reduction?: Input.NoiseReduction; + + /** + * Configuration of the transcription model. + */ + transcription?: Input.Transcription; + + /** + * Configuration for turn detection. + */ + turn_detection?: Input.TurnDetection; + } + + export namespace Input { + /** + * Configuration for input audio noise reduction. + */ + export interface NoiseReduction { + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration of the transcription model. + */ + export interface Transcription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; + + /** + * The model to use for transcription. Can be `gpt-4o-transcribe`, + * `gpt-4o-mini-transcribe`, or `whisper-1`. + */ + model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. The + * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + * should match the audio language. + */ + prompt?: string; + } + + /** + * Configuration for turn detection. 
+ */ + export interface TurnDetection { + prefix_padding_ms?: number; + + silence_duration_ms?: number; + + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } + } + } + } +} + +export interface ClientSecretCreateParams { + /** + * Configuration for the ephemeral token expiration. + */ + expires_after?: ClientSecretCreateParams.ExpiresAfter; + + /** + * Session configuration to use for the client secret. Choose either a realtime + * session or a transcription session. + */ + session?: RealtimeAPI.RealtimeSessionCreateRequest | RealtimeAPI.RealtimeTranscriptionSessionCreateRequest; +} + +export namespace ClientSecretCreateParams { + /** + * Configuration for the ephemeral token expiration. + */ + export interface ExpiresAfter { + /** + * The anchor point for the ephemeral token expiration. Only `created_at` is + * currently supported. + */ + anchor?: 'created_at'; + + /** + * The number of seconds from the anchor point to the expiration. Select a value + * between `10` and `7200`. + */ + seconds?: number; + } +} + +export declare namespace ClientSecrets { + export { + type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse, + type ClientSecretCreateResponse as ClientSecretCreateResponse, + type ClientSecretCreateParams as ClientSecretCreateParams, + }; +} diff --git a/src/resources/realtime/index.ts b/src/resources/realtime/index.ts new file mode 100644 index 000000000..a6c5db35e --- /dev/null +++ b/src/resources/realtime/index.ts @@ -0,0 +1,9 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +export { + ClientSecrets, + type RealtimeSessionCreateResponse, + type ClientSecretCreateResponse, + type ClientSecretCreateParams, +} from './client-secrets'; +export { Realtime } from './realtime'; diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts new file mode 100644 index 000000000..562b2d739 --- /dev/null +++ b/src/resources/realtime/realtime.ts @@ -0,0 +1,4351 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +import { APIResource } from '../../core/resource'; +import * as RealtimeAPI from './realtime'; +import * as Shared from '../shared'; +import * as ClientSecretsAPI from './client-secrets'; +import { + ClientSecretCreateParams, + ClientSecretCreateResponse, + ClientSecrets, + RealtimeSessionCreateResponse, +} from './client-secrets'; +import * as ResponsesAPI from '../responses/responses'; + +export class Realtime extends APIResource { + clientSecrets: ClientSecretsAPI.ClientSecrets = new ClientSecretsAPI.ClientSecrets(this._client); +} + +/** + * Returned when a conversation is created. Emitted right after session creation. + */ +export interface ConversationCreatedEvent { + /** + * The conversation resource. + */ + conversation: ConversationCreatedEvent.Conversation; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `conversation.created`. + */ + type: 'conversation.created'; +} + +export namespace ConversationCreatedEvent { + /** + * The conversation resource. + */ + export interface Conversation { + /** + * The unique ID of the conversation. + */ + id?: string; + + /** + * The object type, must be `realtime.conversation`. + */ + object?: 'realtime.conversation'; + } +} + +/** + * A single item within a Realtime conversation. 
+ */ +export type ConversationItem = + | RealtimeConversationItemSystemMessage + | RealtimeConversationItemUserMessage + | RealtimeConversationItemAssistantMessage + | RealtimeConversationItemFunctionCall + | RealtimeConversationItemFunctionCallOutput + | RealtimeMcpApprovalResponse + | RealtimeMcpListTools + | RealtimeMcpToolCall + | RealtimeMcpApprovalRequest; + +/** + * Returned when a conversation item is added. + */ +export interface ConversationItemAdded { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A single item within a Realtime conversation. + */ + item: ConversationItem; + + /** + * The event type, must be `conversation.item.added`. + */ + type: 'conversation.item.added'; + + /** + * The ID of the item that precedes this one, if any. This is used to maintain + * ordering when items are inserted. + */ + previous_item_id?: string | null; +} + +/** + * Add a new Item to the Conversation's context, including messages, function + * calls, and function call responses. This event can be used both to populate a + * "history" of the conversation and to add new items mid-stream, but has the + * current limitation that it cannot populate assistant audio messages. + * + * If successful, the server will respond with a `conversation.item.created` event, + * otherwise an `error` event will be sent. + */ +export interface ConversationItemCreateEvent { + /** + * A single item within a Realtime conversation. + */ + item: ConversationItem; + + /** + * The event type, must be `conversation.item.create`. + */ + type: 'conversation.item.create'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * The ID of the preceding item after which the new item will be inserted. If not + * set, the new item will be appended to the end of the conversation. If set to + * `root`, the new item will be added to the beginning of the conversation. If set + * to an existing ID, it allows an item to be inserted mid-conversation. If the ID + * cannot be found, an error will be returned and the item will not be added. + */ + previous_item_id?: string; +} + +/** + * Returned when a conversation item is created. There are several scenarios that + * produce this event: + * + * - The server is generating a Response, which if successful will produce either + * one or two Items, which will be of type `message` (role `assistant`) or type + * `function_call`. + * - The input audio buffer has been committed, either by the client or the server + * (in `server_vad` mode). The server will take the content of the input audio + * buffer and add it to a new user message Item. + * - The client has sent a `conversation.item.create` event to add a new Item to + * the Conversation. + */ +export interface ConversationItemCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A single item within a Realtime conversation. + */ + item: ConversationItem; + + /** + * The event type, must be `conversation.item.created`. + */ + type: 'conversation.item.created'; + + /** + * The ID of the preceding item in the Conversation context, allows the client to + * understand the order of the conversation. Can be `null` if the item has no + * predecessor. + */ + previous_item_id?: string | null; +} + +/** + * Send this event when you want to remove any item from the conversation history. 
+ * The server will respond with a `conversation.item.deleted` event, unless the + * item does not exist in the conversation history, in which case the server will + * respond with an error. + */ +export interface ConversationItemDeleteEvent { + /** + * The ID of the item to delete. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.delete`. + */ + type: 'conversation.item.delete'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an item in the conversation is deleted by the client with a + * `conversation.item.delete` event. This event is used to synchronize the server's + * understanding of the conversation history with the client's view. + */ +export interface ConversationItemDeletedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item that was deleted. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.deleted`. + */ + type: 'conversation.item.deleted'; +} + +/** + * Returned when a conversation item is finalized. + */ +export interface ConversationItemDone { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A single item within a Realtime conversation. + */ + item: ConversationItem; + + /** + * The event type, must be `conversation.item.done`. + */ + type: 'conversation.item.done'; + + /** + * The ID of the item that precedes this one, if any. This is used to maintain + * ordering when items are inserted. + */ + previous_item_id?: string | null; +} + +/** + * This event is the output of audio transcription for user audio written to the + * user audio buffer. Transcription begins when the input audio buffer is committed + * by the client or server (in `server_vad` mode). Transcription runs + * asynchronously with Response creation, so this event may come before or after + * the Response events. + * + * Realtime API models accept audio natively, and thus input transcription is a + * separate process run on a separate ASR (Automatic Speech Recognition) model. The + * transcript may diverge somewhat from the model's interpretation, and should be + * treated as a rough guide. + */ +export interface ConversationItemInputAudioTranscriptionCompletedEvent { + /** + * The index of the content part containing the audio. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item containing the audio. + */ + item_id: string; + + /** + * The transcribed text. + */ + transcript: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.completed`. + */ + type: 'conversation.item.input_audio_transcription.completed'; + + /** + * Usage statistics for the transcription. + */ + usage: + | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageTokens + | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageDuration; + + /** + * The log probabilities of the transcription. + */ + logprobs?: Array | null; +} + +export namespace ConversationItemInputAudioTranscriptionCompletedEvent { + /** + * Usage statistics for models billed by token usage. + */ + export interface TranscriptTextUsageTokens { + /** + * Number of input tokens billed for this request. + */ + input_tokens: number; + + /** + * Number of output tokens generated. + */ + output_tokens: number; + + /** + * Total number of tokens used (input + output). 
+ */ + total_tokens: number; + + /** + * The type of the usage object. Always `tokens` for this variant. + */ + type: 'tokens'; + + /** + * Details about the input tokens billed for this request. + */ + input_token_details?: TranscriptTextUsageTokens.InputTokenDetails; + } + + export namespace TranscriptTextUsageTokens { + /** + * Details about the input tokens billed for this request. + */ + export interface InputTokenDetails { + /** + * Number of audio tokens billed for this request. + */ + audio_tokens?: number; + + /** + * Number of text tokens billed for this request. + */ + text_tokens?: number; + } + } + + /** + * Usage statistics for models billed by audio input duration. + */ + export interface TranscriptTextUsageDuration { + /** + * Duration of the input audio in seconds. + */ + seconds: number; + + /** + * The type of the usage object. Always `duration` for this variant. + */ + type: 'duration'; + } +} + +/** + * Returned when the text value of an input audio transcription content part is + * updated. + */ +export interface ConversationItemInputAudioTranscriptionDeltaEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.delta`. + */ + type: 'conversation.item.input_audio_transcription.delta'; + + /** + * The index of the content part in the item's content array. + */ + content_index?: number; + + /** + * The text delta. + */ + delta?: string; + + /** + * The log probabilities of the transcription. + */ + logprobs?: Array | null; +} + +/** + * Returned when input audio transcription is configured, and a transcription + * request for a user message failed. These events are separate from other `error` + * events so that the client can identify the related Item. + */ +export interface ConversationItemInputAudioTranscriptionFailedEvent { + /** + * The index of the content part containing the audio. + */ + content_index: number; + + /** + * Details of the transcription error. + */ + error: ConversationItemInputAudioTranscriptionFailedEvent.Error; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.failed`. + */ + type: 'conversation.item.input_audio_transcription.failed'; +} + +export namespace ConversationItemInputAudioTranscriptionFailedEvent { + /** + * Details of the transcription error. + */ + export interface Error { + /** + * Error code, if any. + */ + code?: string; + + /** + * A human-readable error message. + */ + message?: string; + + /** + * Parameter related to the error, if any. + */ + param?: string; + + /** + * The type of error. + */ + type?: string; + } +} + +/** + * Returned when an input audio transcription segment is identified for an item. + */ +export interface ConversationItemInputAudioTranscriptionSegment { + /** + * The segment identifier. + */ + id: string; + + /** + * The index of the input audio content part within the item. + */ + content_index: number; + + /** + * End time of the segment in seconds. + */ + end: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item containing the input audio content. + */ + item_id: string; + + /** + * The detected speaker label for this segment. + */ + speaker: string; + + /** + * Start time of the segment in seconds. 
+ */ + start: number; + + /** + * The text for this segment. + */ + text: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.segment`. + */ + type: 'conversation.item.input_audio_transcription.segment'; +} + +/** + * Send this event when you want to retrieve the server's representation of a + * specific item in the conversation history. This is useful, for example, to + * inspect user audio after noise cancellation and VAD. The server will respond + * with a `conversation.item.retrieved` event, unless the item does not exist in + * the conversation history, in which case the server will respond with an error. + */ +export interface ConversationItemRetrieveEvent { + /** + * The ID of the item to retrieve. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.retrieve`. + */ + type: 'conversation.item.retrieve'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Send this event to truncate a previous assistant message’s audio. The server + * will produce audio faster than realtime, so this event is useful when the user + * interrupts to truncate audio that has already been sent to the client but not + * yet played. This will synchronize the server's understanding of the audio with + * the client's playback. + * + * Truncating audio will delete the server-side text transcript to ensure there is + * not text in the context that hasn't been heard by the user. + * + * If successful, the server will respond with a `conversation.item.truncated` + * event. + */ +export interface ConversationItemTruncateEvent { + /** + * Inclusive duration up to which audio is truncated, in milliseconds. If the + * audio_end_ms is greater than the actual audio duration, the server will respond + * with an error. + */ + audio_end_ms: number; + + /** + * The index of the content part to truncate. Set this to 0. + */ + content_index: number; + + /** + * The ID of the assistant message item to truncate. Only assistant message items + * can be truncated. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.truncate`. + */ + type: 'conversation.item.truncate'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an earlier assistant audio message item is truncated by the client + * with a `conversation.item.truncate` event. This event is used to synchronize the + * server's understanding of the audio with the client's playback. + * + * This action will truncate the audio and remove the server-side text transcript + * to ensure there is no text in the context that hasn't been heard by the user. + */ +export interface ConversationItemTruncatedEvent { + /** + * The duration up to which the audio was truncated, in milliseconds. + */ + audio_end_ms: number; + + /** + * The index of the content part that was truncated. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the assistant message item that was truncated. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.truncated`. + */ + type: 'conversation.item.truncated'; +} + +/** + * The item to add to the conversation. + */ +export interface ConversationItemWithReference { + /** + * For an item of type (`message` | `function_call` | `function_call_output`) this + * field allows the client to assign the unique ID of the item. 
It is not required + * because the server will generate one if not provided. + * + * For an item of type `item_reference`, this field is required and is a reference + * to any item that has previously existed in the conversation. + */ + id?: string; + + /** + * The arguments of the function call (for `function_call` items). + */ + arguments?: string; + + /** + * The ID of the function call (for `function_call` and `function_call_output` + * items). If passed on a `function_call_output` item, the server will check that a + * `function_call` item with the same ID exists in the conversation history. + */ + call_id?: string; + + /** + * The content of the message, applicable for `message` items. + * + * - Message items of role `system` support only `input_text` content + * - Message items of role `user` support `input_text` and `input_audio` content + * - Message items of role `assistant` support `text` content. + */ + content?: Array; + + /** + * The name of the function being called (for `function_call` items). + */ + name?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The output of the function call (for `function_call_output` items). + */ + output?: string; + + /** + * The role of the message sender (`user`, `assistant`, `system`), only applicable + * for `message` items. + */ + role?: 'user' | 'assistant' | 'system'; + + /** + * The status of the item (`completed`, `incomplete`, `in_progress`). These have no + * effect on the conversation, but are accepted for consistency with the + * `conversation.item.created` event. + */ + status?: 'completed' | 'incomplete' | 'in_progress'; + + /** + * The type of the item (`message`, `function_call`, `function_call_output`, + * `item_reference`). + */ + type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference'; +} + +export namespace ConversationItemWithReference { + export interface Content { + /** + * ID of a previous conversation item to reference (for `item_reference` content + * types in `response.create` events). These can reference both client and server + * created items. + */ + id?: string; + + /** + * Base64-encoded audio bytes, used for `input_audio` content type. + */ + audio?: string; + + /** + * The text content, used for `input_text` and `text` content types. + */ + text?: string; + + /** + * The transcript of the audio, used for `input_audio` content type. + */ + transcript?: string; + + /** + * The content type (`input_text`, `input_audio`, `item_reference`, `text`). + */ + type?: 'input_text' | 'input_audio' | 'item_reference' | 'text'; + } +} + +/** + * Send this event to append audio bytes to the input audio buffer. The audio + * buffer is temporary storage you can write to and later commit. In Server VAD + * mode, the audio buffer is used to detect speech and the server will decide when + * to commit. When Server VAD is disabled, you must commit the audio buffer + * manually. + * + * The client may choose how much audio to place in each event up to a maximum of + * 15 MiB, for example streaming smaller chunks from the client may allow the VAD + * to be more responsive. Unlike made other client events, the server will not send + * a confirmation response to this event. + */ +export interface InputAudioBufferAppendEvent { + /** + * Base64-encoded audio bytes. This must be in the format specified by the + * `input_audio_format` field in the session configuration. 
+ */ + audio: string; + + /** + * The event type, must be `input_audio_buffer.append`. + */ + type: 'input_audio_buffer.append'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Send this event to clear the audio bytes in the buffer. The server will respond + * with an `input_audio_buffer.cleared` event. + */ +export interface InputAudioBufferClearEvent { + /** + * The event type, must be `input_audio_buffer.clear`. + */ + type: 'input_audio_buffer.clear'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when the input audio buffer is cleared by the client with a + * `input_audio_buffer.clear` event. + */ +export interface InputAudioBufferClearedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `input_audio_buffer.cleared`. + */ + type: 'input_audio_buffer.cleared'; +} + +/** + * Send this event to commit the user input audio buffer, which will create a new + * user message item in the conversation. This event will produce an error if the + * input audio buffer is empty. When in Server VAD mode, the client does not need + * to send this event, the server will commit the audio buffer automatically. + * + * Committing the input audio buffer will trigger input audio transcription (if + * enabled in session configuration), but it will not create a response from the + * model. The server will respond with an `input_audio_buffer.committed` event. + */ +export interface InputAudioBufferCommitEvent { + /** + * The event type, must be `input_audio_buffer.commit`. + */ + type: 'input_audio_buffer.commit'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an input audio buffer is committed, either by the client or + * automatically in server VAD mode. The `item_id` property is the ID of the user + * message item that will be created, thus a `conversation.item.created` event will + * also be sent to the client. + */ +export interface InputAudioBufferCommittedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.committed`. + */ + type: 'input_audio_buffer.committed'; + + /** + * The ID of the preceding item after which the new item will be inserted. Can be + * `null` if the item has no predecessor. + */ + previous_item_id?: string | null; +} + +/** + * Sent by the server when in `server_vad` mode to indicate that speech has been + * detected in the audio buffer. This can happen any time audio is added to the + * buffer (unless speech is already detected). The client may want to use this + * event to interrupt audio playback or provide visual feedback to the user. + * + * The client should expect to receive a `input_audio_buffer.speech_stopped` event + * when speech stops. The `item_id` property is the ID of the user message item + * that will be created when speech stops and will also be included in the + * `input_audio_buffer.speech_stopped` event (unless the client manually commits + * the audio buffer during VAD activation). + */ +export interface InputAudioBufferSpeechStartedEvent { + /** + * Milliseconds from the start of all audio written to the buffer during the + * session when speech was first detected. 
This will correspond to the beginning of + * audio sent to the model, and thus includes the `prefix_padding_ms` configured in + * the Session. + */ + audio_start_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created when speech stops. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.speech_started`. + */ + type: 'input_audio_buffer.speech_started'; +} + +/** + * Returned in `server_vad` mode when the server detects the end of speech in the + * audio buffer. The server will also send an `conversation.item.created` event + * with the user message item that is created from the audio buffer. + */ +export interface InputAudioBufferSpeechStoppedEvent { + /** + * Milliseconds since the session started when speech stopped. This will correspond + * to the end of audio sent to the model, and thus includes the + * `min_silence_duration_ms` configured in the Session. + */ + audio_end_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.speech_stopped`. + */ + type: 'input_audio_buffer.speech_stopped'; +} + +/** + * Returned when the server VAD timeout is triggered for the input audio buffer. + */ +export interface InputAudioBufferTimeoutTriggered { + /** + * Millisecond offset where speech ended within the buffered audio. + */ + audio_end_ms: number; + + /** + * Millisecond offset where speech started within the buffered audio. + */ + audio_start_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item associated with this segment. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.timeout_triggered`. + */ + type: 'input_audio_buffer.timeout_triggered'; +} + +/** + * A log probability object. + */ +export interface LogProbProperties { + /** + * The token that was used to generate the log probability. + */ + token: string; + + /** + * The bytes that were used to generate the log probability. + */ + bytes: Array; + + /** + * The log probability of the token. + */ + logprob: number; +} + +/** + * Returned when listing MCP tools has completed for an item. + */ +export interface McpListToolsCompleted { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP list tools item. + */ + item_id: string; + + /** + * The event type, must be `mcp_list_tools.completed`. + */ + type: 'mcp_list_tools.completed'; +} + +/** + * Returned when listing MCP tools has failed for an item. + */ +export interface McpListToolsFailed { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP list tools item. + */ + item_id: string; + + /** + * The event type, must be `mcp_list_tools.failed`. + */ + type: 'mcp_list_tools.failed'; +} + +/** + * Returned when listing MCP tools is in progress for an item. + */ +export interface McpListToolsInProgress { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP list tools item. + */ + item_id: string; + + /** + * The event type, must be `mcp_list_tools.in_progress`. + */ + type: 'mcp_list_tools.in_progress'; +} + +/** + * **WebRTC Only:** Emit to cut off the current audio response. 
This will trigger + * the server to stop generating audio and emit a `output_audio_buffer.cleared` + * event. This event should be preceded by a `response.cancel` client event to stop + * the generation of the current response. + * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc). + */ +export interface OutputAudioBufferClearEvent { + /** + * The event type, must be `output_audio_buffer.clear`. + */ + type: 'output_audio_buffer.clear'; + + /** + * The unique ID of the client event used for error handling. + */ + event_id?: string; +} + +/** + * Emitted at the beginning of a Response to indicate the updated rate limits. When + * a Response is created some tokens will be "reserved" for the output tokens, the + * rate limits shown here reflect that reservation, which is then adjusted + * accordingly once the Response is completed. + */ +export interface RateLimitsUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * List of rate limit information. + */ + rate_limits: Array; + + /** + * The event type, must be `rate_limits.updated`. + */ + type: 'rate_limits.updated'; +} + +export namespace RateLimitsUpdatedEvent { + export interface RateLimit { + /** + * The maximum allowed value for the rate limit. + */ + limit?: number; + + /** + * The name of the rate limit (`requests`, `tokens`). + */ + name?: 'requests' | 'tokens'; + + /** + * The remaining value before the limit is reached. + */ + remaining?: number; + + /** + * Seconds until the rate limit resets. + */ + reset_seconds?: number; + } +} + +/** + * Configuration for input and output audio. + */ +export interface RealtimeAudioConfig { + input?: RealtimeAudioConfig.Input; + + output?: RealtimeAudioConfig.Output; +} + +export namespace RealtimeAudioConfig { + export interface Input { + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For + * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel + * (mono), and little-endian byte order. + */ + format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + noise_reduction?: Input.NoiseReduction; + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. + */ + transcription?: Input.Transcription; + + /** + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. 
Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ + turn_detection?: Input.TurnDetection; + } + + export namespace Input { + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + export interface NoiseReduction { + /** + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. + */ + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. + */ + export interface Transcription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; + + /** + * The model to use for transcription. Current options are `whisper-1`, + * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and + * `gpt-4o-transcribe-diarize`. + */ + model?: + | 'whisper-1' + | 'gpt-4o-transcribe-latest' + | 'gpt-4o-mini-transcribe' + | 'gpt-4o-transcribe' + | 'gpt-4o-transcribe-diarize'; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. For `whisper-1`, the + * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + * For `gpt-4o-transcribe` models, the prompt is a free text string, for example + * "expect words related to technology". + */ + prompt?: string; + } + + /** + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. 
For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ + export interface TurnDetection { + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; + + /** + * Optional idle timeout after which turn detection will auto-timeout when no + * additional audio is received. + */ + idle_timeout_ms?: number | null; + + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; + + /** + * Used only for `server_vad` mode. Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection. + */ + type?: 'server_vad' | 'semantic_vad'; + } + } + + export interface Output { + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + * For `pcm16`, output audio is sampled at a rate of 24kHz. + */ + format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the + * minimum speed. 1.5 is the maximum speed. This value can only be changed in + * between model turns, not while a response is in progress. + */ + speed?: number; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, + * and `cedar`. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; + } +} + +/** + * A realtime client event. + */ +export type RealtimeClientEvent = + | ConversationItemCreateEvent + | ConversationItemDeleteEvent + | ConversationItemRetrieveEvent + | ConversationItemTruncateEvent + | InputAudioBufferAppendEvent + | InputAudioBufferClearEvent + | OutputAudioBufferClearEvent + | InputAudioBufferCommitEvent + | ResponseCancelEvent + | ResponseCreateEvent + | SessionUpdateEvent + | TranscriptionSessionUpdate; + +/** + * Configuration options for the generated client secret. + */ +export interface RealtimeClientSecretConfig { + /** + * Configuration for the ephemeral token expiration. 
+ */ + expires_after?: RealtimeClientSecretConfig.ExpiresAfter; +} + +export namespace RealtimeClientSecretConfig { + /** + * Configuration for the ephemeral token expiration. + */ + export interface ExpiresAfter { + /** + * The anchor point for the ephemeral token expiration. Only `created_at` is + * currently supported. + */ + anchor: 'created_at'; + + /** + * The number of seconds from the anchor point to the expiration. Select a value + * between `10` and `7200`. + */ + seconds?: number; + } +} + +/** + * An assistant message item in a Realtime conversation. + */ +export interface RealtimeConversationItemAssistantMessage { + /** + * The content of the message. + */ + content: Array; + + /** + * The role of the message sender. Always `assistant`. + */ + role: 'assistant'; + + /** + * The type of the item. Always `message`. + */ + type: 'message'; + + /** + * The unique ID of the item. + */ + id?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The status of the item. Has no effect on the conversation. + */ + status?: 'completed' | 'incomplete' | 'in_progress'; +} + +export namespace RealtimeConversationItemAssistantMessage { + export interface Content { + /** + * The text content. + */ + text?: string; + + /** + * The content type. Always `text` for assistant messages. + */ + type?: 'text'; + } +} + +/** + * A function call item in a Realtime conversation. + */ +export interface RealtimeConversationItemFunctionCall { + /** + * The arguments of the function call. + */ + arguments: string; + + /** + * The name of the function being called. + */ + name: string; + + /** + * The type of the item. Always `function_call`. + */ + type: 'function_call'; + + /** + * The unique ID of the item. + */ + id?: string; + + /** + * The ID of the function call. + */ + call_id?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The status of the item. Has no effect on the conversation. + */ + status?: 'completed' | 'incomplete' | 'in_progress'; +} + +/** + * A function call output item in a Realtime conversation. + */ +export interface RealtimeConversationItemFunctionCallOutput { + /** + * The ID of the function call this output is for. + */ + call_id: string; + + /** + * The output of the function call. + */ + output: string; + + /** + * The type of the item. Always `function_call_output`. + */ + type: 'function_call_output'; + + /** + * The unique ID of the item. + */ + id?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The status of the item. Has no effect on the conversation. + */ + status?: 'completed' | 'incomplete' | 'in_progress'; +} + +/** + * A system message item in a Realtime conversation. + */ +export interface RealtimeConversationItemSystemMessage { + /** + * The content of the message. + */ + content: Array; + + /** + * The role of the message sender. Always `system`. + */ + role: 'system'; + + /** + * The type of the item. Always `message`. + */ + type: 'message'; + + /** + * The unique ID of the item. + */ + id?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The status of the item. Has no effect on the conversation. 
+ */ + status?: 'completed' | 'incomplete' | 'in_progress'; +} + +export namespace RealtimeConversationItemSystemMessage { + export interface Content { + /** + * The text content. + */ + text?: string; + + /** + * The content type. Always `input_text` for system messages. + */ + type?: 'input_text'; + } +} + +/** + * A user message item in a Realtime conversation. + */ +export interface RealtimeConversationItemUserMessage { + /** + * The content of the message. + */ + content: Array; + + /** + * The role of the message sender. Always `user`. + */ + role: 'user'; + + /** + * The type of the item. Always `message`. + */ + type: 'message'; + + /** + * The unique ID of the item. + */ + id?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The status of the item. Has no effect on the conversation. + */ + status?: 'completed' | 'incomplete' | 'in_progress'; +} + +export namespace RealtimeConversationItemUserMessage { + export interface Content { + /** + * Base64-encoded audio bytes (for `input_audio`). + */ + audio?: string; + + /** + * The text content (for `input_text`). + */ + text?: string; + + /** + * Transcript of the audio (for `input_audio`). + */ + transcript?: string; + + /** + * The content type (`input_text` or `input_audio`). + */ + type?: 'input_text' | 'input_audio'; + } +} + +/** + * Details of the error. + */ +export interface RealtimeError { + /** + * A human-readable error message. + */ + message: string; + + /** + * The type of error (e.g., "invalid_request_error", "server_error"). + */ + type: string; + + /** + * Error code, if any. + */ + code?: string | null; + + /** + * The event_id of the client event that caused the error, if applicable. + */ + event_id?: string | null; + + /** + * Parameter related to the error, if any. + */ + param?: string | null; +} + +/** + * Returned when an error occurs, which could be a client problem or a server + * problem. Most errors are recoverable and the session will stay open, we + * recommend to implementors to monitor and log error messages by default. + */ +export interface RealtimeErrorEvent { + /** + * Details of the error. + */ + error: RealtimeError; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `error`. + */ + type: 'error'; +} + +/** + * A Realtime item requesting human approval of a tool invocation. + */ +export interface RealtimeMcpApprovalRequest { + /** + * The unique ID of the approval request. + */ + id: string; + + /** + * A JSON string of arguments for the tool. + */ + arguments: string; + + /** + * The name of the tool to run. + */ + name: string; + + /** + * The label of the MCP server making the request. + */ + server_label: string; + + /** + * The type of the item. Always `mcp_approval_request`. + */ + type: 'mcp_approval_request'; +} + +/** + * A Realtime item responding to an MCP approval request. + */ +export interface RealtimeMcpApprovalResponse { + /** + * The unique ID of the approval response. + */ + id: string; + + /** + * The ID of the approval request being answered. + */ + approval_request_id: string; + + /** + * Whether the request was approved. + */ + approve: boolean; + + /** + * The type of the item. Always `mcp_approval_response`. + */ + type: 'mcp_approval_response'; + + /** + * Optional reason for the decision. + */ + reason?: string | null; +} + +/** + * A Realtime item listing tools available on an MCP server. 
+ */ +export interface RealtimeMcpListTools { + /** + * The label of the MCP server. + */ + server_label: string; + + /** + * The tools available on the server. + */ + tools: Array; + + /** + * The type of the item. Always `mcp_list_tools`. + */ + type: 'mcp_list_tools'; + + /** + * The unique ID of the list. + */ + id?: string; +} + +export namespace RealtimeMcpListTools { + /** + * A tool available on an MCP server. + */ + export interface Tool { + /** + * The JSON schema describing the tool's input. + */ + input_schema: unknown; + + /** + * The name of the tool. + */ + name: string; + + /** + * Additional annotations about the tool. + */ + annotations?: unknown | null; + + /** + * The description of the tool. + */ + description?: string | null; + } +} + +export interface RealtimeMcpProtocolError { + code: number; + + message: string; + + type: 'protocol_error'; +} + +/** + * A Realtime item representing an invocation of a tool on an MCP server. + */ +export interface RealtimeMcpToolCall { + /** + * The unique ID of the tool call. + */ + id: string; + + /** + * A JSON string of the arguments passed to the tool. + */ + arguments: string; + + /** + * The name of the tool that was run. + */ + name: string; + + /** + * The label of the MCP server running the tool. + */ + server_label: string; + + /** + * The type of the item. Always `mcp_tool_call`. + */ + type: 'mcp_tool_call'; + + /** + * The ID of an associated approval request, if any. + */ + approval_request_id?: string | null; + + /** + * The error from the tool call, if any. + */ + error?: RealtimeMcpProtocolError | RealtimeMcpToolExecutionError | RealtimeMcphttpError | null; + + /** + * The output from the tool call. + */ + output?: string | null; +} + +export interface RealtimeMcpToolExecutionError { + message: string; + + type: 'tool_execution_error'; +} + +export interface RealtimeMcphttpError { + code: number; + + message: string; + + type: 'http_error'; +} + +/** + * The response resource. + */ +export interface RealtimeResponse { + /** + * The unique ID of the response. + */ + id?: string; + + /** + * Which conversation the response is added to, determined by the `conversation` + * field in the `response.create` event. If `auto`, the response will be added to + * the default conversation and the value of `conversation_id` will be an id like + * `conv_1234`. If `none`, the response will not be added to any conversation and + * the value of `conversation_id` will be `null`. If responses are being triggered + * by server VAD, the response will be added to the default conversation, thus the + * `conversation_id` will be an id like `conv_1234`. + */ + conversation_id?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls, that was used in this response. + */ + max_output_tokens?: number | 'inf'; + + /** + * Set of 16 key-value pairs that can be attached to an object. This can be useful + * for storing additional information about the object in a structured format, and + * querying for objects via API or the dashboard. + * + * Keys are strings with a maximum length of 64 characters. Values are strings with + * a maximum length of 512 characters. + */ + metadata?: Shared.Metadata | null; + + /** + * The set of modalities the model used to respond. If there are multiple + * modalities, the model will pick one, for example if `modalities` is + * `["text", "audio"]`, the model could be responding in either text or audio. 
+ */ + modalities?: Array<'text' | 'audio'>; + + /** + * The object type, must be `realtime.response`. + */ + object?: 'realtime.response'; + + /** + * The list of output items generated by the response. + */ + output?: Array; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * The final status of the response (`completed`, `cancelled`, `failed`, or + * `incomplete`, `in_progress`). + */ + status?: 'completed' | 'cancelled' | 'failed' | 'incomplete' | 'in_progress'; + + /** + * Additional details about the status. + */ + status_details?: RealtimeResponseStatus; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * Usage statistics for the Response, this will correspond to billing. A Realtime + * API session will maintain a conversation context and append new Items to the + * Conversation, thus output from previous turns (text and audio tokens) will + * become the input for later turns. + */ + usage?: RealtimeResponseUsage; + + /** + * The voice the model used to respond. Current voice options are `alloy`, `ash`, + * `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; +} + +/** + * Additional details about the status. + */ +export interface RealtimeResponseStatus { + /** + * A description of the error that caused the response to fail, populated when the + * `status` is `failed`. + */ + error?: RealtimeResponseStatus.Error; + + /** + * The reason the Response did not complete. For a `cancelled` Response, one of + * `turn_detected` (the server VAD detected a new start of speech) or + * `client_cancelled` (the client sent a cancel event). For an `incomplete` + * Response, one of `max_output_tokens` or `content_filter` (the server-side safety + * filter activated and cut off the response). + */ + reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter'; + + /** + * The type of error that caused the response to fail, corresponding with the + * `status` field (`completed`, `cancelled`, `incomplete`, `failed`). + */ + type?: 'completed' | 'cancelled' | 'incomplete' | 'failed'; +} + +export namespace RealtimeResponseStatus { + /** + * A description of the error that caused the response to fail, populated when the + * `status` is `failed`. + */ + export interface Error { + /** + * Error code, if any. + */ + code?: string; + + /** + * The type of error. + */ + type?: string; + } +} + +/** + * Usage statistics for the Response, this will correspond to billing. A Realtime + * API session will maintain a conversation context and append new Items to the + * Conversation, thus output from previous turns (text and audio tokens) will + * become the input for later turns. + */ +export interface RealtimeResponseUsage { + /** + * Details about the input tokens used in the Response. + */ + input_token_details?: RealtimeResponseUsageInputTokenDetails; + + /** + * The number of input tokens used in the Response, including text and audio + * tokens. + */ + input_tokens?: number; + + /** + * Details about the output tokens used in the Response. + */ + output_token_details?: RealtimeResponseUsageOutputTokenDetails; + + /** + * The number of output tokens sent in the Response, including text and audio + * tokens. 
+ */ + output_tokens?: number; + + /** + * The total number of tokens in the Response including input and output text and + * audio tokens. + */ + total_tokens?: number; +} + +/** + * Details about the input tokens used in the Response. + */ +export interface RealtimeResponseUsageInputTokenDetails { + /** + * The number of audio tokens used in the Response. + */ + audio_tokens?: number; + + /** + * The number of cached tokens used in the Response. + */ + cached_tokens?: number; + + /** + * The number of text tokens used in the Response. + */ + text_tokens?: number; +} + +/** + * Details about the output tokens used in the Response. + */ +export interface RealtimeResponseUsageOutputTokenDetails { + /** + * The number of audio tokens used in the Response. + */ + audio_tokens?: number; + + /** + * The number of text tokens used in the Response. + */ + text_tokens?: number; +} + +/** + * A realtime server event. + */ +export type RealtimeServerEvent = + | ConversationCreatedEvent + | ConversationItemCreatedEvent + | ConversationItemDeletedEvent + | ConversationItemInputAudioTranscriptionCompletedEvent + | ConversationItemInputAudioTranscriptionDeltaEvent + | ConversationItemInputAudioTranscriptionFailedEvent + | RealtimeServerEvent.ConversationItemRetrieved + | ConversationItemTruncatedEvent + | RealtimeErrorEvent + | InputAudioBufferClearedEvent + | InputAudioBufferCommittedEvent + | InputAudioBufferSpeechStartedEvent + | InputAudioBufferSpeechStoppedEvent + | RateLimitsUpdatedEvent + | ResponseAudioDeltaEvent + | ResponseAudioDoneEvent + | ResponseAudioTranscriptDeltaEvent + | ResponseAudioTranscriptDoneEvent + | ResponseContentPartAddedEvent + | ResponseContentPartDoneEvent + | ResponseCreatedEvent + | ResponseDoneEvent + | ResponseFunctionCallArgumentsDeltaEvent + | ResponseFunctionCallArgumentsDoneEvent + | ResponseOutputItemAddedEvent + | ResponseOutputItemDoneEvent + | ResponseTextDeltaEvent + | ResponseTextDoneEvent + | SessionCreatedEvent + | SessionUpdatedEvent + | TranscriptionSessionUpdatedEvent + | TranscriptionSessionCreated + | RealtimeServerEvent.OutputAudioBufferStarted + | RealtimeServerEvent.OutputAudioBufferStopped + | RealtimeServerEvent.OutputAudioBufferCleared + | ConversationItemAdded + | ConversationItemDone + | InputAudioBufferTimeoutTriggered + | ConversationItemInputAudioTranscriptionSegment + | McpListToolsInProgress + | McpListToolsCompleted + | McpListToolsFailed + | ResponseMcpCallArgumentsDelta + | ResponseMcpCallArgumentsDone + | ResponseMcpCallInProgress + | ResponseMcpCallCompleted + | ResponseMcpCallFailed; + +export namespace RealtimeServerEvent { + /** + * Returned when a conversation item is retrieved with + * `conversation.item.retrieve`. + */ + export interface ConversationItemRetrieved { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A single item within a Realtime conversation. + */ + item: RealtimeAPI.ConversationItem; + + /** + * The event type, must be `conversation.item.retrieved`. + */ + type: 'conversation.item.retrieved'; + } + + /** + * **WebRTC Only:** Emitted when the server begins streaming audio to the client. + * This event is emitted after an audio content part has been added + * (`response.content_part.added`) to the response. + * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc). + */ + export interface OutputAudioBufferStarted { + /** + * The unique ID of the server event. 
+ */ + event_id: string; + + /** + * The unique ID of the response that produced the audio. + */ + response_id: string; + + /** + * The event type, must be `output_audio_buffer.started`. + */ + type: 'output_audio_buffer.started'; + } + + /** + * **WebRTC Only:** Emitted when the output audio buffer has been completely + * drained on the server, and no more audio is forthcoming. This event is emitted + * after the full response data has been sent to the client (`response.done`). + * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc). + */ + export interface OutputAudioBufferStopped { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The unique ID of the response that produced the audio. + */ + response_id: string; + + /** + * The event type, must be `output_audio_buffer.stopped`. + */ + type: 'output_audio_buffer.stopped'; + } + + /** + * **WebRTC Only:** Emitted when the output audio buffer is cleared. This happens + * either in VAD mode when the user has interrupted + * (`input_audio_buffer.speech_started`), or when the client has emitted the + * `output_audio_buffer.clear` event to manually cut off the current audio + * response. + * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc). + */ + export interface OutputAudioBufferCleared { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The unique ID of the response that produced the audio. + */ + response_id: string; + + /** + * The event type, must be `output_audio_buffer.cleared`. + */ + type: 'output_audio_buffer.cleared'; + } +} + +/** + * Realtime session object. + */ +export interface RealtimeSession { + /** + * Unique identifier for the session that looks like `sess_1234567890abcdef`. + */ + id?: string; + + /** + * Expiration timestamp for the session, in seconds since epoch. + */ + expires_at?: number; + + /** + * Additional fields to include in server outputs. + * + * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. + */ + include?: Array<'item.input_audio_transcription.logprobs'> | null; + + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For + * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel + * (mono), and little-endian byte order. + */ + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + input_audio_noise_reduction?: RealtimeSession.InputAudioNoiseReduction; + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. 
The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. + */ + input_audio_transcription?: RealtimeSession.InputAudioTranscription | null; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The Realtime model used for this session. + */ + model?: + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-realtime-preview-2025-06-03' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The object type. Always `realtime.session`. + */ + object?: 'realtime.session'; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + * For `pcm16`, output audio is sampled at a rate of 24kHz. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Reference to a prompt template and its variables. + * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). + */ + prompt?: ResponsesAPI.ResponsePrompt | null; + + /** + * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the + * minimum speed. 1.5 is the maximum speed. This value can only be changed in + * between model turns, not while a response is in progress. + */ + speed?: number; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a + * temperature of 0.8 is highly recommended for best performance. + */ + temperature?: number; + + /** + * How the model chooses tools. Options are `auto`, `none`, `required`, or specify + * a function. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array; + + /** + * Configuration options for tracing. Set to null to disable tracing. Once tracing + * is enabled for a session, the configuration cannot be modified. + * + * `auto` will create a trace for the session with default values for the workflow + * name, group id, and metadata. + */ + tracing?: 'auto' | RealtimeSession.TracingConfiguration | null; + + /** + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. 
Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ + turn_detection?: RealtimeSession.TurnDetection | null; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; +} + +export namespace RealtimeSession { + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + export interface InputAudioNoiseReduction { + /** + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. + */ + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through + * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + * and should be treated as guidance of input audio content rather than precisely + * what the model heard. The client can optionally set the language and prompt for + * transcription, these offer additional guidance to the transcription service. + */ + export interface InputAudioTranscription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; + + /** + * The model to use for transcription, current options are `gpt-4o-transcribe`, + * `gpt-4o-mini-transcribe`, and `whisper-1`. + */ + model?: string; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. For `whisper-1`, the + * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + * For `gpt-4o-transcribe` models, the prompt is a free text string, for example + * "expect words related to technology". + */ + prompt?: string; + } + + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. 
+ */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Granular configuration for tracing. + */ + export interface TracingConfiguration { + /** + * The group id to attach to this trace to enable filtering and grouping in the + * traces dashboard. + */ + group_id?: string; + + /** + * The arbitrary metadata to attach to this trace to enable filtering in the traces + * dashboard. + */ + metadata?: unknown; + + /** + * The name of the workflow to attach to this trace. This is used to name the trace + * in the traces dashboard. + */ + workflow_name?: string; + } + + /** + * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + * set to `null` to turn off, in which case the client must manually trigger model + * response. Server VAD means that the model will detect the start and end of + * speech based on audio volume and respond at the end of user speech. Semantic VAD + * is more advanced and uses a turn detection model (in conjunction with VAD) to + * semantically estimate whether the user has finished speaking, then dynamically + * sets a timeout based on this probability. For example, if user audio trails off + * with "uhhm", the model will score a low probability of turn end and wait longer + * for the user to continue speaking. This can be useful for more natural + * conversations, but may have a higher latency. + */ + export interface TurnDetection { + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; + + /** + * Optional idle timeout after which turn detection will auto-timeout when no + * additional audio is received. + */ + idle_timeout_ms?: number | null; + + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; + + /** + * Used only for `server_vad` mode. Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection. + */ + type?: 'server_vad' | 'semantic_vad'; + } +} + +/** + * Realtime session object configuration. + */ +export interface RealtimeSessionCreateRequest { + /** + * The Realtime model used for this session. 
+ */ + model: + | (string & {}) + | 'gpt-4o-realtime' + | 'gpt-4o-mini-realtime' + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-realtime-preview-2025-06-03' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The type of session to create. Always `realtime` for the Realtime API. + */ + type: 'realtime'; + + /** + * Configuration for input and output audio. + */ + audio?: RealtimeAudioConfig; + + /** + * Configuration options for the generated client secret. + */ + client_secret?: RealtimeClientSecretConfig; + + /** + * Additional fields to include in server outputs. + * + * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. + */ + include?: Array<'item.input_audio_transcription.logprobs'>; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_output_tokens?: number | 'inf'; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + output_modalities?: Array<'text' | 'audio'>; + + /** + * Reference to a prompt template and its variables. + * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). + */ + prompt?: ResponsesAPI.ResponsePrompt | null; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a + * temperature of 0.8 is highly recommended for best performance. + */ + temperature?: number; + + /** + * How the model chooses tools. Provide one of the string modes or force a specific + * function/MCP tool. + */ + tool_choice?: RealtimeToolChoiceConfig; + + /** + * Tools available to the model. + */ + tools?: RealtimeToolsConfig; + + /** + * Configuration options for tracing. Set to null to disable tracing. Once tracing + * is enabled for a session, the configuration cannot be modified. + * + * `auto` will create a trace for the session with default values for the workflow + * name, group id, and metadata. + */ + tracing?: RealtimeTracingConfig | null; + + /** + * Controls how the realtime conversation is truncated prior to model inference. + * The default is `auto`. When set to `retention_ratio`, the server retains a + * fraction of the conversation tokens prior to the instructions. + */ + truncation?: RealtimeTruncation; +} + +/** + * How the model chooses tools. Provide one of the string modes or force a specific + * function/MCP tool. 
+ */ +export type RealtimeToolChoiceConfig = + | ResponsesAPI.ToolChoiceOptions + | ResponsesAPI.ToolChoiceFunction + | ResponsesAPI.ToolChoiceMcp; + +/** + * Tools available to the model. + */ +export type RealtimeToolsConfig = Array; + +/** + * Give the model access to additional tools via remote Model Context Protocol + * (MCP) servers. + * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp). + */ +export type RealtimeToolsConfigUnion = RealtimeToolsConfigUnion.Function | RealtimeToolsConfigUnion.Mcp; + +export namespace RealtimeToolsConfigUnion { + export interface Function { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Give the model access to additional tools via remote Model Context Protocol + * (MCP) servers. + * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp). + */ + export interface Mcp { + /** + * A label for this MCP server, used to identify it in tool calls. + */ + server_label: string; + + /** + * The type of the MCP tool. Always `mcp`. + */ + type: 'mcp'; + + /** + * List of allowed tool names or a filter object. + */ + allowed_tools?: Array | Mcp.McpToolFilter | null; + + /** + * An OAuth access token that can be used with a remote MCP server, either with a + * custom MCP server URL or a service connector. Your application must handle the + * OAuth authorization flow and provide the token here. + */ + authorization?: string; + + /** + * Identifier for service connectors, like those available in ChatGPT. One of + * `server_url` or `connector_id` must be provided. Learn more about service + * connectors + * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors). + * + * Currently supported `connector_id` values are: + * + * - Dropbox: `connector_dropbox` + * - Gmail: `connector_gmail` + * - Google Calendar: `connector_googlecalendar` + * - Google Drive: `connector_googledrive` + * - Microsoft Teams: `connector_microsoftteams` + * - Outlook Calendar: `connector_outlookcalendar` + * - Outlook Email: `connector_outlookemail` + * - SharePoint: `connector_sharepoint` + */ + connector_id?: + | 'connector_dropbox' + | 'connector_gmail' + | 'connector_googlecalendar' + | 'connector_googledrive' + | 'connector_microsoftteams' + | 'connector_outlookcalendar' + | 'connector_outlookemail' + | 'connector_sharepoint'; + + /** + * Optional HTTP headers to send to the MCP server. Use for authentication or other + * purposes. + */ + headers?: { [key: string]: string } | null; + + /** + * Specify which of the MCP server's tools require approval. + */ + require_approval?: Mcp.McpToolApprovalFilter | 'always' | 'never' | null; + + /** + * Optional description of the MCP server, used to provide more context. + */ + server_description?: string; + + /** + * The URL for the MCP server. One of `server_url` or `connector_id` must be + * provided. + */ + server_url?: string; + } + + export namespace Mcp { + /** + * A filter object to specify which tools are allowed. + */ + export interface McpToolFilter { + /** + * Indicates whether or not a tool modifies data or is read-only. 
If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + + /** + * Specify which of the MCP server's tools require approval. Can be `always`, + * `never`, or a filter object associated with tools that require approval. + */ + export interface McpToolApprovalFilter { + /** + * A filter object to specify which tools are allowed. + */ + always?: McpToolApprovalFilter.Always; + + /** + * A filter object to specify which tools are allowed. + */ + never?: McpToolApprovalFilter.Never; + } + + export namespace McpToolApprovalFilter { + /** + * A filter object to specify which tools are allowed. + */ + export interface Always { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + + /** + * A filter object to specify which tools are allowed. + */ + export interface Never { + /** + * Indicates whether or not a tool modifies data or is read-only. If an MCP server + * is + * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + * it will match this filter. + */ + read_only?: boolean; + + /** + * List of allowed tool names. + */ + tool_names?: Array; + } + } + } +} + +/** + * Configuration options for tracing. Set to null to disable tracing. Once tracing + * is enabled for a session, the configuration cannot be modified. + * + * `auto` will create a trace for the session with default values for the workflow + * name, group id, and metadata. + */ +export type RealtimeTracingConfig = 'auto' | RealtimeTracingConfig.TracingConfiguration; + +export namespace RealtimeTracingConfig { + /** + * Granular configuration for tracing. + */ + export interface TracingConfiguration { + /** + * The group id to attach to this trace to enable filtering and grouping in the + * traces dashboard. + */ + group_id?: string; + + /** + * The arbitrary metadata to attach to this trace to enable filtering in the traces + * dashboard. + */ + metadata?: unknown; + + /** + * The name of the workflow to attach to this trace. This is used to name the trace + * in the traces dashboard. + */ + workflow_name?: string; + } +} + +/** + * Realtime transcription session object configuration. + */ +export interface RealtimeTranscriptionSessionCreateRequest { + /** + * ID of the model to use. The options are `gpt-4o-transcribe`, + * `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source + * Whisper V2 model). + */ + model: (string & {}) | 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe'; + + /** + * The type of session to create. Always `transcription` for transcription + * sessions. + */ + type: 'transcription'; + + /** + * The set of items to include in the transcription. Current available items are: + * + * - `item.input_audio_transcription.logprobs` + */ + include?: Array<'item.input_audio_transcription.logprobs'>; + + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. 
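> As an illustration of the union defined above, a `RealtimeToolsConfig` array mixing a function tool with an MCP server entry might look like the sketch below. The import path is an assumption (other generated resources export their types from `openai/resources/<name>/<name>`), and the server URL and tool names are hypothetical:

```ts
import type { RealtimeToolsConfig } from 'openai/resources/realtime/realtime';

const tools: RealtimeToolsConfig = [
  {
    type: 'function',
    name: 'get_weather',
    description: 'Look up the current weather for a city.',
    parameters: {
      type: 'object',
      properties: { city: { type: 'string' } },
      required: ['city'],
    },
  },
  {
    type: 'mcp',
    server_label: 'docs',
    server_url: 'https://mcp.example.com/sse', // hypothetical MCP server
    allowed_tools: ['search_docs'],
    require_approval: 'never',
  },
];
```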
For + * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel + * (mono), and little-endian byte order. + */ + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + input_audio_noise_reduction?: RealtimeTranscriptionSessionCreateRequest.InputAudioNoiseReduction; + + /** + * Configuration for input audio transcription. The client can optionally set the + * language and prompt for transcription, these offer additional guidance to the + * transcription service. + */ + input_audio_transcription?: RealtimeTranscriptionSessionCreateRequest.InputAudioTranscription; + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + turn_detection?: RealtimeTranscriptionSessionCreateRequest.TurnDetection; +} + +export namespace RealtimeTranscriptionSessionCreateRequest { + /** + * Configuration for input audio noise reduction. This can be set to `null` to turn + * off. Noise reduction filters audio added to the input audio buffer before it is + * sent to VAD and the model. Filtering the audio can improve VAD and turn + * detection accuracy (reducing false positives) and model performance by improving + * perception of the input audio. + */ + export interface InputAudioNoiseReduction { + /** + * Type of noise reduction. `near_field` is for close-talking microphones such as + * headphones, `far_field` is for far-field microphones such as laptop or + * conference room microphones. + */ + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration for input audio transcription. The client can optionally set the + * language and prompt for transcription, these offer additional guidance to the + * transcription service. + */ + export interface InputAudioTranscription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; + + /** + * The model to use for transcription, current options are `gpt-4o-transcribe`, + * `gpt-4o-mini-transcribe`, and `whisper-1`. + */ + model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. For `whisper-1`, the + * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + * For `gpt-4o-transcribe` models, the prompt is a free text string, for example + * "expect words related to technology". + */ + prompt?: string; + } + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Duration of silence to detect speech stop (in milliseconds). 
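> Pulling the fields described above and below together, a complete `RealtimeTranscriptionSessionCreateRequest` might look like this sketch; the import path is an assumption and the values are illustrative rather than recommended defaults:

```ts
import type { RealtimeTranscriptionSessionCreateRequest } from 'openai/resources/realtime/realtime';

const transcriptionSession: RealtimeTranscriptionSessionCreateRequest = {
  type: 'transcription',
  model: 'gpt-4o-transcribe',
  include: ['item.input_audio_transcription.logprobs'],
  input_audio_format: 'pcm16',
  input_audio_noise_reduction: { type: 'near_field' },
  input_audio_transcription: { language: 'en', prompt: 'expect words related to technology' },
  turn_detection: {
    type: 'server_vad',
    threshold: 0.5,
    prefix_padding_ms: 300,
    silence_duration_ms: 500,
  },
};
```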
Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection. Only `server_vad` is currently supported for + * transcription sessions. + */ + type?: 'server_vad'; + } +} + +/** + * Controls how the realtime conversation is truncated prior to model inference. + * The default is `auto`. When set to `retention_ratio`, the server retains a + * fraction of the conversation tokens prior to the instructions. + */ +export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncation.RetentionRatioTruncation; + +export namespace RealtimeTruncation { + /** + * Retain a fraction of the conversation tokens. + */ + export interface RetentionRatioTruncation { + /** + * Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0). + */ + retention_ratio: number; + + /** + * Use retention ratio truncation. + */ + type: 'retention_ratio'; + + /** + * Optional cap on tokens allowed after the instructions. + */ + post_instructions_token_limit?: number | null; + } +} + +/** + * Returned when the model-generated audio is updated. + */ +export interface ResponseAudioDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * Base64-encoded audio data delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.output_audio.delta`. + */ + type: 'response.output_audio.delta'; +} + +/** + * Returned when the model-generated audio is done. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +export interface ResponseAudioDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.output_audio.done`. + */ + type: 'response.output_audio.done'; +} + +/** + * Returned when the model-generated transcription of audio output is updated. + */ +export interface ResponseAudioTranscriptDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The transcript delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.output_audio_transcript.delta`. + */ + type: 'response.output_audio_transcript.delta'; +} + +/** + * Returned when the model-generated transcription of audio output is done + * streaming. 
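> Because `delta` in `response.output_audio.delta` is base64-encoded PCM, a typical Node consumer decodes and buffers each chunk until the matching `done` event arrives. A minimal sketch, assuming the WebSocket client from `openai/realtime/websocket` and Node's `Buffer`:

```ts
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });
const audioChunks: Buffer[] = [];

rt.on('response.output_audio.delta', (event) => {
  // Each delta is a base64-encoded slice of 16-bit PCM audio.
  audioChunks.push(Buffer.from(event.delta, 'base64'));
});

rt.on('response.output_audio.done', () => {
  const pcm = Buffer.concat(audioChunks);
  console.log(`received ${pcm.length} bytes of audio`);
});
```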
Also emitted when a Response is interrupted, incomplete, or + * cancelled. + */ +export interface ResponseAudioTranscriptDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The final transcript of the audio. + */ + transcript: string; + + /** + * The event type, must be `response.output_audio_transcript.done`. + */ + type: 'response.output_audio_transcript.done'; +} + +/** + * Send this event to cancel an in-progress response. The server will respond with + * a `response.done` event with a status of `response.status=cancelled`. If there + * is no response to cancel, the server will respond with an error. + */ +export interface ResponseCancelEvent { + /** + * The event type, must be `response.cancel`. + */ + type: 'response.cancel'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * A specific response ID to cancel - if not provided, will cancel an in-progress + * response in the default conversation. + */ + response_id?: string; +} + +/** + * Returned when a new content part is added to an assistant message item during + * response generation. + */ +export interface ResponseContentPartAddedEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item to which the content part was added. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The content part that was added. + */ + part: ResponseContentPartAddedEvent.Part; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.content_part.added`. + */ + type: 'response.content_part.added'; +} + +export namespace ResponseContentPartAddedEvent { + /** + * The content part that was added. + */ + export interface Part { + /** + * Base64-encoded audio data (if type is "audio"). + */ + audio?: string; + + /** + * The text content (if type is "text"). + */ + text?: string; + + /** + * The transcript of the audio (if type is "audio"). + */ + transcript?: string; + + /** + * The content type ("text", "audio"). + */ + type?: 'text' | 'audio'; + } +} + +/** + * Returned when a content part is done streaming in an assistant message item. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseContentPartDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The content part that is done. + */ + part: ResponseContentPartDoneEvent.Part; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.content_part.done`. + */ + type: 'response.content_part.done'; +} + +export namespace ResponseContentPartDoneEvent { + /** + * The content part that is done. 
+ */ + export interface Part { + /** + * Base64-encoded audio data (if type is "audio"). + */ + audio?: string; + + /** + * The text content (if type is "text"). + */ + text?: string; + + /** + * The transcript of the audio (if type is "audio"). + */ + transcript?: string; + + /** + * The content type ("text", "audio"). + */ + type?: 'text' | 'audio'; + } +} + +/** + * This event instructs the server to create a Response, which means triggering + * model inference. When in Server VAD mode, the server will create Responses + * automatically. + * + * A Response will include at least one Item, and may have two, in which case the + * second will be a function call. These Items will be appended to the conversation + * history. + * + * The server will respond with a `response.created` event, events for Items and + * content created, and finally a `response.done` event to indicate the Response is + * complete. + * + * The `response.create` event includes inference configuration like + * `instructions`, and `temperature`. These fields will override the Session's + * configuration for this Response only. + */ +export interface ResponseCreateEvent { + /** + * The event type, must be `response.create`. + */ + type: 'response.create'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * Create a new Realtime response with these parameters + */ + response?: ResponseCreateEvent.Response; +} + +export namespace ResponseCreateEvent { + /** + * Create a new Realtime response with these parameters + */ + export interface Response { + /** + * Controls which conversation the response is added to. Currently supports `auto` + * and `none`, with `auto` as the default value. The `auto` value means that the + * contents of the response will be added to the default conversation. Set this to + * `none` to create an out-of-band response which will not add items to default + * conversation. + */ + conversation?: (string & {}) | 'auto' | 'none'; + + /** + * Input items to include in the prompt for the model. Using this field creates a + * new context for this Response instead of using the default conversation. An + * empty array `[]` will clear the context for this Response. Note that this can + * include references to items from the default conversation. + */ + input?: Array; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_output_tokens?: number | 'inf'; + + /** + * Set of 16 key-value pairs that can be attached to an object. 
This can be useful + * for storing additional information about the object in a structured format, and + * querying for objects via API or the dashboard. + * + * Keys are strings with a maximum length of 64 characters. Values are strings with + * a maximum length of 512 characters. + */ + metadata?: Shared.Metadata | null; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Reference to a prompt template and its variables. + * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). + */ + prompt?: ResponsesAPI.ResponsePrompt | null; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. Provide one of the string modes or force a specific + * function/MCP tool. + */ + tool_choice?: + | ResponsesAPI.ToolChoiceOptions + | ResponsesAPI.ToolChoiceFunction + | ResponsesAPI.ToolChoiceMcp; + + /** + * Tools (functions) available to the model. + */ + tools?: Array; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`. + */ + voice?: + | (string & {}) + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar'; + } + + export namespace Response { + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + } +} + +/** + * Returned when a new Response is created. The first event of response creation, + * where the response is in an initial state of `in_progress`. + */ +export interface ResponseCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The response resource. + */ + response: RealtimeResponse; + + /** + * The event type, must be `response.created`. + */ + type: 'response.created'; +} + +/** + * Returned when a Response is done streaming. Always emitted, no matter the final + * state. The Response object included in the `response.done` event will include + * all output Items in the Response but will omit the raw audio data. + */ +export interface ResponseDoneEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The response resource. + */ + response: RealtimeResponse; + + /** + * The event type, must be `response.done`. + */ + type: 'response.done'; +} + +/** + * Returned when the model-generated function call arguments are updated. + */ +export interface ResponseFunctionCallArgumentsDeltaEvent { + /** + * The ID of the function call. + */ + call_id: string; + + /** + * The arguments delta as a JSON string. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the function call item. 
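> Putting `ResponseCreateEvent` together, the sketch below requests an out-of-band response that leaves the default conversation untouched; the instructions and metadata are illustrative, and the `send`/`on` helpers are assumed from the WebSocket client shown earlier:

```ts
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });

rt.on('session.created', () => {
  rt.send({
    type: 'response.create',
    event_id: 'evt_client_001', // optional client-generated ID
    response: {
      conversation: 'none', // out-of-band: nothing is added to the default conversation
      instructions: 'Summarize the conversation so far in one sentence.',
      modalities: ['text'],
      metadata: { purpose: 'summary' },
    },
  });
});

rt.on('response.done', (event) => console.log('response finished with status:', event.response.status));
```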
+ */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.function_call_arguments.delta`. + */ + type: 'response.function_call_arguments.delta'; +} + +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseFunctionCallArgumentsDoneEvent { + /** + * The final arguments as a JSON string. + */ + arguments: string; + + /** + * The ID of the function call. + */ + call_id: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the function call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.function_call_arguments.done`. + */ + type: 'response.function_call_arguments.done'; +} + +/** + * Returned when MCP tool call arguments are updated during response generation. + */ +export interface ResponseMcpCallArgumentsDelta { + /** + * The JSON-encoded arguments delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP tool call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.mcp_call_arguments.delta`. + */ + type: 'response.mcp_call_arguments.delta'; + + /** + * If present, indicates the delta text was obfuscated. + */ + obfuscation?: string | null; +} + +/** + * Returned when MCP tool call arguments are finalized during response generation. + */ +export interface ResponseMcpCallArgumentsDone { + /** + * The final JSON-encoded arguments string. + */ + arguments: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP tool call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.mcp_call_arguments.done`. + */ + type: 'response.mcp_call_arguments.done'; +} + +/** + * Returned when an MCP tool call has completed successfully. + */ +export interface ResponseMcpCallCompleted { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP tool call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The event type, must be `response.mcp_call.completed`. + */ + type: 'response.mcp_call.completed'; +} + +/** + * Returned when an MCP tool call has failed. + */ +export interface ResponseMcpCallFailed { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the MCP tool call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The event type, must be `response.mcp_call.failed`. + */ + type: 'response.mcp_call.failed'; +} + +/** + * Returned when an MCP tool call has started and is in progress. + */ +export interface ResponseMcpCallInProgress { + /** + * The unique ID of the server event. 
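> A common pattern with `response.function_call_arguments.done` is to run the named function locally, return its output as a `function_call_output` conversation item, and then ask for another response. A sketch under the same assumptions as the earlier snippets; the tool output shown is hypothetical:

```ts
import { OpenAIRealtimeWebSocket } from 'openai/realtime/websocket';

const rt = new OpenAIRealtimeWebSocket({ model: 'gpt-realtime' });

rt.on('response.function_call_arguments.done', (event) => {
  const args = JSON.parse(event.arguments);
  // Hypothetical local implementation of the tool the model called.
  const output = JSON.stringify({ temperature_c: 21, city: args.city });

  rt.send({
    type: 'conversation.item.create',
    item: { type: 'function_call_output', call_id: event.call_id, output },
  });
  rt.send({ type: 'response.create' });
});
```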
+ */ + event_id: string; + + /** + * The ID of the MCP tool call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The event type, must be `response.mcp_call.in_progress`. + */ + type: 'response.mcp_call.in_progress'; +} + +/** + * Returned when a new Item is created during Response generation. + */ +export interface ResponseOutputItemAddedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A single item within a Realtime conversation. + */ + item: ConversationItem; + + /** + * The index of the output item in the Response. + */ + output_index: number; + + /** + * The ID of the Response to which the item belongs. + */ + response_id: string; + + /** + * The event type, must be `response.output_item.added`. + */ + type: 'response.output_item.added'; +} + +/** + * Returned when an Item is done streaming. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +export interface ResponseOutputItemDoneEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A single item within a Realtime conversation. + */ + item: ConversationItem; + + /** + * The index of the output item in the Response. + */ + output_index: number; + + /** + * The ID of the Response to which the item belongs. + */ + response_id: string; + + /** + * The event type, must be `response.output_item.done`. + */ + type: 'response.output_item.done'; +} + +/** + * Returned when the text value of an "output_text" content part is updated. + */ +export interface ResponseTextDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The text delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.output_text.delta`. + */ + type: 'response.output_text.delta'; +} + +/** + * Returned when the text value of an "output_text" content part is done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseTextDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The final text content. + */ + text: string; + + /** + * The event type, must be `response.output_text.done`. + */ + type: 'response.output_text.done'; +} + +/** + * Returned when a Session is created. Emitted automatically when a new connection + * is established as the first server event. This event will contain the default + * Session configuration. + */ +export interface SessionCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * Realtime session object. + */ + session: RealtimeSession; + + /** + * The event type, must be `session.created`. + */ + type: 'session.created'; +} + +/** + * Send this event to update the session’s default configuration. 
The client may + * send this event at any time to update any field, except for `voice`. However, + * note that once a session has been initialized with a particular `model`, it + * can’t be changed to another model using `session.update`. + * + * When the server receives a `session.update`, it will respond with a + * `session.updated` event showing the full, effective configuration. Only the + * fields that are present are updated. To clear a field like `instructions`, pass + * an empty string. + */ +export interface SessionUpdateEvent { + /** + * Realtime session object configuration. + */ + session: RealtimeSessionCreateRequest; + + /** + * The event type, must be `session.update`. + */ + type: 'session.update'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when a session is updated with a `session.update` event, unless there + * is an error. + */ +export interface SessionUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * Realtime session object. + */ + session: RealtimeSession; + + /** + * The event type, must be `session.updated`. + */ + type: 'session.updated'; +} + +/** + * Returned when a transcription session is created. + */ +export interface TranscriptionSessionCreated { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A Realtime transcription session configuration object. + */ + session: TranscriptionSessionCreated.Session; + + /** + * The event type, must be `transcription_session.created`. + */ + type: 'transcription_session.created'; +} + +export namespace TranscriptionSessionCreated { + /** + * A Realtime transcription session configuration object. + */ + export interface Session { + /** + * Unique identifier for the session that looks like `sess_1234567890abcdef`. + */ + id?: string; + + /** + * Configuration for input audio for the session. + */ + audio?: Session.Audio; + + /** + * Expiration timestamp for the session, in seconds since epoch. + */ + expires_at?: number; + + /** + * Additional fields to include in server outputs. + * + * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. + */ + include?: Array<'item.input_audio_transcription.logprobs'>; + + /** + * The object type. Always `realtime.transcription_session`. + */ + object?: string; + } + + export namespace Session { + /** + * Configuration for input audio for the session. + */ + export interface Audio { + input?: Audio.Input; + } + + export namespace Audio { + export interface Input { + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + format?: string; + + /** + * Configuration for input audio noise reduction. + */ + noise_reduction?: Input.NoiseReduction; + + /** + * Configuration of the transcription model. + */ + transcription?: Input.Transcription; + + /** + * Configuration for turn detection. + */ + turn_detection?: Input.TurnDetection; + } + + export namespace Input { + /** + * Configuration for input audio noise reduction. + */ + export interface NoiseReduction { + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration of the transcription model. + */ + export interface Transcription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. + */ + language?: string; + + /** + * The model to use for transcription. 
Can be `gpt-4o-transcribe`, + * `gpt-4o-mini-transcribe`, or `whisper-1`. + */ + model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. The + * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + * should match the audio language. + */ + prompt?: string; + } + + /** + * Configuration for turn detection. + */ + export interface TurnDetection { + prefix_padding_ms?: number; + + silence_duration_ms?: number; + + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } + } + } + } +} + +/** + * Send this event to update a transcription session. + */ +export interface TranscriptionSessionUpdate { + /** + * Realtime transcription session object configuration. + */ + session: RealtimeTranscriptionSessionCreateRequest; + + /** + * The event type, must be `transcription_session.update`. + */ + type: 'transcription_session.update'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when a transcription session is updated with a + * `transcription_session.update` event, unless there is an error. + */ +export interface TranscriptionSessionUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * A Realtime transcription session configuration object. + */ + session: TranscriptionSessionUpdatedEvent.Session; + + /** + * The event type, must be `transcription_session.updated`. + */ + type: 'transcription_session.updated'; +} + +export namespace TranscriptionSessionUpdatedEvent { + /** + * A Realtime transcription session configuration object. + */ + export interface Session { + /** + * Unique identifier for the session that looks like `sess_1234567890abcdef`. + */ + id?: string; + + /** + * Configuration for input audio for the session. + */ + audio?: Session.Audio; + + /** + * Expiration timestamp for the session, in seconds since epoch. + */ + expires_at?: number; + + /** + * Additional fields to include in server outputs. + * + * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio + * transcription. + */ + include?: Array<'item.input_audio_transcription.logprobs'>; + + /** + * The object type. Always `realtime.transcription_session`. + */ + object?: string; + } + + export namespace Session { + /** + * Configuration for input audio for the session. + */ + export interface Audio { + input?: Audio.Input; + } + + export namespace Audio { + export interface Input { + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + format?: string; + + /** + * Configuration for input audio noise reduction. + */ + noise_reduction?: Input.NoiseReduction; + + /** + * Configuration of the transcription model. + */ + transcription?: Input.Transcription; + + /** + * Configuration for turn detection. + */ + turn_detection?: Input.TurnDetection; + } + + export namespace Input { + /** + * Configuration for input audio noise reduction. + */ + export interface NoiseReduction { + type?: 'near_field' | 'far_field'; + } + + /** + * Configuration of the transcription model. + */ + export interface Transcription { + /** + * The language of the input audio. Supplying the input language in + * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + * format will improve accuracy and latency. 
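> For transcription-only sessions, the `transcription_session.update` client event defined nearby carries the same `RealtimeTranscriptionSessionCreateRequest` shown earlier. A sketch of the event payload itself; the import path and event ID are assumptions, and how the underlying connection is established is not shown here:

```ts
import type { TranscriptionSessionUpdate } from 'openai/resources/realtime/realtime';

const update: TranscriptionSessionUpdate = {
  type: 'transcription_session.update',
  event_id: 'evt_client_002',
  session: {
    type: 'transcription',
    model: 'whisper-1',
    input_audio_format: 'pcm16',
    input_audio_noise_reduction: { type: 'far_field' },
    turn_detection: { type: 'server_vad', silence_duration_ms: 500 },
  },
};
// `update` would then be serialized and sent over the realtime connection.
```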
+ */ + language?: string; + + /** + * The model to use for transcription. Can be `gpt-4o-transcribe`, + * `gpt-4o-mini-transcribe`, or `whisper-1`. + */ + model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1'; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. The + * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + * should match the audio language. + */ + prompt?: string; + } + + /** + * Configuration for turn detection. + */ + export interface TurnDetection { + prefix_padding_ms?: number; + + silence_duration_ms?: number; + + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } + } + } + } +} + +Realtime.ClientSecrets = ClientSecrets; + +export declare namespace Realtime { + export { + type ConversationCreatedEvent as ConversationCreatedEvent, + type ConversationItem as ConversationItem, + type ConversationItemAdded as ConversationItemAdded, + type ConversationItemCreateEvent as ConversationItemCreateEvent, + type ConversationItemCreatedEvent as ConversationItemCreatedEvent, + type ConversationItemDeleteEvent as ConversationItemDeleteEvent, + type ConversationItemDeletedEvent as ConversationItemDeletedEvent, + type ConversationItemDone as ConversationItemDone, + type ConversationItemInputAudioTranscriptionCompletedEvent as ConversationItemInputAudioTranscriptionCompletedEvent, + type ConversationItemInputAudioTranscriptionDeltaEvent as ConversationItemInputAudioTranscriptionDeltaEvent, + type ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent, + type ConversationItemInputAudioTranscriptionSegment as ConversationItemInputAudioTranscriptionSegment, + type ConversationItemRetrieveEvent as ConversationItemRetrieveEvent, + type ConversationItemTruncateEvent as ConversationItemTruncateEvent, + type ConversationItemTruncatedEvent as ConversationItemTruncatedEvent, + type ConversationItemWithReference as ConversationItemWithReference, + type InputAudioBufferAppendEvent as InputAudioBufferAppendEvent, + type InputAudioBufferClearEvent as InputAudioBufferClearEvent, + type InputAudioBufferClearedEvent as InputAudioBufferClearedEvent, + type InputAudioBufferCommitEvent as InputAudioBufferCommitEvent, + type InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent, + type InputAudioBufferSpeechStartedEvent as InputAudioBufferSpeechStartedEvent, + type InputAudioBufferSpeechStoppedEvent as InputAudioBufferSpeechStoppedEvent, + type InputAudioBufferTimeoutTriggered as InputAudioBufferTimeoutTriggered, + type LogProbProperties as LogProbProperties, + type McpListToolsCompleted as McpListToolsCompleted, + type McpListToolsFailed as McpListToolsFailed, + type McpListToolsInProgress as McpListToolsInProgress, + type OutputAudioBufferClearEvent as OutputAudioBufferClearEvent, + type RateLimitsUpdatedEvent as RateLimitsUpdatedEvent, + type RealtimeAudioConfig as RealtimeAudioConfig, + type RealtimeClientEvent as RealtimeClientEvent, + type RealtimeClientSecretConfig as RealtimeClientSecretConfig, + type RealtimeConversationItemAssistantMessage as RealtimeConversationItemAssistantMessage, + type RealtimeConversationItemFunctionCall as RealtimeConversationItemFunctionCall, + type RealtimeConversationItemFunctionCallOutput as RealtimeConversationItemFunctionCallOutput, + type RealtimeConversationItemSystemMessage as RealtimeConversationItemSystemMessage, + type 
RealtimeConversationItemUserMessage as RealtimeConversationItemUserMessage, + type RealtimeError as RealtimeError, + type RealtimeErrorEvent as RealtimeErrorEvent, + type RealtimeMcpApprovalRequest as RealtimeMcpApprovalRequest, + type RealtimeMcpApprovalResponse as RealtimeMcpApprovalResponse, + type RealtimeMcpListTools as RealtimeMcpListTools, + type RealtimeMcpProtocolError as RealtimeMcpProtocolError, + type RealtimeMcpToolCall as RealtimeMcpToolCall, + type RealtimeMcpToolExecutionError as RealtimeMcpToolExecutionError, + type RealtimeMcphttpError as RealtimeMcphttpError, + type RealtimeResponse as RealtimeResponse, + type RealtimeResponseStatus as RealtimeResponseStatus, + type RealtimeResponseUsage as RealtimeResponseUsage, + type RealtimeResponseUsageInputTokenDetails as RealtimeResponseUsageInputTokenDetails, + type RealtimeResponseUsageOutputTokenDetails as RealtimeResponseUsageOutputTokenDetails, + type RealtimeServerEvent as RealtimeServerEvent, + type RealtimeSession as RealtimeSession, + type RealtimeSessionCreateRequest as RealtimeSessionCreateRequest, + type RealtimeToolChoiceConfig as RealtimeToolChoiceConfig, + type RealtimeToolsConfig as RealtimeToolsConfig, + type RealtimeToolsConfigUnion as RealtimeToolsConfigUnion, + type RealtimeTracingConfig as RealtimeTracingConfig, + type RealtimeTranscriptionSessionCreateRequest as RealtimeTranscriptionSessionCreateRequest, + type RealtimeTruncation as RealtimeTruncation, + type ResponseAudioDeltaEvent as ResponseAudioDeltaEvent, + type ResponseAudioDoneEvent as ResponseAudioDoneEvent, + type ResponseAudioTranscriptDeltaEvent as ResponseAudioTranscriptDeltaEvent, + type ResponseAudioTranscriptDoneEvent as ResponseAudioTranscriptDoneEvent, + type ResponseCancelEvent as ResponseCancelEvent, + type ResponseContentPartAddedEvent as ResponseContentPartAddedEvent, + type ResponseContentPartDoneEvent as ResponseContentPartDoneEvent, + type ResponseCreateEvent as ResponseCreateEvent, + type ResponseCreatedEvent as ResponseCreatedEvent, + type ResponseDoneEvent as ResponseDoneEvent, + type ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent, + type ResponseFunctionCallArgumentsDoneEvent as ResponseFunctionCallArgumentsDoneEvent, + type ResponseMcpCallArgumentsDelta as ResponseMcpCallArgumentsDelta, + type ResponseMcpCallArgumentsDone as ResponseMcpCallArgumentsDone, + type ResponseMcpCallCompleted as ResponseMcpCallCompleted, + type ResponseMcpCallFailed as ResponseMcpCallFailed, + type ResponseMcpCallInProgress as ResponseMcpCallInProgress, + type ResponseOutputItemAddedEvent as ResponseOutputItemAddedEvent, + type ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent, + type ResponseTextDeltaEvent as ResponseTextDeltaEvent, + type ResponseTextDoneEvent as ResponseTextDoneEvent, + type SessionCreatedEvent as SessionCreatedEvent, + type SessionUpdateEvent as SessionUpdateEvent, + type SessionUpdatedEvent as SessionUpdatedEvent, + type TranscriptionSessionCreated as TranscriptionSessionCreated, + type TranscriptionSessionUpdate as TranscriptionSessionUpdate, + type TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent, + }; + + export { + ClientSecrets as ClientSecrets, + type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse, + type ClientSecretCreateResponse as ClientSecretCreateResponse, + type ClientSecretCreateParams as ClientSecretCreateParams, + }; +} diff --git a/src/resources/responses/responses.ts b/src/resources/responses/responses.ts index 5512b0e11..5a8f1a446 
100644 --- a/src/resources/responses/responses.ts +++ b/src/resources/responses/responses.ts @@ -463,7 +463,7 @@ export interface Response { * An array of tools the model may call while generating a response. You can * specify which tool to use by setting the `tool_choice` parameter. * - * The two categories of tools you can provide the model are: + * We support the following categories of tools: * * - **Built-in tools**: Tools that are provided by OpenAI that extend the model's * capabilities, like @@ -471,6 +471,9 @@ export interface Response { * [file search](https://platform.openai.com/docs/guides/tools-file-search). * Learn more about * [built-in tools](https://platform.openai.com/docs/guides/tools). + * - **MCP Tools**: Integrations with third-party systems via custom MCP servers or + * predefined connectors such as Google Drive and Notion. Learn more about + * [MCP Tools](https://platform.openai.com/docs/guides/tools-connectors-mcp). * - **Function calls (custom tools)**: Functions that are defined by you, enabling * the model to call your own code with strongly typed arguments and outputs. * Learn more about @@ -4654,89 +4657,15 @@ export type Tool = | FunctionTool | FileSearchTool | ComputerTool - | Tool.WebSearchTool + | WebSearchTool | Tool.Mcp | Tool.CodeInterpreter | Tool.ImageGeneration | Tool.LocalShell | CustomTool - | WebSearchTool; + | WebSearchPreviewTool; export namespace Tool { - /** - * Search the Internet for sources related to the prompt. Learn more about the - * [web search tool](https://platform.openai.com/docs/guides/tools-web-search). - */ - export interface WebSearchTool { - /** - * The type of the web search tool. One of `web_search` or `web_search_2025_08_26`. - */ - type: 'web_search' | 'web_search_2025_08_26'; - - /** - * Filters for the search. - */ - filters?: WebSearchTool.Filters | null; - - /** - * High level guidance for the amount of context window space to use for the - * search. One of `low`, `medium`, or `high`. `medium` is the default. - */ - search_context_size?: 'low' | 'medium' | 'high'; - - /** - * The approximate location of the user. - */ - user_location?: WebSearchTool.UserLocation | null; - } - - export namespace WebSearchTool { - /** - * Filters for the search. - */ - export interface Filters { - /** - * Allowed domains for the search. If not provided, all domains are allowed. - * Subdomains of the provided domains are allowed as well. - * - * Example: `["pubmed.ncbi.nlm.nih.gov"]` - */ - allowed_domains?: Array | null; - } - - /** - * The approximate location of the user. - */ - export interface UserLocation { - /** - * Free text input for the city of the user, e.g. `San Francisco`. - */ - city?: string | null; - - /** - * The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of - * the user, e.g. `US`. - */ - country?: string | null; - - /** - * Free text input for the region of the user, e.g. `California`. - */ - region?: string | null; - - /** - * The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the - * user, e.g. `America/Los_Angeles`. - */ - timezone?: string | null; - - /** - * The type of location approximation. Always `approximate`. - */ - type?: 'approximate'; - } - } - /** * Give the model access to additional tools via remote Model Context Protocol * (MCP) servers. @@ -5151,7 +5080,7 @@ export interface ToolChoiceTypes { * about the * [web search tool](https://platform.openai.com/docs/guides/tools-web-search). 
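> The GA `web_search` tool introduced in this hunk (and the retained preview variant renamed to `WebSearchPreviewTool` below) is passed through `tools` on the Responses API. A minimal sketch; the model name and domain filter are illustrative:

```ts
import OpenAI from 'openai';

const client = new OpenAI();

const response = await client.responses.create({
  model: 'gpt-5',
  input: 'What changed in the latest realtime API release?',
  tools: [
    {
      type: 'web_search',
      filters: { allowed_domains: ['openai.com'] },
      search_context_size: 'low',
    },
  ],
});

console.log(response.output_text);
```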
*/ -export interface WebSearchTool { +export interface WebSearchPreviewTool { /** * The type of the web search tool. One of `web_search_preview` or * `web_search_preview_2025_03_11`. @@ -5167,10 +5096,10 @@ export interface WebSearchTool { /** * The user's location. */ - user_location?: WebSearchTool.UserLocation | null; + user_location?: WebSearchPreviewTool.UserLocation | null; } -export namespace WebSearchTool { +export namespace WebSearchPreviewTool { /** * The user's location. */ @@ -5204,6 +5133,80 @@ export namespace WebSearchTool { } } +/** + * Search the Internet for sources related to the prompt. Learn more about the + * [web search tool](https://platform.openai.com/docs/guides/tools-web-search). + */ +export interface WebSearchTool { + /** + * The type of the web search tool. One of `web_search` or `web_search_2025_08_26`. + */ + type: 'web_search' | 'web_search_2025_08_26'; + + /** + * Filters for the search. + */ + filters?: WebSearchTool.Filters | null; + + /** + * High level guidance for the amount of context window space to use for the + * search. One of `low`, `medium`, or `high`. `medium` is the default. + */ + search_context_size?: 'low' | 'medium' | 'high'; + + /** + * The approximate location of the user. + */ + user_location?: WebSearchTool.UserLocation | null; +} + +export namespace WebSearchTool { + /** + * Filters for the search. + */ + export interface Filters { + /** + * Allowed domains for the search. If not provided, all domains are allowed. + * Subdomains of the provided domains are allowed as well. + * + * Example: `["pubmed.ncbi.nlm.nih.gov"]` + */ + allowed_domains?: Array | null; + } + + /** + * The approximate location of the user. + */ + export interface UserLocation { + /** + * Free text input for the city of the user, e.g. `San Francisco`. + */ + city?: string | null; + + /** + * The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of + * the user, e.g. `US`. + */ + country?: string | null; + + /** + * Free text input for the region of the user, e.g. `California`. + */ + region?: string | null; + + /** + * The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the + * user, e.g. `America/Los_Angeles`. + */ + timezone?: string | null; + + /** + * The type of location approximation. Always `approximate`. + */ + type?: 'approximate'; + } +} + export type ResponseCreateParams = ResponseCreateParamsNonStreaming | ResponseCreateParamsStreaming; export interface ResponseCreateParamsBase { @@ -5410,7 +5413,7 @@ export interface ResponseCreateParamsBase { * An array of tools the model may call while generating a response. You can * specify which tool to use by setting the `tool_choice` parameter. * - * The two categories of tools you can provide the model are: + * We support the following categories of tools: * * - **Built-in tools**: Tools that are provided by OpenAI that extend the model's * capabilities, like @@ -5418,6 +5421,9 @@ export interface ResponseCreateParamsBase { * [file search](https://platform.openai.com/docs/guides/tools-file-search). * Learn more about * [built-in tools](https://platform.openai.com/docs/guides/tools). + * - **MCP Tools**: Integrations with third-party systems via custom MCP servers or + * predefined connectors such as Google Drive and Notion. Learn more about + * [MCP Tools](https://platform.openai.com/docs/guides/tools-connectors-mcp). 
* - **Function calls (custom tools)**: Functions that are defined by you, enabling * the model to call your own code with strongly typed arguments and outputs. * Learn more about @@ -5673,6 +5679,7 @@ export declare namespace Responses { type ToolChoiceMcp as ToolChoiceMcp, type ToolChoiceOptions as ToolChoiceOptions, type ToolChoiceTypes as ToolChoiceTypes, + type WebSearchPreviewTool as WebSearchPreviewTool, type WebSearchTool as WebSearchTool, type ResponseCreateParams as ResponseCreateParams, type ResponseCreateParamsNonStreaming as ResponseCreateParamsNonStreaming, diff --git a/src/resources/webhooks.ts b/src/resources/webhooks.ts index fa337478b..7449d0830 100644 --- a/src/resources/webhooks.ts +++ b/src/resources/webhooks.ts @@ -559,6 +559,70 @@ export namespace FineTuningJobSucceededWebhookEvent { } } +/** + * Sent when Realtime API Receives a incoming SIP call. + */ +export interface RealtimeCallIncomingWebhookEvent { + /** + * The unique ID of the event. + */ + id: string; + + /** + * The Unix timestamp (in seconds) of when the model response was completed. + */ + created_at: number; + + /** + * Event data payload. + */ + data: RealtimeCallIncomingWebhookEvent.Data; + + /** + * The type of the event. Always `realtime.call.incoming`. + */ + type: 'realtime.call.incoming'; + + /** + * The object of the event. Always `event`. + */ + object?: 'event'; +} + +export namespace RealtimeCallIncomingWebhookEvent { + /** + * Event data payload. + */ + export interface Data { + /** + * The unique ID of this call. + */ + call_id: string; + + /** + * Headers from the SIP Invite. + */ + sip_headers: Array; + } + + export namespace Data { + /** + * A header from the SIP Invite. + */ + export interface SipHeader { + /** + * Name of the SIP Header. + */ + name: string; + + /** + * Value of the SIP Header. + */ + value: string; + } + } +} + /** * Sent when a background response has been cancelled. */ @@ -741,6 +805,7 @@ export type UnwrapWebhookEvent = | FineTuningJobCancelledWebhookEvent | FineTuningJobFailedWebhookEvent | FineTuningJobSucceededWebhookEvent + | RealtimeCallIncomingWebhookEvent | ResponseCancelledWebhookEvent | ResponseCompletedWebhookEvent | ResponseFailedWebhookEvent @@ -758,6 +823,7 @@ export declare namespace Webhooks { type FineTuningJobCancelledWebhookEvent as FineTuningJobCancelledWebhookEvent, type FineTuningJobFailedWebhookEvent as FineTuningJobFailedWebhookEvent, type FineTuningJobSucceededWebhookEvent as FineTuningJobSucceededWebhookEvent, + type RealtimeCallIncomingWebhookEvent as RealtimeCallIncomingWebhookEvent, type ResponseCancelledWebhookEvent as ResponseCancelledWebhookEvent, type ResponseCompletedWebhookEvent as ResponseCompletedWebhookEvent, type ResponseFailedWebhookEvent as ResponseFailedWebhookEvent, diff --git a/src/version.ts b/src/version.ts index cf8aa5418..02ab094c5 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1 +1 @@ -export const VERSION = '5.16.0'; // x-release-please-version +export const VERSION = '5.17.0'; // x-release-please-version diff --git a/tests/api-resources/beta/realtime/transcription-sessions.test.ts b/tests/api-resources/beta/realtime/transcription-sessions.test.ts deleted file mode 100644 index 2c7cbbb15..000000000 --- a/tests/api-resources/beta/realtime/transcription-sessions.test.ts +++ /dev/null @@ -1,21 +0,0 @@ -// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
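> The new `realtime.call.incoming` webhook added in `src/resources/webhooks.ts` above can be handled like the other webhook events. A sketch, assuming the `webhookSecret` client option and the async `client.webhooks.unwrap(body, headers)` helper; the surrounding HTTP framework is left out:

```ts
import OpenAI from 'openai';

const client = new OpenAI({ webhookSecret: process.env['OPENAI_WEBHOOK_SECRET'] });

// `rawBody` must be the unparsed request body; `headers` the incoming HTTP headers.
async function handleWebhook(rawBody: string, headers: Record<string, string>) {
  const event = await client.webhooks.unwrap(rawBody, headers);

  if (event.type === 'realtime.call.incoming') {
    console.log('incoming SIP call:', event.data.call_id);
    for (const header of event.data.sip_headers) {
      console.log(`${header.name}: ${header.value}`);
    }
  }
}
```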
- -import OpenAI from 'openai'; - -const client = new OpenAI({ - apiKey: 'My API Key', - baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010', -}); - -describe('resource transcriptionSessions', () => { - test('create', async () => { - const responsePromise = client.beta.realtime.transcriptionSessions.create({}); - const rawResponse = await responsePromise.asResponse(); - expect(rawResponse).toBeInstanceOf(Response); - const response = await responsePromise; - expect(response).not.toBeInstanceOf(Response); - const dataAndResponse = await responsePromise.withResponse(); - expect(dataAndResponse.data).toBe(response); - expect(dataAndResponse.response).toBe(rawResponse); - }); -}); diff --git a/tests/api-resources/beta/realtime/sessions.test.ts b/tests/api-resources/realtime/client-secrets.test.ts similarity index 86% rename from tests/api-resources/beta/realtime/sessions.test.ts rename to tests/api-resources/realtime/client-secrets.test.ts index 1a75a532c..105cdfe7f 100644 --- a/tests/api-resources/beta/realtime/sessions.test.ts +++ b/tests/api-resources/realtime/client-secrets.test.ts @@ -7,9 +7,9 @@ const client = new OpenAI({ baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010', }); -describe('resource sessions', () => { +describe('resource clientSecrets', () => { test('create', async () => { - const responsePromise = client.beta.realtime.sessions.create({}); + const responsePromise = client.realtime.clientSecrets.create({}); const rawResponse = await responsePromise.asResponse(); expect(rawResponse).toBeInstanceOf(Response); const response = await responsePromise;
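> Finally, the renamed `client.realtime.clientSecrets.create` endpoint exercised by the test above mints an ephemeral credential that can be handed to an untrusted client. A sketch; the `session` payload follows `RealtimeSessionCreateRequest`, while the exact response fields (`value`, `expires_at`) are assumptions:

```ts
import OpenAI from 'openai';

const client = new OpenAI();

const secret = await client.realtime.clientSecrets.create({
  session: {
    type: 'realtime',
    model: 'gpt-realtime',
    instructions: 'You are a friendly voice assistant.',
  },
});

// The returned value is passed to a browser or mobile client so it can open its
// own realtime connection without exposing the API key.
console.log(secret.value, secret.expires_at);
```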