diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 5c753b441..8e67d2540 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "6.3.0"
+ ".": "6.4.0"
}
diff --git a/.stats.yml b/.stats.yml
index e68631a07..09d2eb1de 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 135
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml
-openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
-config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
+openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
+config_hash: 03b48e9b8c7231a902403210dbd7dfa0
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8e057855f..d5e2b756c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
# Changelog

+## 6.4.0 (2025-10-16)
+
+Full Changelog: [v6.3.0...v6.4.0](https://github.com/openai/openai-node/compare/v6.3.0...v6.4.0)
+
+### Features
+
+* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([2d27392](https://github.com/openai/openai-node/commit/2d27392ac1cd082f7defb730326d11d8e733353f))
+
## 6.3.0 (2025-10-10)

Full Changelog: [v6.2.0...v6.3.0](https://github.com/openai/openai-node/compare/v6.2.0...v6.3.0)
diff --git a/api.md b/api.md
index 17b9d6797..4f01df498 100644
--- a/api.md
+++ b/api.md
@@ -156,11 +156,14 @@ Types:
Types:
- Transcription
+- TranscriptionDiarized
+- TranscriptionDiarizedSegment
- TranscriptionInclude
- TranscriptionSegment
- TranscriptionStreamEvent
- TranscriptionTextDeltaEvent
- TranscriptionTextDoneEvent
+- TranscriptionTextSegmentEvent
- TranscriptionVerbose
- TranscriptionWord
- TranscriptionCreateResponse
diff --git a/jsr.json b/jsr.json
index e23f193a5..0f22fd072 100644
--- a/jsr.json
+++ b/jsr.json
@@ -1,6 +1,6 @@
{
"name": "@openai/openai",
- "version": "6.3.0",
+ "version": "6.4.0",
"exports": {
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
diff --git a/package.json b/package.json
index da7ad39b5..7c9108122 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "openai",
- "version": "6.3.0",
+ "version": "6.4.0",
"description": "The official TypeScript library for the OpenAI API",
"author": "OpenAI ",
"types": "dist/index.d.ts",
diff --git a/src/resources/audio/audio.ts b/src/resources/audio/audio.ts
index 081db7d99..b17ae2863 100644
--- a/src/resources/audio/audio.ts
+++ b/src/resources/audio/audio.ts
@@ -10,11 +10,14 @@ import {
TranscriptionCreateParamsNonStreaming,
TranscriptionCreateParamsStreaming,
TranscriptionCreateResponse,
+ TranscriptionDiarized,
+ TranscriptionDiarizedSegment,
TranscriptionInclude,
TranscriptionSegment,
TranscriptionStreamEvent,
TranscriptionTextDeltaEvent,
TranscriptionTextDoneEvent,
+ TranscriptionTextSegmentEvent,
TranscriptionVerbose,
TranscriptionWord,
Transcriptions,
@@ -34,14 +37,20 @@ export class Audio extends APIResource {
speech: SpeechAPI.Speech = new SpeechAPI.Speech(this._client);
}
-export type AudioModel = 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe';
+export type AudioModel =
+ | 'whisper-1'
+ | 'gpt-4o-transcribe'
+ | 'gpt-4o-mini-transcribe'
+ | 'gpt-4o-transcribe-diarize';
/**
* The format of the output, in one of these options: `json`, `text`, `srt`,
- * `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- * the only supported format is `json`.
+ * `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ * `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ * `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ * `diarized_json`, with `diarized_json` required to receive speaker annotations.
*/
-export type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt';
+export type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json';
Audio.Transcriptions = Transcriptions;
Audio.Translations = Translations;
@@ -53,11 +62,14 @@ export declare namespace Audio {
export {
Transcriptions as Transcriptions,
type Transcription as Transcription,
+ type TranscriptionDiarized as TranscriptionDiarized,
+ type TranscriptionDiarizedSegment as TranscriptionDiarizedSegment,
type TranscriptionInclude as TranscriptionInclude,
type TranscriptionSegment as TranscriptionSegment,
type TranscriptionStreamEvent as TranscriptionStreamEvent,
type TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent,
type TranscriptionTextDoneEvent as TranscriptionTextDoneEvent,
+ type TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent,
type TranscriptionVerbose as TranscriptionVerbose,
type TranscriptionWord as TranscriptionWord,
type TranscriptionCreateResponse as TranscriptionCreateResponse,
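
Putting the audio.ts changes together: pairing the new `gpt-4o-transcribe-diarize` model value with the new `diarized_json` response format is what opts into speaker annotations. A minimal non-streaming sketch (the file path is a placeholder, and this assumes `create()` narrows its return type for `diarized_json` the way it does for `verbose_json`):

```ts
import fs from 'fs';
import OpenAI from 'openai';

const client = new OpenAI();

// `diarized_json` is required to receive speaker labels;
// `chunking_strategy` is required for inputs longer than 30 seconds.
const transcription = await client.audio.transcriptions.create({
  file: fs.createReadStream('meeting.mp3'), // placeholder path
  model: 'gpt-4o-transcribe-diarize',
  response_format: 'diarized_json',
  chunking_strategy: 'auto',
});

for (const segment of transcription.segments) {
  console.log(`${segment.speaker} [${segment.start}s - ${segment.end}s]: ${segment.text}`);
}
```
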
diff --git a/src/resources/audio/index.ts b/src/resources/audio/index.ts
index deed39ede..f030c4ee9 100644
--- a/src/resources/audio/index.ts
+++ b/src/resources/audio/index.ts
@@ -5,11 +5,14 @@ export { Speech, type SpeechModel, type SpeechCreateParams } from './speech';
export {
Transcriptions,
type Transcription,
+ type TranscriptionDiarized,
+ type TranscriptionDiarizedSegment,
type TranscriptionInclude,
type TranscriptionSegment,
type TranscriptionStreamEvent,
type TranscriptionTextDeltaEvent,
type TranscriptionTextDoneEvent,
+ type TranscriptionTextSegmentEvent,
type TranscriptionVerbose,
type TranscriptionWord,
type TranscriptionCreateResponse,
diff --git a/src/resources/audio/transcriptions.ts b/src/resources/audio/transcriptions.ts
index 6fbbf8c3a..3cf6c84ad 100644
--- a/src/resources/audio/transcriptions.ts
+++ b/src/resources/audio/transcriptions.ts
@@ -166,6 +166,138 @@ export namespace Transcription {
}
}
+/**
+ * Represents a diarized transcription response returned by the model, including
+ * the combined transcript and speaker-segment annotations.
+ */
+export interface TranscriptionDiarized {
+ /**
+ * Duration of the input audio in seconds.
+ */
+ duration: number;
+
+ /**
+ * Segments of the transcript annotated with timestamps and speaker labels.
+ */
+ segments: Array<TranscriptionDiarizedSegment>;
+
+ /**
+ * The type of task that was run. Always `transcribe`.
+ */
+ task: 'transcribe';
+
+ /**
+ * The concatenated transcript text for the entire audio input.
+ */
+ text: string;
+
+ /**
+ * Token or duration usage statistics for the request.
+ */
+ usage?: TranscriptionDiarized.Tokens | TranscriptionDiarized.Duration;
+}
+
+export namespace TranscriptionDiarized {
+ /**
+ * Usage statistics for models billed by token usage.
+ */
+ export interface Tokens {
+ /**
+ * Number of input tokens billed for this request.
+ */
+ input_tokens: number;
+
+ /**
+ * Number of output tokens generated.
+ */
+ output_tokens: number;
+
+ /**
+ * Total number of tokens used (input + output).
+ */
+ total_tokens: number;
+
+ /**
+ * The type of the usage object. Always `tokens` for this variant.
+ */
+ type: 'tokens';
+
+ /**
+ * Details about the input tokens billed for this request.
+ */
+ input_token_details?: Tokens.InputTokenDetails;
+ }
+
+ export namespace Tokens {
+ /**
+ * Details about the input tokens billed for this request.
+ */
+ export interface InputTokenDetails {
+ /**
+ * Number of audio tokens billed for this request.
+ */
+ audio_tokens?: number;
+
+ /**
+ * Number of text tokens billed for this request.
+ */
+ text_tokens?: number;
+ }
+ }
+
+ /**
+ * Usage statistics for models billed by audio input duration.
+ */
+ export interface Duration {
+ /**
+ * Duration of the input audio in seconds.
+ */
+ seconds: number;
+
+ /**
+ * The type of the usage object. Always `duration` for this variant.
+ */
+ type: 'duration';
+ }
+}
+
+/**
+ * A segment of diarized transcript text with speaker metadata.
+ */
+export interface TranscriptionDiarizedSegment {
+ /**
+ * Unique identifier for the segment.
+ */
+ id: string;
+
+ /**
+ * End timestamp of the segment in seconds.
+ */
+ end: number;
+
+ /**
+ * Speaker label for this segment. When known speakers are provided, the label
+ * matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+ * using capital letters (`A`, `B`, ...).
+ */
+ speaker: string;
+
+ /**
+ * Start timestamp of the segment in seconds.
+ */
+ start: number;
+
+ /**
+ * Transcript text for this segment.
+ */
+ text: string;
+
+ /**
+ * The type of the segment. Always `transcript.text.segment`.
+ */
+ type: 'transcript.text.segment';
+}
+
export type TranscriptionInclude = 'logprobs';
export interface TranscriptionSegment {
@@ -224,12 +356,15 @@ export interface TranscriptionSegment {
}
/**
- * Emitted when there is an additional text delta. This is also the first event
- * emitted when the transcription starts. Only emitted when you
+ * Emitted when a diarized transcription returns a completed segment with speaker
+ * information. Only emitted when you
* [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
- * with the `Stream` parameter set to `true`.
+ * with `stream` set to `true` and `response_format` set to `diarized_json`.
*/
-export type TranscriptionStreamEvent = TranscriptionTextDeltaEvent | TranscriptionTextDoneEvent;
+export type TranscriptionStreamEvent =
+ | TranscriptionTextSegmentEvent
+ | TranscriptionTextDeltaEvent
+ | TranscriptionTextDoneEvent;
/**
* Emitted when there is an additional text delta. This is also the first event
@@ -254,6 +389,12 @@ export interface TranscriptionTextDeltaEvent {
* with the `include[]` parameter set to `logprobs`.
*/
logprobs?: Array<TranscriptionTextDeltaEvent.Logprob>;
+
+ /**
+ * Identifier of the diarized segment that this delta belongs to. Only present when
+ * using `gpt-4o-transcribe-diarize`.
+ */
+ segment_id?: string;
}
export namespace TranscriptionTextDeltaEvent {
@@ -372,6 +513,44 @@ export namespace TranscriptionTextDoneEvent {
}
}
+/**
+ * Emitted when a diarized transcription returns a completed segment with speaker
+ * information. Only emitted when you
+ * [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+ * with `stream` set to `true` and `response_format` set to `diarized_json`.
+ */
+export interface TranscriptionTextSegmentEvent {
+ /**
+ * Unique identifier for the segment.
+ */
+ id: string;
+
+ /**
+ * End timestamp of the segment in seconds.
+ */
+ end: number;
+
+ /**
+ * Speaker label for this segment.
+ */
+ speaker: string;
+
+ /**
+ * Start timestamp of the segment in seconds.
+ */
+ start: number;
+
+ /**
+ * Transcript text for this segment.
+ */
+ text: string;
+
+ /**
+ * The type of the event. Always `transcript.text.segment`.
+ */
+ type: 'transcript.text.segment';
+}
+
/**
* Represents a verbose json transcription response returned by model, based on the
* provided input.
@@ -446,7 +625,7 @@ export interface TranscriptionWord {
* Represents a transcription response returned by model, based on the provided
* input.
*/
-export type TranscriptionCreateResponse = Transcription | TranscriptionVerbose;
+export type TranscriptionCreateResponse = Transcription | TranscriptionDiarized | TranscriptionVerbose;
export type TranscriptionCreateParams<
ResponseFormat extends AudioAPI.AudioResponseFormat | undefined = AudioAPI.AudioResponseFormat | undefined,
@@ -463,8 +642,8 @@ export interface TranscriptionCreateParamsBase<
/**
* ID of the model to use. The options are `gpt-4o-transcribe`,
- * `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- * Whisper V2 model).
+ * `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ * Whisper V2 model), and `gpt-4o-transcribe-diarize`.
*/
model: (string & {}) | AudioAPI.AudioModel;
@@ -473,6 +652,8 @@ export interface TranscriptionCreateParamsBase<
* first normalizes loudness and then uses voice activity detection (VAD) to choose
* boundaries. `server_vad` object can be provided to tweak VAD detection
* parameters manually. If unset, the audio is transcribed as a single block.
+ * Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ * seconds.
*/
chunking_strategy?: 'auto' | TranscriptionCreateParams.VadConfig | null;
@@ -481,10 +662,27 @@ export interface TranscriptionCreateParamsBase<
* return the log probabilities of the tokens in the response to understand the
* model's confidence in the transcription. `logprobs` only works with
* response_format set to `json` and only with the models `gpt-4o-transcribe` and
- * `gpt-4o-mini-transcribe`.
+ * `gpt-4o-mini-transcribe`. This field is not supported when using
+ * `gpt-4o-transcribe-diarize`.
*/
include?: Array<TranscriptionInclude>;
+ /**
+ * Optional list of speaker names that correspond to the audio samples provided in
+ * `known_speaker_references[]`. Each entry should be a short identifier (for
+ * example `customer` or `agent`). Up to 4 speakers are supported.
+ */
+ known_speaker_names?: Array<string>;
+
+ /**
+ * Optional list of audio samples (as
+ * [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ * that contain known speaker references matching `known_speaker_names[]`. Each
+ * sample must be between 2 and 10 seconds, and can use any of the same input audio
+ * formats supported by `file`.
+ */
+ known_speaker_references?: Array<string>;
+
/**
* The language of the input audio. Supplying the input language in
* [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -496,14 +694,17 @@ export interface TranscriptionCreateParamsBase<
* An optional text to guide the model's style or continue a previous audio
* segment. The
* [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- * should match the audio language.
+ * should match the audio language. This field is not supported when using
+ * `gpt-4o-transcribe-diarize`.
*/
prompt?: string;
/**
* The format of the output, in one of these options: `json`, `text`, `srt`,
- * `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- * the only supported format is `json`.
+ * `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ * `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ * `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ * `diarized_json`, with `diarized_json` required to receive speaker annotations.
*/
response_format?: ResponseFormat;
@@ -533,7 +734,8 @@ export interface TranscriptionCreateParamsBase<
* `response_format` must be set `verbose_json` to use timestamp granularities.
* Either or both of these options are supported: `word`, or `segment`. Note: There
* is no additional latency for segment timestamps, but generating word timestamps
- * incurs additional latency.
+ * incurs additional latency. This option is not available for
+ * `gpt-4o-transcribe-diarize`.
*/
timestamp_granularities?: Array<'word' | 'segment'>;
}
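
The two known-speaker parameters work as parallel arrays matched by index. A hedged sketch of supplying reference samples (the paths, labels, and `asDataURL` helper are all hypothetical; the API itself only requires data URLs of 2-10 second samples):

```ts
import fs from 'fs';
import OpenAI from 'openai';

const client = new OpenAI();

// Hypothetical helper: known_speaker_references[] takes data URLs, so wrap a
// short (2-10 second) sample file as one.
function asDataURL(path: string): string {
  return `data:audio/wav;base64,${fs.readFileSync(path).toString('base64')}`;
}

const transcription = await client.audio.transcriptions.create({
  file: fs.createReadStream('support-call.wav'), // placeholder path
  model: 'gpt-4o-transcribe-diarize',
  response_format: 'diarized_json', // required for speaker annotations
  chunking_strategy: 'auto',
  // Matched by index; up to 4 speakers are supported.
  known_speaker_names: ['agent', 'customer'],
  known_speaker_references: [asDataURL('agent.wav'), asDataURL('customer.wav')],
});
```
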
@@ -602,11 +804,14 @@ export interface TranscriptionCreateParamsStreaming extends TranscriptionCreateP
export declare namespace Transcriptions {
export {
type Transcription as Transcription,
+ type TranscriptionDiarized as TranscriptionDiarized,
+ type TranscriptionDiarizedSegment as TranscriptionDiarizedSegment,
type TranscriptionInclude as TranscriptionInclude,
type TranscriptionSegment as TranscriptionSegment,
type TranscriptionStreamEvent as TranscriptionStreamEvent,
type TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent,
type TranscriptionTextDoneEvent as TranscriptionTextDoneEvent,
+ type TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent,
type TranscriptionVerbose as TranscriptionVerbose,
type TranscriptionWord as TranscriptionWord,
type TranscriptionCreateResponse as TranscriptionCreateResponse,
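
When streaming with `diarized_json`, the new `TranscriptionTextSegmentEvent` joins the existing delta/done events in the `TranscriptionStreamEvent` union, and deltas gain an optional `segment_id`. A minimal streaming sketch (file path is a placeholder):

```ts
import fs from 'fs';
import OpenAI from 'openai';

const client = new OpenAI();

const stream = await client.audio.transcriptions.create({
  file: fs.createReadStream('meeting.mp3'), // placeholder path
  model: 'gpt-4o-transcribe-diarize',
  response_format: 'diarized_json',
  chunking_strategy: 'auto',
  stream: true,
});

for await (const event of stream) {
  if (event.type === 'transcript.text.delta') {
    process.stdout.write(event.delta); // incremental text; may carry segment_id
  } else if (event.type === 'transcript.text.segment') {
    console.log(`\n[${event.speaker}] ${event.text}`); // completed diarized segment
  }
}
```
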
diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts
index 1e5a09c4e..2eb4bb776 100644
--- a/src/resources/realtime/realtime.ts
+++ b/src/resources/realtime/realtime.ts
@@ -32,16 +32,17 @@ export interface AudioTranscription {
/**
* The model to use for transcription. Current options are `whisper-1`,
- * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ * `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+ * Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
*/
- model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe';
+ model?: 'whisper-1' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe' | 'gpt-4o-transcribe-diarize';
/**
* An optional text to guide the model's style or continue a previous audio
* segment. For `whisper-1`, the
* [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- * "expect words related to technology".
+ * For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+ * prompt is a free text string, for example "expect words related to technology".
*/
prompt?: string;
}
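
On the realtime side, `AudioTranscription` is the shape supplied for input-audio transcription in a session configuration. A rough sketch of where the new model value plugs in (the surrounding session nesting is an assumption here, not part of this diff, which only changes the model union):

```ts
// Sketch only: the exact session payload shape is an assumption.
const sessionUpdate = {
  type: 'session.update',
  session: {
    audio: {
      input: {
        transcription: {
          model: 'gpt-4o-transcribe-diarize', // new diarizing option
          language: 'en',
        },
      },
    },
  },
};
```
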
diff --git a/src/resources/vector-stores/vector-stores.ts b/src/resources/vector-stores/vector-stores.ts
index 4026c0f15..e0ee9d88b 100644
--- a/src/resources/vector-stores/vector-stores.ts
+++ b/src/resources/vector-stores/vector-stores.ts
@@ -363,6 +363,12 @@ export interface VectorStoreCreateParams {
*/
chunking_strategy?: FileChunkingStrategyParam;
+ /**
+ * A description for the vector store. Can be used to describe the vector store's
+ * purpose.
+ */
+ description?: string;
+
/**
* The expiration policy for a vector store.
*/
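
The new `description` field is a simple metadata addition to vector store creation. A minimal sketch (name and description are placeholders):

```ts
import OpenAI from 'openai';

const client = new OpenAI();

// `description` documents what the store is for.
const vectorStore = await client.vectorStores.create({
  name: 'support-articles',
  description: 'Help-center articles used for retrieval in support chats.',
});
```
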
diff --git a/src/version.ts b/src/version.ts
index e33d7ab0f..200bf6322 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1 +1 @@
-export const VERSION = '6.3.0'; // x-release-please-version
+export const VERSION = '6.4.0'; // x-release-please-version
diff --git a/tests/api-resources/audio/transcriptions.test.ts b/tests/api-resources/audio/transcriptions.test.ts
index 4111b519f..34441c3f3 100644
--- a/tests/api-resources/audio/transcriptions.test.ts
+++ b/tests/api-resources/audio/transcriptions.test.ts
@@ -28,6 +28,8 @@ describe('resource transcriptions', () => {
model: 'gpt-4o-transcribe',
chunking_strategy: 'auto',
include: ['logprobs'],
+ known_speaker_names: ['string'],
+ known_speaker_references: ['string'],
language: 'language',
prompt: 'prompt',
response_format: 'json',