diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 5c753b441..8e67d2540 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
 {
-  ".": "6.3.0"
+  ".": "6.4.0"
 }
diff --git a/.stats.yml b/.stats.yml
index e68631a07..09d2eb1de 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 135
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml
-openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
-config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
+openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
+config_hash: 03b48e9b8c7231a902403210dbd7dfa0
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8e057855f..d5e2b756c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## 6.4.0 (2025-10-16)
+
+Full Changelog: [v6.3.0...v6.4.0](https://github.com/openai/openai-node/compare/v6.3.0...v6.4.0)
+
+### Features
+
+* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([2d27392](https://github.com/openai/openai-node/commit/2d27392ac1cd082f7defb730326d11d8e733353f))
+
 ## 6.3.0 (2025-10-10)
 
 Full Changelog: [v6.2.0...v6.3.0](https://github.com/openai/openai-node/compare/v6.2.0...v6.3.0)
diff --git a/api.md b/api.md
index 17b9d6797..4f01df498 100644
--- a/api.md
+++ b/api.md
@@ -156,11 +156,14 @@ Types:
 Types:
 
 - <code><a href="./src/resources/audio/transcriptions.ts">Transcription</a></code>
+- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionDiarized</a></code>
+- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionDiarizedSegment</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionInclude</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionSegment</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionStreamEvent</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextDeltaEvent</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextDoneEvent</a></code>
+- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextSegmentEvent</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionVerbose</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionWord</a></code>
 - <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionCreateResponse</a></code>
diff --git a/jsr.json b/jsr.json
index e23f193a5..0f22fd072 100644
--- a/jsr.json
+++ b/jsr.json
@@ -1,6 +1,6 @@
 {
   "name": "@openai/openai",
-  "version": "6.3.0",
+  "version": "6.4.0",
   "exports": {
     ".": "./index.ts",
     "./helpers/zod": "./helpers/zod.ts",
diff --git a/package.json b/package.json
index da7ad39b5..7c9108122 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "openai",
-  "version": "6.3.0",
+  "version": "6.4.0",
   "description": "The official TypeScript library for the OpenAI API",
   "author": "OpenAI <support@openai.com>",
   "types": "dist/index.d.ts",
diff --git a/src/resources/audio/audio.ts b/src/resources/audio/audio.ts
index 081db7d99..b17ae2863 100644
--- a/src/resources/audio/audio.ts
+++ b/src/resources/audio/audio.ts
@@ -10,11 +10,14 @@ import {
   TranscriptionCreateParamsNonStreaming,
   TranscriptionCreateParamsStreaming,
   TranscriptionCreateResponse,
+  TranscriptionDiarized,
+  TranscriptionDiarizedSegment,
   TranscriptionInclude,
   TranscriptionSegment,
   TranscriptionStreamEvent,
   TranscriptionTextDeltaEvent,
   TranscriptionTextDoneEvent,
+  TranscriptionTextSegmentEvent,
   TranscriptionVerbose,
   TranscriptionWord,
   Transcriptions,
@@ -34,14 +37,20 @@ export class Audio extends APIResource {
   speech: SpeechAPI.Speech = new SpeechAPI.Speech(this._client);
 }
 
-export type AudioModel = 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe';
+export type AudioModel =
+  | 'whisper-1'
+  | 'gpt-4o-transcribe'
+  | 'gpt-4o-mini-transcribe'
+  | 'gpt-4o-transcribe-diarize';
 
 /**
  * The format of the output, in one of these options: `json`, `text`, `srt`,
- * `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- * the only supported format is `json`.
+ * `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ * `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ * `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ * `diarized_json`, with `diarized_json` required to receive speaker annotations.
  */
-export type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt';
+export type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json';
 
 Audio.Transcriptions = Transcriptions;
 Audio.Translations = Translations;
@@ -53,11 +62,14 @@ export declare namespace Audio {
   export {
     Transcriptions as Transcriptions,
     type Transcription as Transcription,
+    type TranscriptionDiarized as TranscriptionDiarized,
+    type TranscriptionDiarizedSegment as TranscriptionDiarizedSegment,
     type TranscriptionInclude as TranscriptionInclude,
     type TranscriptionSegment as TranscriptionSegment,
     type TranscriptionStreamEvent as TranscriptionStreamEvent,
     type TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent,
     type TranscriptionTextDoneEvent as TranscriptionTextDoneEvent,
+    type TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent,
     type TranscriptionVerbose as TranscriptionVerbose,
     type TranscriptionWord as TranscriptionWord,
     type TranscriptionCreateResponse as TranscriptionCreateResponse,
diff --git a/src/resources/audio/index.ts b/src/resources/audio/index.ts
index deed39ede..f030c4ee9 100644
--- a/src/resources/audio/index.ts
+++ b/src/resources/audio/index.ts
@@ -5,11 +5,14 @@ export { Speech, type SpeechModel, type SpeechCreateParams } from './speech';
 export {
   Transcriptions,
   type Transcription,
+  type TranscriptionDiarized,
+  type TranscriptionDiarizedSegment,
   type TranscriptionInclude,
   type TranscriptionSegment,
   type TranscriptionStreamEvent,
   type TranscriptionTextDeltaEvent,
   type TranscriptionTextDoneEvent,
+  type TranscriptionTextSegmentEvent,
   type TranscriptionVerbose,
   type TranscriptionWord,
   type TranscriptionCreateResponse,
diff --git a/src/resources/audio/transcriptions.ts b/src/resources/audio/transcriptions.ts
index 6fbbf8c3a..3cf6c84ad 100644
--- a/src/resources/audio/transcriptions.ts
+++ b/src/resources/audio/transcriptions.ts
@@ -166,6 +166,138 @@ export namespace Transcription {
   }
 }
 
+/**
+ * Represents a diarized transcription response returned by the model, including
+ * the combined transcript and speaker-segment annotations.
+ */
+export interface TranscriptionDiarized {
+  /**
+   * Duration of the input audio in seconds.
+   */
+  duration: number;
+
+  /**
+   * Segments of the transcript annotated with timestamps and speaker labels.
+   */
+  segments: Array<TranscriptionDiarizedSegment>;
+
+  /**
+   * The type of task that was run. Always `transcribe`.
+   */
+  task: 'transcribe';
+
+  /**
+   * The concatenated transcript text for the entire audio input.
+   */
+  text: string;
+
+  /**
+   * Token or duration usage statistics for the request.
+   */
+  usage?: TranscriptionDiarized.Tokens | TranscriptionDiarized.Duration;
+}
+
+export namespace TranscriptionDiarized {
+  /**
+   * Usage statistics for models billed by token usage.
+   */
+  export interface Tokens {
+    /**
+     * Number of input tokens billed for this request.
+     */
+    input_tokens: number;
+
+    /**
+     * Number of output tokens generated.
+     */
+    output_tokens: number;
+
+    /**
+     * Total number of tokens used (input + output).
+     */
+    total_tokens: number;
+
+    /**
+     * The type of the usage object. Always `tokens` for this variant.
+     */
+    type: 'tokens';
+
+    /**
+     * Details about the input tokens billed for this request.
+     */
+    input_token_details?: Tokens.InputTokenDetails;
+  }
+
+  export namespace Tokens {
+    /**
+     * Details about the input tokens billed for this request.
+     */
+    export interface InputTokenDetails {
+      /**
+       * Number of audio tokens billed for this request.
+       */
+      audio_tokens?: number;
+
+      /**
+       * Number of text tokens billed for this request.
+       */
+      text_tokens?: number;
+    }
+  }
+
+  /**
+   * Usage statistics for models billed by audio input duration.
+   */
+  export interface Duration {
+    /**
+     * Duration of the input audio in seconds.
+     */
+    seconds: number;
+
+    /**
+     * The type of the usage object. Always `duration` for this variant.
+     */
+    type: 'duration';
+  }
+}
+
+/**
+ * A segment of diarized transcript text with speaker metadata.
+ */
+export interface TranscriptionDiarizedSegment {
+  /**
+   * Unique identifier for the segment.
+   */
+  id: string;
+
+  /**
+   * End timestamp of the segment in seconds.
+   */
+  end: number;
+
+  /**
+   * Speaker label for this segment. When known speakers are provided, the label
+   * matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+   * using capital letters (`A`, `B`, ...).
+   */
+  speaker: string;
+
+  /**
+   * Start timestamp of the segment in seconds.
+   */
+  start: number;
+
+  /**
+   * Transcript text for this segment.
+   */
+  text: string;
+
+  /**
+   * The type of the segment. Always `transcript.text.segment`.
+   */
+  type: 'transcript.text.segment';
+}
+
 export type TranscriptionInclude = 'logprobs';
 
 export interface TranscriptionSegment {
@@ -224,12 +356,15 @@
 }
 
 /**
- * Emitted when there is an additional text delta. This is also the first event
- * emitted when the transcription starts. Only emitted when you
+ * Emitted when a diarized transcription returns a completed segment with speaker
+ * information. Only emitted when you
  * [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
- * with the `Stream` parameter set to `true`.
+ * with `stream` set to `true` and `response_format` set to `diarized_json`.
  */
-export type TranscriptionStreamEvent = TranscriptionTextDeltaEvent | TranscriptionTextDoneEvent;
+export type TranscriptionStreamEvent =
+  | TranscriptionTextSegmentEvent
+  | TranscriptionTextDeltaEvent
+  | TranscriptionTextDoneEvent;
 
 /**
  * Emitted when there is an additional text delta. This is also the first event
@@ -254,6 +389,12 @@ export interface TranscriptionTextDeltaEvent {
    * with the `include[]` parameter set to `logprobs`.
    */
   logprobs?: Array<TranscriptionTextDeltaEvent.Logprob>;
+
+  /**
+   * Identifier of the diarized segment that this delta belongs to. Only present when
+   * using `gpt-4o-transcribe-diarize`.
+   */
+  segment_id?: string;
 }
 
 export namespace TranscriptionTextDeltaEvent {
@@ -372,6 +513,44 @@ export namespace TranscriptionTextDoneEvent {
   }
 }
 
+/**
+ * Emitted when a diarized transcription returns a completed segment with speaker
+ * information. Only emitted when you
+ * [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+ * with `stream` set to `true` and `response_format` set to `diarized_json`.
+ */
+export interface TranscriptionTextSegmentEvent {
+  /**
+   * Unique identifier for the segment.
+   */
+  id: string;
+
+  /**
+   * End timestamp of the segment in seconds.
+   */
+  end: number;
+
+  /**
+   * Speaker label for this segment.
+   */
+  speaker: string;
+
+  /**
+   * Start timestamp of the segment in seconds.
+   */
+  start: number;
+
+  /**
+   * Transcript text for this segment.
+   */
+  text: string;
+
+  /**
+   * The type of the event. Always `transcript.text.segment`.
+   */
+  type: 'transcript.text.segment';
+}
+
 /**
  * Represents a verbose json transcription response returned by model, based on the
  * provided input.
@@ -446,7 +625,7 @@ export interface TranscriptionWord {
  * Represents a transcription response returned by model, based on the provided
  * input.
  */
-export type TranscriptionCreateResponse = Transcription | TranscriptionVerbose;
+export type TranscriptionCreateResponse = Transcription | TranscriptionDiarized | TranscriptionVerbose;
 
 export type TranscriptionCreateParams<
   ResponseFormat extends AudioAPI.AudioResponseFormat | undefined = AudioAPI.AudioResponseFormat | undefined,
@@ -463,8 +642,8 @@ export interface TranscriptionCreateParamsBase<
 
   /**
    * ID of the model to use. The options are `gpt-4o-transcribe`,
-   * `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-   * Whisper V2 model).
+   * `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+   * Whisper V2 model), and `gpt-4o-transcribe-diarize`.
    */
   model: (string & {}) | AudioAPI.AudioModel;
 
@@ -473,6 +652,8 @@ export interface TranscriptionCreateParamsBase<
    * first normalizes loudness and then uses voice activity detection (VAD) to choose
    * boundaries. `server_vad` object can be provided to tweak VAD detection
    * parameters manually. If unset, the audio is transcribed as a single block.
+   * Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+   * seconds.
    */
   chunking_strategy?: 'auto' | TranscriptionCreateParams.VadConfig | null;
 
@@ -481,10 +662,27 @@ export interface TranscriptionCreateParamsBase<
    * return the log probabilities of the tokens in the response to understand the
    * model's confidence in the transcription. `logprobs` only works with
    * response_format set to `json` and only with the models `gpt-4o-transcribe` and
-   * `gpt-4o-mini-transcribe`.
+   * `gpt-4o-mini-transcribe`. This field is not supported when using
+   * `gpt-4o-transcribe-diarize`.
    */
   include?: Array<TranscriptionInclude>;
 
+  /**
+   * Optional list of speaker names that correspond to the audio samples provided in
+   * `known_speaker_references[]`. Each entry should be a short identifier (for
+   * example `customer` or `agent`). Up to 4 speakers are supported.
+   */
+  known_speaker_names?: Array<string>;
+
+  /**
+   * Optional list of audio samples (as
+   * [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+   * that contain known speaker references matching `known_speaker_names[]`. Each
+   * sample must be between 2 and 10 seconds, and can use any of the same input audio
+   * formats supported by `file`.
+   */
+  known_speaker_references?: Array<string>;
+
   /**
    * The language of the input audio. Supplying the input language in
    * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -496,14 +694,17 @@ export interface TranscriptionCreateParamsBase<
    * An optional text to guide the model's style or continue a previous audio
    * segment. The
    * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-   * should match the audio language.
+   * should match the audio language. This field is not supported when using
+   * `gpt-4o-transcribe-diarize`.
    */
   prompt?: string;
 
   /**
    * The format of the output, in one of these options: `json`, `text`, `srt`,
-   * `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-   * the only supported format is `json`.
+   * `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+   * `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+   * `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+   * `diarized_json`, with `diarized_json` required to receive speaker annotations.
    */
   response_format?: ResponseFormat;
 
@@ -533,7 +734,8 @@ export interface TranscriptionCreateParamsBase<
    * `response_format` must be set `verbose_json` to use timestamp granularities.
    * Either or both of these options are supported: `word`, or `segment`. Note: There
    * is no additional latency for segment timestamps, but generating word timestamps
-   * incurs additional latency.
+   * incurs additional latency. This option is not available for
+   * `gpt-4o-transcribe-diarize`.
    */
   timestamp_granularities?: Array<'word' | 'segment'>;
 }
@@ -602,11 +804,14 @@ export interface TranscriptionCreateParamsStreaming extends TranscriptionCreateP
 export declare namespace Transcriptions {
   export {
     type Transcription as Transcription,
+    type TranscriptionDiarized as TranscriptionDiarized,
+    type TranscriptionDiarizedSegment as TranscriptionDiarizedSegment,
     type TranscriptionInclude as TranscriptionInclude,
     type TranscriptionSegment as TranscriptionSegment,
     type TranscriptionStreamEvent as TranscriptionStreamEvent,
     type TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent,
     type TranscriptionTextDoneEvent as TranscriptionTextDoneEvent,
+    type TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent,
     type TranscriptionVerbose as TranscriptionVerbose,
     type TranscriptionWord as TranscriptionWord,
     type TranscriptionCreateResponse as TranscriptionCreateResponse,
diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts
index 1e5a09c4e..2eb4bb776 100644
--- a/src/resources/realtime/realtime.ts
+++ b/src/resources/realtime/realtime.ts
@@ -32,16 +32,17 @@ export interface AudioTranscription {
 
   /**
    * The model to use for transcription. Current options are `whisper-1`,
-   * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+   * `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+   * Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
    */
-  model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe';
+  model?: 'whisper-1' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe' | 'gpt-4o-transcribe-diarize';
 
   /**
    * An optional text to guide the model's style or continue a previous audio
    * segment. For `whisper-1`, the
    * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-   * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
-   * "expect words related to technology".
+   * For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+   * prompt is a free text string, for example "expect words related to technology".
    */
   prompt?: string;
 }
diff --git a/src/resources/vector-stores/vector-stores.ts b/src/resources/vector-stores/vector-stores.ts
index 4026c0f15..e0ee9d88b 100644
--- a/src/resources/vector-stores/vector-stores.ts
+++ b/src/resources/vector-stores/vector-stores.ts
@@ -363,6 +363,12 @@ export interface VectorStoreCreateParams {
    */
   chunking_strategy?: FileChunkingStrategyParam;
 
+  /**
+   * A description for the vector store. Can be used to describe the vector store's
+   * purpose.
+   */
+  description?: string;
+
   /**
    * The expiration policy for a vector store.
    */
diff --git a/src/version.ts b/src/version.ts
index e33d7ab0f..200bf6322 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1 +1 @@
-export const VERSION = '6.3.0'; // x-release-please-version
+export const VERSION = '6.4.0'; // x-release-please-version
diff --git a/tests/api-resources/audio/transcriptions.test.ts b/tests/api-resources/audio/transcriptions.test.ts
index 4111b519f..34441c3f3 100644
--- a/tests/api-resources/audio/transcriptions.test.ts
+++ b/tests/api-resources/audio/transcriptions.test.ts
@@ -28,6 +28,8 @@ describe('resource transcriptions', () => {
       model: 'gpt-4o-transcribe',
       chunking_strategy: 'auto',
       include: ['logprobs'],
+      known_speaker_names: ['string'],
+      known_speaker_references: ['string'],
       language: 'language',
       prompt: 'prompt',
       response_format: 'json',
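
For context, here is a minimal sketch of how the surface added in this diff might be exercised from user code. The file name, audio content, and speaker names are placeholder assumptions, and the exact return-type narrowing for `response_format: 'diarized_json'` depends on the SDK's `create()` overloads; the fields used below are the ones defined by `TranscriptionDiarized`, `TranscriptionDiarizedSegment`, and `TranscriptionTextSegmentEvent` above.

```ts
import fs from 'node:fs';
import OpenAI from 'openai';

const client = new OpenAI();

async function main() {
  // Non-streaming: request speaker annotations via the new `diarized_json` format.
  // Per the param docs above, `chunking_strategy` is required with
  // gpt-4o-transcribe-diarize for inputs longer than 30 seconds.
  const transcription = await client.audio.transcriptions.create({
    file: fs.createReadStream('meeting.wav'), // placeholder input file
    model: 'gpt-4o-transcribe-diarize',
    response_format: 'diarized_json',
    chunking_strategy: 'auto',
    // Optionally identify up to 4 known speakers with short names plus matching
    // data-URL audio samples (2-10 seconds each), e.g.:
    // known_speaker_names: ['agent', 'customer'],
    // known_speaker_references: ['data:audio/wav;base64,...', 'data:audio/wav;base64,...'],
  });

  // A `diarized_json` response carries speaker-labeled segments.
  for (const segment of transcription.segments) {
    console.log(`[${segment.speaker}] ${segment.start}s-${segment.end}s: ${segment.text}`);
  }

  // Streaming: completed diarized segments arrive as `transcript.text.segment`
  // events (TranscriptionTextSegmentEvent), interleaved with delta/done events.
  const stream = await client.audio.transcriptions.create({
    file: fs.createReadStream('meeting.wav'),
    model: 'gpt-4o-transcribe-diarize',
    response_format: 'diarized_json',
    chunking_strategy: 'auto',
    stream: true,
  });

  for await (const event of stream) {
    if (event.type === 'transcript.text.segment') {
      console.log(`${event.speaker}: ${event.text}`);
    }
  }
}

main();
```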