Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .release-please-manifest.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
".": "5.20.1"
".": "5.20.2"
}
4 changes: 2 additions & 2 deletions .stats.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
configured_endpoints: 118
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-94b1e3cb0bdc616ff0c2f267c33dadd95f133b1f64e647aab6c64afb292b2793.yml
openapi_spec_hash: 2395319ac9befd59b6536ae7f9564a05
config_hash: 930dac3aa861344867e4ac84f037b5df
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
# Changelog

## 5.20.2 (2025-09-12)

Full Changelog: [v5.20.1...v5.20.2](https://github.com/openai/openai-node/compare/v5.20.1...v5.20.2)

### Bug Fixes

* coerce nullable values to undefined ([836d1b4](https://github.com/openai/openai-node/commit/836d1b4cdd077c206e1c647c762f4c16e9db444c))


### Chores

* **api:** Minor docs and type updates for realtime ([ccb00dc](https://github.com/openai/openai-node/commit/ccb00dcbd1466976045aafee152cbc038bb293b9))

## 5.20.1 (2025-09-10)

Full Changelog: [v5.20.0...v5.20.1](https://github.com/openai/openai-node/compare/v5.20.0...v5.20.1)
Expand Down
2 changes: 1 addition & 1 deletion jsr.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@openai/openai",
"version": "5.20.1",
"version": "5.20.2",
"exports": {
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "openai",
"version": "5.20.1",
"version": "5.20.2",
"description": "The official TypeScript library for the OpenAI API",
"author": "OpenAI <[email protected]>",
"types": "dist/index.d.ts",
Expand Down
6 changes: 3 additions & 3 deletions src/internal/utils/values.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,21 @@ export const coerceBoolean = (value: unknown): boolean => {
};

export const maybeCoerceInteger = (value: unknown): number | undefined => {
if (value === undefined) {
if (value == null) {
return undefined;
}
return coerceInteger(value);
};

export const maybeCoerceFloat = (value: unknown): number | undefined => {
if (value === undefined) {
if (value == null) {
return undefined;
}
return coerceFloat(value);
};

export const maybeCoerceBoolean = (value: unknown): boolean | undefined => {
if (value === undefined) {
if (value == null) {
return undefined;
}
return coerceBoolean(value);
Expand Down
93 changes: 61 additions & 32 deletions src/resources/realtime/client-secrets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -181,16 +181,19 @@ export namespace RealtimeSessionCreateResponse {
/**
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
* set to `null` to turn off, in which case the client must manually trigger model
* response. Server VAD means that the model will detect the start and end of
* speech based on audio volume and respond at the end of user speech. Semantic VAD
* is more advanced and uses a turn detection model (in conjunction with VAD) to
* semantically estimate whether the user has finished speaking, then dynamically
* sets a timeout based on this probability. For example, if user audio trails off
* with "uhhm", the model will score a low probability of turn end and wait longer
* for the user to continue speaking. This can be useful for more natural
* conversations, but may have a higher latency.
* response.
*
* Server VAD means that the model will detect the start and end of speech based on
* audio volume and respond at the end of user speech.
*
* Semantic VAD is more advanced and uses a turn detection model (in conjunction
* with VAD) to semantically estimate whether the user has finished speaking, then
* dynamically sets a timeout based on this probability. For example, if user audio
* trails off with "uhhm", the model will score a low probability of turn end and
* wait longer for the user to continue speaking. This can be useful for more
* natural conversations, but may have a higher latency.
*/
turn_detection?: Input.TurnDetection;
turn_detection?: Input.ServerVad | Input.SemanticVad | null;
}

export namespace Input {
Expand All @@ -211,35 +214,34 @@ export namespace RealtimeSessionCreateResponse {
}

/**
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
* set to `null` to turn off, in which case the client must manually trigger model
* response. Server VAD means that the model will detect the start and end of
* speech based on audio volume and respond at the end of user speech. Semantic VAD
* is more advanced and uses a turn detection model (in conjunction with VAD) to
* semantically estimate whether the user has finished speaking, then dynamically
* sets a timeout based on this probability. For example, if user audio trails off
* with "uhhm", the model will score a low probability of turn end and wait longer
* for the user to continue speaking. This can be useful for more natural
* conversations, but may have a higher latency.
* Server-side voice activity detection (VAD) which flips on when user speech is
* detected and off after a period of silence.
*/
export interface TurnDetection {
export interface ServerVad {
/**
* Whether or not to automatically generate a response when a VAD stop event
* occurs.
* Type of turn detection, `server_vad` to turn on simple Server VAD.
*/
create_response?: boolean;
type: 'server_vad';

/**
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
* will wait longer for the user to continue speaking, `high` will respond more
* quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
* and `high` have max timeouts of 8s, 4s, and 2s respectively.
* Whether or not to automatically generate a response when a VAD stop event
* occurs.
*/
eagerness?: 'low' | 'medium' | 'high' | 'auto';
create_response?: boolean;

/**
* Optional idle timeout after which turn detection will auto-timeout when no
* additional audio is received and emits a `timeout_triggered` event.
* Optional timeout after which a model response will be triggered automatically.
* This is useful for situations in which a long pause from the user is unexpected,
* such as a phone call. The model will effectively prompt the user to continue the
* conversation based on the current context.
*
* The timeout value will be applied after the last model response's audio has
* finished playing, i.e. it's set to the `response.done` time plus audio playback
* duration.
*
* An `input_audio_buffer.timeout_triggered` event (plus events associated with the
* Response) will be emitted when the timeout is reached. Idle timeout is currently
* only supported for `server_vad` mode.
*/
idle_timeout_ms?: number | null;

Expand Down Expand Up @@ -269,11 +271,38 @@ export namespace RealtimeSessionCreateResponse {
* model, and thus might perform better in noisy environments.
*/
threshold?: number;
}

/**
* Server-side semantic turn detection which uses a model to determine when the
* user has finished speaking.
*/
export interface SemanticVad {
/**
* Type of turn detection.
* Type of turn detection, `semantic_vad` to turn on Semantic VAD.
*/
type?: 'server_vad' | 'semantic_vad';
type: 'semantic_vad';

/**
* Whether or not to automatically generate a response when a VAD stop event
* occurs.
*/
create_response?: boolean;

/**
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
* will wait longer for the user to continue speaking, `high` will respond more
* quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
* and `high` have max timeouts of 8s, 4s, and 2s respectively.
*/
eagerness?: 'low' | 'medium' | 'high' | 'auto';

/**
* Whether or not to automatically interrupt any ongoing response with output to
* the default conversation (i.e. `conversation` of `auto`) when a VAD start event
* occurs.
*/
interrupt_response?: boolean;
}
}

Expand Down
Loading