From ccb00dcbd1466976045aafee152cbc038bb293b9 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:04:06 +0000 Subject: [PATCH 1/4] chore(api): Minor docs and type updates for realtime --- .stats.yml | 4 +- src/resources/realtime/client-secrets.ts | 93 +++-- src/resources/realtime/realtime.ts | 452 +++++++++++++++-------- src/resources/responses/responses.ts | 16 +- 4 files changed, 371 insertions(+), 194 deletions(-) diff --git a/.stats.yml b/.stats.yml index 2aa16be87..5388f2463 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml -openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml +openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 config_hash: 930dac3aa861344867e4ac84f037b5df diff --git a/src/resources/realtime/client-secrets.ts b/src/resources/realtime/client-secrets.ts index 5c53b2e5a..70abeabbf 100644 --- a/src/resources/realtime/client-secrets.ts +++ b/src/resources/realtime/client-secrets.ts @@ -181,16 +181,19 @@ export namespace RealtimeSessionCreateResponse { /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. 
For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. + * response. + * + * Server VAD means that the model will detect the start and end of speech based on + * audio volume and respond at the end of user speech. + * + * Semantic VAD is more advanced and uses a turn detection model (in conjunction + * with VAD) to semantically estimate whether the user has finished speaking, then + * dynamically sets a timeout based on this probability. For example, if user audio + * trails off with "uhhm", the model will score a low probability of turn end and + * wait longer for the user to continue speaking. This can be useful for more + * natural conversations, but may have a higher latency. */ - turn_detection?: Input.TurnDetection; + turn_detection?: Input.ServerVad | Input.SemanticVad | null; } export namespace Input { @@ -211,35 +214,34 @@ export namespace RealtimeSessionCreateResponse { } /** - * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. 
+ * Server-side voice activity detection (VAD) which flips on when user speech is + * detected and off after a period of silence. */ - export interface TurnDetection { + export interface ServerVad { /** - * Whether or not to automatically generate a response when a VAD stop event - * occurs. + * Type of turn detection, `server_vad` to turn on simple Server VAD. */ - create_response?: boolean; + type: 'server_vad'; /** - * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - * will wait longer for the user to continue speaking, `high` will respond more - * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - * and `high` have max timeouts of 8s, 4s, and 2s respectively. + * Whether or not to automatically generate a response when a VAD stop event + * occurs. */ - eagerness?: 'low' | 'medium' | 'high' | 'auto'; + create_response?: boolean; /** - * Optional idle timeout after which turn detection will auto-timeout when no - * additional audio is received and emits a `timeout_triggered` event. + * Optional timeout after which a model response will be triggered automatically. + * This is useful for situations in which a long pause from the user is unexpected, + * such as a phone call. The model will effectively prompt the user to continue the + * conversation based on the current context. + * + * The timeout value will be applied after the last model response's audio has + * finished playing, i.e. it's set to the `response.done` time plus audio playback + * duration. + * + * An `input_audio_buffer.timeout_triggered` event (plus events associated with the + * Response) will be emitted when the timeout is reached. Idle timeout is currently + * only supported for `server_vad` mode. */ idle_timeout_ms?: number | null; @@ -269,11 +271,38 @@ export namespace RealtimeSessionCreateResponse { * model, and thus might perform better in noisy environments. 
*/ threshold?: number; + } + /** + * Server-side semantic turn detection which uses a model to determine when the + * user has finished speaking. + */ + export interface SemanticVad { /** - * Type of turn detection. + * Type of turn detection, `semantic_vad` to turn on Semantic VAD. */ - type?: 'server_vad' | 'semantic_vad'; + type: 'semantic_vad'; + + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + * and `high` have max timeouts of 8s, 4s, and 2s respectively. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; + + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; } } diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts index 12f1f6848..3fe58e7be 100644 --- a/src/resources/realtime/realtime.ts +++ b/src/resources/realtime/realtime.ts @@ -933,16 +933,33 @@ export interface InputAudioBufferSpeechStoppedEvent { } /** - * Returned when the server VAD timeout is triggered for the input audio buffer. + * Returned when the Server VAD timeout is triggered for the input audio buffer. + * This is configured with `idle_timeout_ms` in the `turn_detection` settings of + * the session, and it indicates that there hasn't been any speech detected for the + * configured duration. + * + * The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio + * after the last model response up to the triggering time, as an offset from the + * beginning of audio written to the input audio buffer. 
This means it demarcates + * the segment of audio that was silent and the difference between the start and + * end values will roughly match the configured timeout. + * + * The empty audio will be committed to the conversation as an `input_audio` item + * (there will be a `input_audio_buffer.committed` event) and a model response will + * be generated. There may be speech that didn't trigger VAD but is still detected + * by the model, so the model may respond with something relevant to the + * conversation or a prompt to continue speaking. */ export interface InputAudioBufferTimeoutTriggered { /** - * Millisecond offset where speech ended within the buffered audio. + * Millisecond offset of audio written to the input audio buffer at the time the + * timeout was triggered. */ audio_end_ms: number; /** - * Millisecond offset where speech started within the buffered audio. + * Millisecond offset of audio written to the input audio buffer that was after the + * playback time of the last model response. */ audio_start_ms: number; @@ -1154,16 +1171,19 @@ export interface RealtimeAudioConfigInput { /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. - */ - turn_detection?: RealtimeAudioInputTurnDetection; + * response. 
+ * + * Server VAD means that the model will detect the start and end of speech based on + * audio volume and respond at the end of user speech. + * + * Semantic VAD is more advanced and uses a turn detection model (in conjunction + * with VAD) to semantically estimate whether the user has finished speaking, then + * dynamically sets a timeout based on this probability. For example, if user audio + * trails off with "uhhm", the model will score a low probability of turn end and + * wait longer for the user to continue speaking. This can be useful for more + * natural conversations, but may have a higher latency. + */ + turn_detection?: RealtimeAudioInputTurnDetection | null; } export namespace RealtimeAudioConfigInput { @@ -1269,67 +1289,114 @@ export namespace RealtimeAudioFormats { /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. + * response. + * + * Server VAD means that the model will detect the start and end of speech based on + * audio volume and respond at the end of user speech. + * + * Semantic VAD is more advanced and uses a turn detection model (in conjunction + * with VAD) to semantically estimate whether the user has finished speaking, then + * dynamically sets a timeout based on this probability. 
For example, if user audio + * trails off with "uhhm", the model will score a low probability of turn end and + * wait longer for the user to continue speaking. This can be useful for more + * natural conversations, but may have a higher latency. */ -export interface RealtimeAudioInputTurnDetection { - /** - * Whether or not to automatically generate a response when a VAD stop event - * occurs. - */ - create_response?: boolean; +export type RealtimeAudioInputTurnDetection = + | RealtimeAudioInputTurnDetection.ServerVad + | RealtimeAudioInputTurnDetection.SemanticVad; +export namespace RealtimeAudioInputTurnDetection { /** - * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - * will wait longer for the user to continue speaking, `high` will respond more - * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - * and `high` have max timeouts of 8s, 4s, and 2s respectively. + * Server-side voice activity detection (VAD) which flips on when user speech is + * detected and off after a period of silence. */ - eagerness?: 'low' | 'medium' | 'high' | 'auto'; + export interface ServerVad { + /** + * Type of turn detection, `server_vad` to turn on simple Server VAD. + */ + type: 'server_vad'; - /** - * Optional idle timeout after which turn detection will auto-timeout when no - * additional audio is received and emits a `timeout_triggered` event. - */ - idle_timeout_ms?: number | null; + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; - /** - * Whether or not to automatically interrupt any ongoing response with output to - * the default conversation (i.e. `conversation` of `auto`) when a VAD start event - * occurs. - */ - interrupt_response?: boolean; + /** + * Optional timeout after which a model response will be triggered automatically. 
+ * This is useful for situations in which a long pause from the user is unexpected, + * such as a phone call. The model will effectively prompt the user to continue the + * conversation based on the current context. + * + * The timeout value will be applied after the last model response's audio has + * finished playing, i.e. it's set to the `response.done` time plus audio playback + * duration. + * + * An `input_audio_buffer.timeout_triggered` event (plus events associated with the + * Response) will be emitted when the timeout is reached. Idle timeout is currently + * only supported for `server_vad` mode. + */ + idle_timeout_ms?: number | null; - /** - * Used only for `server_vad` mode. Amount of audio to include before the VAD - * detected speech (in milliseconds). Defaults to 300ms. - */ - prefix_padding_ms?: number; + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; - /** - * Used only for `server_vad` mode. Duration of silence to detect speech stop (in - * milliseconds). Defaults to 500ms. With shorter values the model will respond - * more quickly, but may jump in on short pauses from the user. - */ - silence_duration_ms?: number; + /** + * Used only for `server_vad` mode. Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: number; - /** - * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - * defaults to 0.5. A higher threshold will require louder audio to activate the - * model, and thus might perform better in noisy environments. - */ - threshold?: number; + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. 
With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ + threshold?: number; + } /** - * Type of turn detection. + * Server-side semantic turn detection which uses a model to determine when the + * user has finished speaking. */ - type?: 'server_vad' | 'semantic_vad'; + export interface SemanticVad { + /** + * Type of turn detection, `semantic_vad` to turn on Semantic VAD. + */ + type: 'semantic_vad'; + + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + * and `high` have max timeouts of 8s, 4s, and 2s respectively. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; + + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; + } } /** @@ -2568,7 +2635,7 @@ export namespace RealtimeServerEvent { } /** - * Realtime session object. + * Realtime session object for the beta interface. */ export interface RealtimeSession { /** @@ -2711,16 +2778,19 @@ export interface RealtimeSession { /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model - * response. 
Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. + * response. + * + * Server VAD means that the model will detect the start and end of speech based on + * audio volume and respond at the end of user speech. + * + * Semantic VAD is more advanced and uses a turn detection model (in conjunction + * with VAD) to semantically estimate whether the user has finished speaking, then + * dynamically sets a timeout based on this probability. For example, if user audio + * trails off with "uhhm", the model will score a low probability of turn end and + * wait longer for the user to continue speaking. This can be useful for more + * natural conversations, but may have a higher latency. */ - turn_detection?: RealtimeSession.TurnDetection | null; + turn_detection?: RealtimeSession.ServerVad | RealtimeSession.SemanticVad | null; /** * The voice the model uses to respond. Voice cannot be changed during the session @@ -2782,34 +2852,34 @@ export namespace RealtimeSession { } /** - * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. 
Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. + * Server-side voice activity detection (VAD) which flips on when user speech is + * detected and off after a period of silence. */ - export interface TurnDetection { + export interface ServerVad { /** - * Whether or not to automatically generate a response when a VAD stop event - * occurs. + * Type of turn detection, `server_vad` to turn on simple Server VAD. */ - create_response?: boolean; + type: 'server_vad'; /** - * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - * will wait longer for the user to continue speaking, `high` will respond more - * quickly. `auto` is the default and is equivalent to `medium`. + * Whether or not to automatically generate a response when a VAD stop event + * occurs. */ - eagerness?: 'low' | 'medium' | 'high' | 'auto'; + create_response?: boolean; /** - * Optional idle timeout after which turn detection will auto-timeout when no - * additional audio is received. + * Optional timeout after which a model response will be triggered automatically. + * This is useful for situations in which a long pause from the user is unexpected, + * such as a phone call. The model will effectively prompt the user to continue the + * conversation based on the current context. + * + * The timeout value will be applied after the last model response's audio has + * finished playing, i.e. it's set to the `response.done` time plus audio playback + * duration. 
+ * + * An `input_audio_buffer.timeout_triggered` event (plus events associated with the + * Response) will be emitted when the timeout is reached. Idle timeout is currently + * only supported for `server_vad` mode. */ idle_timeout_ms?: number | null; @@ -2839,11 +2909,38 @@ export namespace RealtimeSession { * model, and thus might perform better in noisy environments. */ threshold?: number; + } + + /** + * Server-side semantic turn detection which uses a model to determine when the + * user has finished speaking. + */ + export interface SemanticVad { + /** + * Type of turn detection, `semantic_vad` to turn on Semantic VAD. + */ + type: 'semantic_vad'; + + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + * and `high` have max timeouts of 8s, 4s, and 2s respectively. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; /** - * Type of turn detection. + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. */ - type?: 'server_vad' | 'semantic_vad'; + interrupt_response?: boolean; } } @@ -3194,16 +3291,19 @@ export interface RealtimeTranscriptionSessionAudioInput { /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. 
Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. - */ - turn_detection?: RealtimeTranscriptionSessionAudioInputTurnDetection; + * response. + * + * Server VAD means that the model will detect the start and end of speech based on + * audio volume and respond at the end of user speech. + * + * Semantic VAD is more advanced and uses a turn detection model (in conjunction + * with VAD) to semantically estimate whether the user has finished speaking, then + * dynamically sets a timeout based on this probability. For example, if user audio + * trails off with "uhhm", the model will score a low probability of turn end and + * wait longer for the user to continue speaking. This can be useful for more + * natural conversations, but may have a higher latency. + */ + turn_detection?: RealtimeTranscriptionSessionAudioInputTurnDetection | null; } export namespace RealtimeTranscriptionSessionAudioInput { @@ -3227,66 +3327,114 @@ export namespace RealtimeTranscriptionSessionAudioInput { /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model - * response. Server VAD means that the model will detect the start and end of - * speech based on audio volume and respond at the end of user speech. Semantic VAD - * is more advanced and uses a turn detection model (in conjunction with VAD) to - * semantically estimate whether the user has finished speaking, then dynamically - * sets a timeout based on this probability. 
For example, if user audio trails off - * with "uhhm", the model will score a low probability of turn end and wait longer - * for the user to continue speaking. This can be useful for more natural - * conversations, but may have a higher latency. + * response. + * + * Server VAD means that the model will detect the start and end of speech based on + * audio volume and respond at the end of user speech. + * + * Semantic VAD is more advanced and uses a turn detection model (in conjunction + * with VAD) to semantically estimate whether the user has finished speaking, then + * dynamically sets a timeout based on this probability. For example, if user audio + * trails off with "uhhm", the model will score a low probability of turn end and + * wait longer for the user to continue speaking. This can be useful for more + * natural conversations, but may have a higher latency. */ -export interface RealtimeTranscriptionSessionAudioInputTurnDetection { - /** - * Whether or not to automatically generate a response when a VAD stop event - * occurs. - */ - create_response?: boolean; +export type RealtimeTranscriptionSessionAudioInputTurnDetection = + | RealtimeTranscriptionSessionAudioInputTurnDetection.ServerVad + | RealtimeTranscriptionSessionAudioInputTurnDetection.SemanticVad; +export namespace RealtimeTranscriptionSessionAudioInputTurnDetection { /** - * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - * will wait longer for the user to continue speaking, `high` will respond more - * quickly. `auto` is the default and is equivalent to `medium`. + * Server-side voice activity detection (VAD) which flips on when user speech is + * detected and off after a period of silence. */ - eagerness?: 'low' | 'medium' | 'high' | 'auto'; + export interface ServerVad { + /** + * Type of turn detection, `server_vad` to turn on simple Server VAD. 
+ */ + type: 'server_vad'; - /** - * Optional idle timeout after which turn detection will auto-timeout when no - * additional audio is received. - */ - idle_timeout_ms?: number | null; + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; - /** - * Whether or not to automatically interrupt any ongoing response with output to - * the default conversation (i.e. `conversation` of `auto`) when a VAD start event - * occurs. - */ - interrupt_response?: boolean; + /** + * Optional timeout after which a model response will be triggered automatically. + * This is useful for situations in which a long pause from the user is unexpected, + * such as a phone call. The model will effectively prompt the user to continue the + * conversation based on the current context. + * + * The timeout value will be applied after the last model response's audio has + * finished playing, i.e. it's set to the `response.done` time plus audio playback + * duration. + * + * An `input_audio_buffer.timeout_triggered` event (plus events associated with the + * Response) will be emitted when the timeout is reached. Idle timeout is currently + * only supported for `server_vad` mode. + */ + idle_timeout_ms?: number | null; - /** - * Used only for `server_vad` mode. Amount of audio to include before the VAD - * detected speech (in milliseconds). Defaults to 300ms. - */ - prefix_padding_ms?: number; + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. + */ + interrupt_response?: boolean; - /** - * Used only for `server_vad` mode. Duration of silence to detect speech stop (in - * milliseconds). Defaults to 500ms. With shorter values the model will respond - * more quickly, but may jump in on short pauses from the user. - */ - silence_duration_ms?: number; + /** + * Used only for `server_vad` mode. 
Amount of audio to include before the VAD + * detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: number; - /** - * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - * defaults to 0.5. A higher threshold will require louder audio to activate the - * model, and thus might perform better in noisy environments. - */ - threshold?: number; + /** + * Used only for `server_vad` mode. Duration of silence to detect speech stop (in + * milliseconds). Defaults to 500ms. With shorter values the model will respond + * more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + * defaults to 0.5. A higher threshold will require louder audio to activate the + * model, and thus might perform better in noisy environments. + */ + threshold?: number; + } /** - * Type of turn detection. + * Server-side semantic turn detection which uses a model to determine when the + * user has finished speaking. */ - type?: 'server_vad' | 'semantic_vad'; + export interface SemanticVad { + /** + * Type of turn detection, `semantic_vad` to turn on Semantic VAD. + */ + type: 'semantic_vad'; + + /** + * Whether or not to automatically generate a response when a VAD stop event + * occurs. + */ + create_response?: boolean; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + * will wait longer for the user to continue speaking, `high` will respond more + * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + * and `high` have max timeouts of 8s, 4s, and 2s respectively. + */ + eagerness?: 'low' | 'medium' | 'high' | 'auto'; + + /** + * Whether or not to automatically interrupt any ongoing response with output to + * the default conversation (i.e. `conversation` of `auto`) when a VAD start event + * occurs. 
+ */ + interrupt_response?: boolean; + } } /** diff --git a/src/resources/responses/responses.ts b/src/resources/responses/responses.ts index b3da02889..91e01bed2 100644 --- a/src/resources/responses/responses.ts +++ b/src/resources/responses/responses.ts @@ -585,10 +585,10 @@ export interface Response { /** * The truncation strategy to use for the model response. * - * - `auto`: If the context of this response and previous ones exceeds the model's - * context window size, the model will truncate the response to fit the context - * window by dropping input items in the middle of the conversation. - * - `disabled` (default): If a model response will exceed the context window size + * - `auto`: If the input to this Response exceeds the model's context window size, + * the model will truncate the response to fit the context window by dropping + * items from the beginning of the conversation. + * - `disabled` (default): If the input size will exceed the context window size * for a model, the request will fail with a 400 error. */ truncation?: 'auto' | 'disabled' | null; @@ -5455,10 +5455,10 @@ export interface ResponseCreateParamsBase { /** * The truncation strategy to use for the model response. * - * - `auto`: If the context of this response and previous ones exceeds the model's - * context window size, the model will truncate the response to fit the context - * window by dropping input items in the middle of the conversation. - * - `disabled` (default): If a model response will exceed the context window size + * - `auto`: If the input to this Response exceeds the model's context window size, + * the model will truncate the response to fit the context window by dropping + * items from the beginning of the conversation. + * - `disabled` (default): If the input size will exceed the context window size * for a model, the request will fail with a 400 error. 
*/ truncation?: 'auto' | 'disabled' | null; From 18c029ab3bcdc4c4fd6a78c660d4ebd6e52b0bf1 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:27:35 +0000 Subject: [PATCH 2/4] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index 5388f2463..e38971896 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml -openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-94b1e3cb0bdc616ff0c2f267c33dadd95f133b1f64e647aab6c64afb292b2793.yml +openapi_spec_hash: 2395319ac9befd59b6536ae7f9564a05 config_hash: 930dac3aa861344867e4ac84f037b5df From 836d1b4cdd077c206e1c647c762f4c16e9db444c Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 19:14:44 +0000 Subject: [PATCH 3/4] fix: coerce nullable values to undefined --- src/internal/utils/values.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/internal/utils/values.ts b/src/internal/utils/values.ts index 801974e84..284ff5cde 100644 --- a/src/internal/utils/values.ts +++ b/src/internal/utils/values.ts @@ -76,21 +76,21 @@ export const coerceBoolean = (value: unknown): boolean => { }; export const maybeCoerceInteger = (value: unknown): number | undefined => { - if (value === undefined) { + if (value == null) { return undefined; } return coerceInteger(value); }; export const maybeCoerceFloat = (value: unknown): number | undefined => { - if (value === undefined) { + if (value == null) { return undefined; } return coerceFloat(value); }; export const maybeCoerceBoolean = (value: unknown): boolean | 
undefined => { - if (value === undefined) { + if (value == null) { return undefined; } return coerceBoolean(value); From cc7ce47f67ff2fcaf59d4e203b75d6c50a522f5a Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 12 Sep 2025 05:07:27 +0000 Subject: [PATCH 4/4] release: 5.20.2 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 13 +++++++++++++ jsr.json | 2 +- package.json | 2 +- src/version.ts | 2 +- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 83fac5c78..afa75b89f 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "5.20.1" + ".": "5.20.2" } diff --git a/CHANGELOG.md b/CHANGELOG.md index b0daffdad..80c50ea85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## 5.20.2 (2025-09-12) + +Full Changelog: [v5.20.1...v5.20.2](https://github.com/openai/openai-node/compare/v5.20.1...v5.20.2) + +### Bug Fixes + +* coerce nullable values to undefined ([836d1b4](https://github.com/openai/openai-node/commit/836d1b4cdd077c206e1c647c762f4c16e9db444c)) + + +### Chores + +* **api:** Minor docs and type updates for realtime ([ccb00dc](https://github.com/openai/openai-node/commit/ccb00dcbd1466976045aafee152cbc038bb293b9)) + ## 5.20.1 (2025-09-10) Full Changelog: [v5.20.0...v5.20.1](https://github.com/openai/openai-node/compare/v5.20.0...v5.20.1) diff --git a/jsr.json b/jsr.json index af3e71220..961bdcde2 100644 --- a/jsr.json +++ b/jsr.json @@ -1,6 +1,6 @@ { "name": "@openai/openai", - "version": "5.20.1", + "version": "5.20.2", "exports": { ".": "./index.ts", "./helpers/zod": "./helpers/zod.ts", diff --git a/package.json b/package.json index 7dac7c900..11e6d6361 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "openai", - "version": "5.20.1", + "version": "5.20.2", "description": "The official TypeScript library for the OpenAI 
API", "author": "OpenAI <support@openai.com>", "types": "dist/index.d.ts", diff --git a/src/version.ts b/src/version.ts index 95318e579..da9ff343d 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1 +1 @@ -export const VERSION = '5.20.1'; // x-release-please-version +export const VERSION = '5.20.2'; // x-release-please-version