From ccb00dcbd1466976045aafee152cbc038bb293b9 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 18:04:06 +0000
Subject: [PATCH 1/4] chore(api): Minor docs and type updates for realtime

---
 .stats.yml                               |   4 +-
 src/resources/realtime/client-secrets.ts |  93 +++--
 src/resources/realtime/realtime.ts       | 452 +++++++++++++++--------
 src/resources/responses/responses.ts     |  16 +-
 4 files changed, 371 insertions(+), 194 deletions(-)

diff --git a/.stats.yml b/.stats.yml
index 2aa16be87..5388f2463 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
-openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
+openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
 config_hash: 930dac3aa861344867e4ac84f037b5df
diff --git a/src/resources/realtime/client-secrets.ts b/src/resources/realtime/client-secrets.ts
index 5c53b2e5a..70abeabbf 100644
--- a/src/resources/realtime/client-secrets.ts
+++ b/src/resources/realtime/client-secrets.ts
@@ -181,16 +181,19 @@ export namespace RealtimeSessionCreateResponse {
       /**
        * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
        * set to `null` to turn off, in which case the client must manually trigger model
-       * response. Server VAD means that the model will detect the start and end of
-       * speech based on audio volume and respond at the end of user speech. Semantic VAD
-       * is more advanced and uses a turn detection model (in conjunction with VAD) to
-       * semantically estimate whether the user has finished speaking, then dynamically
-       * sets a timeout based on this probability. For example, if user audio trails off
-       * with "uhhm", the model will score a low probability of turn end and wait longer
-       * for the user to continue speaking. This can be useful for more natural
-       * conversations, but may have a higher latency.
+       * response.
+       *
+       * Server VAD means that the model will detect the start and end of speech based on
+       * audio volume and respond at the end of user speech.
+       *
+       * Semantic VAD is more advanced and uses a turn detection model (in conjunction
+       * with VAD) to semantically estimate whether the user has finished speaking, then
+       * dynamically sets a timeout based on this probability. For example, if user audio
+       * trails off with "uhhm", the model will score a low probability of turn end and
+       * wait longer for the user to continue speaking. This can be useful for more
+       * natural conversations, but may have a higher latency.
        */
-      turn_detection?: Input.TurnDetection;
+      turn_detection?: Input.ServerVad | Input.SemanticVad | null;
     }
 
     export namespace Input {
@@ -211,35 +214,34 @@ export namespace RealtimeSessionCreateResponse {
       }
 
       /**
-       * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
-       * set to `null` to turn off, in which case the client must manually trigger model
-       * response. Server VAD means that the model will detect the start and end of
-       * speech based on audio volume and respond at the end of user speech. Semantic VAD
-       * is more advanced and uses a turn detection model (in conjunction with VAD) to
-       * semantically estimate whether the user has finished speaking, then dynamically
-       * sets a timeout based on this probability. For example, if user audio trails off
-       * with "uhhm", the model will score a low probability of turn end and wait longer
-       * for the user to continue speaking. This can be useful for more natural
-       * conversations, but may have a higher latency.
+       * Server-side voice activity detection (VAD) which flips on when user speech is
+       * detected and off after a period of silence.
        */
-      export interface TurnDetection {
+      export interface ServerVad {
         /**
-         * Whether or not to automatically generate a response when a VAD stop event
-         * occurs.
+         * Type of turn detection, `server_vad` to turn on simple Server VAD.
          */
-        create_response?: boolean;
+        type: 'server_vad';
 
         /**
-         * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-         * will wait longer for the user to continue speaking, `high` will respond more
-         * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
-         * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+         * Whether or not to automatically generate a response when a VAD stop event
+         * occurs.
          */
-        eagerness?: 'low' | 'medium' | 'high' | 'auto';
+        create_response?: boolean;
 
         /**
-         * Optional idle timeout after which turn detection will auto-timeout when no
-         * additional audio is received and emits a `timeout_triggered` event.
+         * Optional timeout after which a model response will be triggered automatically.
+         * This is useful for situations in which a long pause from the user is unexpected,
+         * such as a phone call. The model will effectively prompt the user to continue the
+         * conversation based on the current context.
+         *
+         * The timeout value will be applied after the last model response's audio has
+         * finished playing, i.e. it's set to the `response.done` time plus audio playback
+         * duration.
+         *
+         * An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+         * Response) will be emitted when the timeout is reached. Idle timeout is currently
+         * only supported for `server_vad` mode.
          */
         idle_timeout_ms?: number | null;
 
@@ -269,11 +271,38 @@ export namespace RealtimeSessionCreateResponse {
          * model, and thus might perform better in noisy environments.
          */
         threshold?: number;
+      }
 
+      /**
+       * Server-side semantic turn detection which uses a model to determine when the
+       * user has finished speaking.
+       */
+      export interface SemanticVad {
         /**
-         * Type of turn detection.
+         * Type of turn detection, `semantic_vad` to turn on Semantic VAD.
          */
-        type?: 'server_vad' | 'semantic_vad';
+        type: 'semantic_vad';
+
+        /**
+         * Whether or not to automatically generate a response when a VAD stop event
+         * occurs.
+         */
+        create_response?: boolean;
+
+        /**
+         * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+         * will wait longer for the user to continue speaking, `high` will respond more
+         * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+         * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+         */
+        eagerness?: 'low' | 'medium' | 'high' | 'auto';
+
+        /**
+         * Whether or not to automatically interrupt any ongoing response with output to
+         * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+         * occurs.
+         */
+        interrupt_response?: boolean;
       }
     }
 
diff --git a/src/resources/realtime/realtime.ts b/src/resources/realtime/realtime.ts
index 12f1f6848..3fe58e7be 100644
--- a/src/resources/realtime/realtime.ts
+++ b/src/resources/realtime/realtime.ts
@@ -933,16 +933,33 @@ export interface InputAudioBufferSpeechStoppedEvent {
 }
 
 /**
- * Returned when the server VAD timeout is triggered for the input audio buffer.
+ * Returned when the Server VAD timeout is triggered for the input audio buffer.
+ * This is configured with `idle_timeout_ms` in the `turn_detection` settings of
+ * the session, and it indicates that there hasn't been any speech detected for the
+ * configured duration.
+ *
+ * The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio
+ * after the last model response up to the triggering time, as an offset from the
+ * beginning of audio written to the input audio buffer. This means it demarcates
+ * the segment of audio that was silent and the difference between the start and
+ * end values will roughly match the configured timeout.
+ *
+ * The empty audio will be committed to the conversation as an `input_audio` item
+ * (there will be an `input_audio_buffer.committed` event) and a model response will
+ * be generated. There may be speech that didn't trigger VAD but is still detected
+ * by the model, so the model may respond with something relevant to the
+ * conversation or a prompt to continue speaking.
  */
 export interface InputAudioBufferTimeoutTriggered {
   /**
-   * Millisecond offset where speech ended within the buffered audio.
+   * Millisecond offset of audio written to the input audio buffer at the time the
+   * timeout was triggered.
    */
   audio_end_ms: number;
 
   /**
-   * Millisecond offset where speech started within the buffered audio.
+   * Millisecond offset of audio written to the input audio buffer that was after the
+   * playback time of the last model response.
    */
   audio_start_ms: number;
 
@@ -1154,16 +1171,19 @@ export interface RealtimeAudioConfigInput {
   /**
    * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
    * set to `null` to turn off, in which case the client must manually trigger model
-   * response. Server VAD means that the model will detect the start and end of
-   * speech based on audio volume and respond at the end of user speech. Semantic VAD
-   * is more advanced and uses a turn detection model (in conjunction with VAD) to
-   * semantically estimate whether the user has finished speaking, then dynamically
-   * sets a timeout based on this probability. For example, if user audio trails off
-   * with "uhhm", the model will score a low probability of turn end and wait longer
-   * for the user to continue speaking. This can be useful for more natural
-   * conversations, but may have a higher latency.
-   */
-  turn_detection?: RealtimeAudioInputTurnDetection;
+   * response.
+   *
+   * Server VAD means that the model will detect the start and end of speech based on
+   * audio volume and respond at the end of user speech.
+   *
+   * Semantic VAD is more advanced and uses a turn detection model (in conjunction
+   * with VAD) to semantically estimate whether the user has finished speaking, then
+   * dynamically sets a timeout based on this probability. For example, if user audio
+   * trails off with "uhhm", the model will score a low probability of turn end and
+   * wait longer for the user to continue speaking. This can be useful for more
+   * natural conversations, but may have a higher latency.
+   */
+  turn_detection?: RealtimeAudioInputTurnDetection | null;
 }
 
 export namespace RealtimeAudioConfigInput {
@@ -1269,67 +1289,114 @@ export namespace RealtimeAudioFormats {
 /**
  * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
  * set to `null` to turn off, in which case the client must manually trigger model
- * response. Server VAD means that the model will detect the start and end of
- * speech based on audio volume and respond at the end of user speech. Semantic VAD
- * is more advanced and uses a turn detection model (in conjunction with VAD) to
- * semantically estimate whether the user has finished speaking, then dynamically
- * sets a timeout based on this probability. For example, if user audio trails off
- * with "uhhm", the model will score a low probability of turn end and wait longer
- * for the user to continue speaking. This can be useful for more natural
- * conversations, but may have a higher latency.
+ * response.
+ *
+ * Server VAD means that the model will detect the start and end of speech based on
+ * audio volume and respond at the end of user speech.
+ *
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ * with VAD) to semantically estimate whether the user has finished speaking, then
+ * dynamically sets a timeout based on this probability. For example, if user audio
+ * trails off with "uhhm", the model will score a low probability of turn end and
+ * wait longer for the user to continue speaking. This can be useful for more
+ * natural conversations, but may have a higher latency.
  */
-export interface RealtimeAudioInputTurnDetection {
-  /**
-   * Whether or not to automatically generate a response when a VAD stop event
-   * occurs.
-   */
-  create_response?: boolean;
+export type RealtimeAudioInputTurnDetection =
+  | RealtimeAudioInputTurnDetection.ServerVad
+  | RealtimeAudioInputTurnDetection.SemanticVad;
 
+export namespace RealtimeAudioInputTurnDetection {
   /**
-   * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-   * will wait longer for the user to continue speaking, `high` will respond more
-   * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
-   * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+   * Server-side voice activity detection (VAD) which flips on when user speech is
+   * detected and off after a period of silence.
    */
-  eagerness?: 'low' | 'medium' | 'high' | 'auto';
+  export interface ServerVad {
+    /**
+     * Type of turn detection, `server_vad` to turn on simple Server VAD.
+     */
+    type: 'server_vad';
 
-  /**
-   * Optional idle timeout after which turn detection will auto-timeout when no
-   * additional audio is received and emits a `timeout_triggered` event.
-   */
-  idle_timeout_ms?: number | null;
+    /**
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
+     */
+    create_response?: boolean;
 
-  /**
-   * Whether or not to automatically interrupt any ongoing response with output to
-   * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-   * occurs.
-   */
-  interrupt_response?: boolean;
+    /**
+     * Optional timeout after which a model response will be triggered automatically.
+     * This is useful for situations in which a long pause from the user is unexpected,
+     * such as a phone call. The model will effectively prompt the user to continue the
+     * conversation based on the current context.
+     *
+     * The timeout value will be applied after the last model response's audio has
+     * finished playing, i.e. it's set to the `response.done` time plus audio playback
+     * duration.
+     *
+     * An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+     * Response) will be emitted when the timeout is reached. Idle timeout is currently
+     * only supported for `server_vad` mode.
+     */
+    idle_timeout_ms?: number | null;
 
-  /**
-   * Used only for `server_vad` mode. Amount of audio to include before the VAD
-   * detected speech (in milliseconds). Defaults to 300ms.
-   */
-  prefix_padding_ms?: number;
+    /**
+     * Whether or not to automatically interrupt any ongoing response with output to
+     * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+     * occurs.
+     */
+    interrupt_response?: boolean;
 
-  /**
-   * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
-   * milliseconds). Defaults to 500ms. With shorter values the model will respond
-   * more quickly, but may jump in on short pauses from the user.
-   */
-  silence_duration_ms?: number;
+    /**
+     * Used only for `server_vad` mode. Amount of audio to include before the VAD
+     * detected speech (in milliseconds). Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
 
-  /**
-   * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
-   * defaults to 0.5. A higher threshold will require louder audio to activate the
-   * model, and thus might perform better in noisy environments.
-   */
-  threshold?: number;
+    /**
+     * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+     * milliseconds). Defaults to 500ms. With shorter values the model will respond
+     * more quickly, but may jump in on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+
+    /**
+     * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+     * defaults to 0.5. A higher threshold will require louder audio to activate the
+     * model, and thus might perform better in noisy environments.
+     */
+    threshold?: number;
+  }
 
   /**
-   * Type of turn detection.
+   * Server-side semantic turn detection which uses a model to determine when the
+   * user has finished speaking.
    */
-  type?: 'server_vad' | 'semantic_vad';
+  export interface SemanticVad {
+    /**
+     * Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+     */
+    type: 'semantic_vad';
+
+    /**
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
+     */
+    create_response?: boolean;
+
+    /**
+     * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+     * will wait longer for the user to continue speaking, `high` will respond more
+     * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+     * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+     */
+    eagerness?: 'low' | 'medium' | 'high' | 'auto';
+
+    /**
+     * Whether or not to automatically interrupt any ongoing response with output to
+     * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+     * occurs.
+     */
+    interrupt_response?: boolean;
+  }
 }
 
 /**
@@ -2568,7 +2635,7 @@ export namespace RealtimeServerEvent {
 }
 
 /**
- * Realtime session object.
+ * Realtime session object for the beta interface.
  */
 export interface RealtimeSession {
   /**
@@ -2711,16 +2778,19 @@ export interface RealtimeSession {
   /**
    * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
    * set to `null` to turn off, in which case the client must manually trigger model
-   * response. Server VAD means that the model will detect the start and end of
-   * speech based on audio volume and respond at the end of user speech. Semantic VAD
-   * is more advanced and uses a turn detection model (in conjunction with VAD) to
-   * semantically estimate whether the user has finished speaking, then dynamically
-   * sets a timeout based on this probability. For example, if user audio trails off
-   * with "uhhm", the model will score a low probability of turn end and wait longer
-   * for the user to continue speaking. This can be useful for more natural
-   * conversations, but may have a higher latency.
+   * response.
+   *
+   * Server VAD means that the model will detect the start and end of speech based on
+   * audio volume and respond at the end of user speech.
+   *
+   * Semantic VAD is more advanced and uses a turn detection model (in conjunction
+   * with VAD) to semantically estimate whether the user has finished speaking, then
+   * dynamically sets a timeout based on this probability. For example, if user audio
+   * trails off with "uhhm", the model will score a low probability of turn end and
+   * wait longer for the user to continue speaking. This can be useful for more
+   * natural conversations, but may have a higher latency.
    */
-  turn_detection?: RealtimeSession.TurnDetection | null;
+  turn_detection?: RealtimeSession.ServerVad | RealtimeSession.SemanticVad | null;
 
   /**
    * The voice the model uses to respond. Voice cannot be changed during the session
@@ -2782,34 +2852,34 @@ export namespace RealtimeSession {
   }
 
   /**
-   * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
-   * set to `null` to turn off, in which case the client must manually trigger model
-   * response. Server VAD means that the model will detect the start and end of
-   * speech based on audio volume and respond at the end of user speech. Semantic VAD
-   * is more advanced and uses a turn detection model (in conjunction with VAD) to
-   * semantically estimate whether the user has finished speaking, then dynamically
-   * sets a timeout based on this probability. For example, if user audio trails off
-   * with "uhhm", the model will score a low probability of turn end and wait longer
-   * for the user to continue speaking. This can be useful for more natural
-   * conversations, but may have a higher latency.
+   * Server-side voice activity detection (VAD) which flips on when user speech is
+   * detected and off after a period of silence.
    */
-  export interface TurnDetection {
+  export interface ServerVad {
     /**
-     * Whether or not to automatically generate a response when a VAD stop event
-     * occurs.
+     * Type of turn detection, `server_vad` to turn on simple Server VAD.
      */
-    create_response?: boolean;
+    type: 'server_vad';
 
     /**
-     * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-     * will wait longer for the user to continue speaking, `high` will respond more
-     * quickly. `auto` is the default and is equivalent to `medium`.
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
      */
-    eagerness?: 'low' | 'medium' | 'high' | 'auto';
+    create_response?: boolean;
 
     /**
-     * Optional idle timeout after which turn detection will auto-timeout when no
-     * additional audio is received.
+     * Optional timeout after which a model response will be triggered automatically.
+     * This is useful for situations in which a long pause from the user is unexpected,
+     * such as a phone call. The model will effectively prompt the user to continue the
+     * conversation based on the current context.
+     *
+     * The timeout value will be applied after the last model response's audio has
+     * finished playing, i.e. it's set to the `response.done` time plus audio playback
+     * duration.
+     *
+     * An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+     * Response) will be emitted when the timeout is reached. Idle timeout is currently
+     * only supported for `server_vad` mode.
      */
     idle_timeout_ms?: number | null;
 
@@ -2839,11 +2909,38 @@ export namespace RealtimeSession {
      * model, and thus might perform better in noisy environments.
      */
     threshold?: number;
+  }
+
+  /**
+   * Server-side semantic turn detection which uses a model to determine when the
+   * user has finished speaking.
+   */
+  export interface SemanticVad {
+    /**
+     * Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+     */
+    type: 'semantic_vad';
+
+    /**
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
+     */
+    create_response?: boolean;
+
+    /**
+     * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+     * will wait longer for the user to continue speaking, `high` will respond more
+     * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+     * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+     */
+    eagerness?: 'low' | 'medium' | 'high' | 'auto';
 
     /**
-     * Type of turn detection.
+     * Whether or not to automatically interrupt any ongoing response with output to
+     * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+     * occurs.
      */
-    type?: 'server_vad' | 'semantic_vad';
+    interrupt_response?: boolean;
   }
 }
 
@@ -3194,16 +3291,19 @@ export interface RealtimeTranscriptionSessionAudioInput {
   /**
    * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
    * set to `null` to turn off, in which case the client must manually trigger model
-   * response. Server VAD means that the model will detect the start and end of
-   * speech based on audio volume and respond at the end of user speech. Semantic VAD
-   * is more advanced and uses a turn detection model (in conjunction with VAD) to
-   * semantically estimate whether the user has finished speaking, then dynamically
-   * sets a timeout based on this probability. For example, if user audio trails off
-   * with "uhhm", the model will score a low probability of turn end and wait longer
-   * for the user to continue speaking. This can be useful for more natural
-   * conversations, but may have a higher latency.
-   */
-  turn_detection?: RealtimeTranscriptionSessionAudioInputTurnDetection;
+   * response.
+   *
+   * Server VAD means that the model will detect the start and end of speech based on
+   * audio volume and respond at the end of user speech.
+   *
+   * Semantic VAD is more advanced and uses a turn detection model (in conjunction
+   * with VAD) to semantically estimate whether the user has finished speaking, then
+   * dynamically sets a timeout based on this probability. For example, if user audio
+   * trails off with "uhhm", the model will score a low probability of turn end and
+   * wait longer for the user to continue speaking. This can be useful for more
+   * natural conversations, but may have a higher latency.
+   */
+  turn_detection?: RealtimeTranscriptionSessionAudioInputTurnDetection | null;
 }
 
 export namespace RealtimeTranscriptionSessionAudioInput {
@@ -3227,66 +3327,114 @@ export namespace RealtimeTranscriptionSessionAudioInput {
 /**
  * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
  * set to `null` to turn off, in which case the client must manually trigger model
- * response. Server VAD means that the model will detect the start and end of
- * speech based on audio volume and respond at the end of user speech. Semantic VAD
- * is more advanced and uses a turn detection model (in conjunction with VAD) to
- * semantically estimate whether the user has finished speaking, then dynamically
- * sets a timeout based on this probability. For example, if user audio trails off
- * with "uhhm", the model will score a low probability of turn end and wait longer
- * for the user to continue speaking. This can be useful for more natural
- * conversations, but may have a higher latency.
+ * response.
+ *
+ * Server VAD means that the model will detect the start and end of speech based on
+ * audio volume and respond at the end of user speech.
+ *
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ * with VAD) to semantically estimate whether the user has finished speaking, then
+ * dynamically sets a timeout based on this probability. For example, if user audio
+ * trails off with "uhhm", the model will score a low probability of turn end and
+ * wait longer for the user to continue speaking. This can be useful for more
+ * natural conversations, but may have a higher latency.
  */
-export interface RealtimeTranscriptionSessionAudioInputTurnDetection {
-  /**
-   * Whether or not to automatically generate a response when a VAD stop event
-   * occurs.
-   */
-  create_response?: boolean;
+export type RealtimeTranscriptionSessionAudioInputTurnDetection =
+  | RealtimeTranscriptionSessionAudioInputTurnDetection.ServerVad
+  | RealtimeTranscriptionSessionAudioInputTurnDetection.SemanticVad;
 
+export namespace RealtimeTranscriptionSessionAudioInputTurnDetection {
   /**
-   * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-   * will wait longer for the user to continue speaking, `high` will respond more
-   * quickly. `auto` is the default and is equivalent to `medium`.
+   * Server-side voice activity detection (VAD) which flips on when user speech is
+   * detected and off after a period of silence.
    */
-  eagerness?: 'low' | 'medium' | 'high' | 'auto';
+  export interface ServerVad {
+    /**
+     * Type of turn detection, `server_vad` to turn on simple Server VAD.
+     */
+    type: 'server_vad';
 
-  /**
-   * Optional idle timeout after which turn detection will auto-timeout when no
-   * additional audio is received.
-   */
-  idle_timeout_ms?: number | null;
+    /**
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
+     */
+    create_response?: boolean;
 
-  /**
-   * Whether or not to automatically interrupt any ongoing response with output to
-   * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-   * occurs.
-   */
-  interrupt_response?: boolean;
+    /**
+     * Optional timeout after which a model response will be triggered automatically.
+     * This is useful for situations in which a long pause from the user is unexpected,
+     * such as a phone call. The model will effectively prompt the user to continue the
+     * conversation based on the current context.
+     *
+     * The timeout value will be applied after the last model response's audio has
+     * finished playing, i.e. it's set to the `response.done` time plus audio playback
+     * duration.
+     *
+     * An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+     * Response) will be emitted when the timeout is reached. Idle timeout is currently
+     * only supported for `server_vad` mode.
+     */
+    idle_timeout_ms?: number | null;
 
-  /**
-   * Used only for `server_vad` mode. Amount of audio to include before the VAD
-   * detected speech (in milliseconds). Defaults to 300ms.
-   */
-  prefix_padding_ms?: number;
+    /**
+     * Whether or not to automatically interrupt any ongoing response with output to
+     * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+     * occurs.
+     */
+    interrupt_response?: boolean;
 
-  /**
-   * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
-   * milliseconds). Defaults to 500ms. With shorter values the model will respond
-   * more quickly, but may jump in on short pauses from the user.
-   */
-  silence_duration_ms?: number;
+    /**
+     * Used only for `server_vad` mode. Amount of audio to include before the VAD
+     * detected speech (in milliseconds). Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
 
-  /**
-   * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
-   * defaults to 0.5. A higher threshold will require louder audio to activate the
-   * model, and thus might perform better in noisy environments.
-   */
-  threshold?: number;
+    /**
+     * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+     * milliseconds). Defaults to 500ms. With shorter values the model will respond
+     * more quickly, but may jump in on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+
+    /**
+     * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+     * defaults to 0.5. A higher threshold will require louder audio to activate the
+     * model, and thus might perform better in noisy environments.
+     */
+    threshold?: number;
+  }
 
   /**
-   * Type of turn detection.
+   * Server-side semantic turn detection which uses a model to determine when the
+   * user has finished speaking.
    */
-  type?: 'server_vad' | 'semantic_vad';
+  export interface SemanticVad {
+    /**
+     * Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+     */
+    type: 'semantic_vad';
+
+    /**
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
+     */
+    create_response?: boolean;
+
+    /**
+     * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+     * will wait longer for the user to continue speaking, `high` will respond more
+     * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+     * and `high` have max timeouts of 8s, 4s, and 2s respectively.
+     */
+    eagerness?: 'low' | 'medium' | 'high' | 'auto';
+
+    /**
+     * Whether or not to automatically interrupt any ongoing response with output to
+     * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+     * occurs.
+     */
+    interrupt_response?: boolean;
+  }
 }
 
 /**
diff --git a/src/resources/responses/responses.ts b/src/resources/responses/responses.ts
index b3da02889..91e01bed2 100644
--- a/src/resources/responses/responses.ts
+++ b/src/resources/responses/responses.ts
@@ -585,10 +585,10 @@ export interface Response {
   /**
    * The truncation strategy to use for the model response.
    *
-   * - `auto`: If the context of this response and previous ones exceeds the model's
-   *   context window size, the model will truncate the response to fit the context
-   *   window by dropping input items in the middle of the conversation.
-   * - `disabled` (default): If a model response will exceed the context window size
+   * - `auto`: If the input to this Response exceeds the model's context window size,
+   *   the model will truncate the response to fit the context window by dropping
+   *   items from the beginning of the conversation.
+   * - `disabled` (default): If the input size will exceed the context window size
    *   for a model, the request will fail with a 400 error.
    */
   truncation?: 'auto' | 'disabled' | null;
@@ -5455,10 +5455,10 @@ export interface ResponseCreateParamsBase {
   /**
    * The truncation strategy to use for the model response.
    *
-   * - `auto`: If the context of this response and previous ones exceeds the model's
-   *   context window size, the model will truncate the response to fit the context
-   *   window by dropping input items in the middle of the conversation.
-   * - `disabled` (default): If a model response will exceed the context window size
+   * - `auto`: If the input to this Response exceeds the model's context window size,
+   *   the model will truncate the response to fit the context window by dropping
+   *   items from the beginning of the conversation.
+   * - `disabled` (default): If the input size will exceed the context window size
    *   for a model, the request will fail with a 400 error.
    */
   truncation?: 'auto' | 'disabled' | null;

From 18c029ab3bcdc4c4fd6a78c660d4ebd6e52b0bf1 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 18:27:35 +0000
Subject: [PATCH 2/4] codegen metadata

---
 .stats.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.stats.yml b/.stats.yml
index 5388f2463..e38971896 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
-openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-94b1e3cb0bdc616ff0c2f267c33dadd95f133b1f64e647aab6c64afb292b2793.yml
+openapi_spec_hash: 2395319ac9befd59b6536ae7f9564a05
 config_hash: 930dac3aa861344867e4ac84f037b5df

From 836d1b4cdd077c206e1c647c762f4c16e9db444c Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 19:14:44 +0000
Subject: [PATCH 3/4] fix: coerce nullable values to undefined

---
 src/internal/utils/values.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/internal/utils/values.ts b/src/internal/utils/values.ts
index 801974e84..284ff5cde 100644
--- a/src/internal/utils/values.ts
+++ b/src/internal/utils/values.ts
@@ -76,21 +76,21 @@ export const coerceBoolean = (value: unknown): boolean => {
 };
 
 export const maybeCoerceInteger = (value: unknown): number | undefined => {
-  if (value === undefined) {
+  if (value == null) {
     return undefined;
   }
   return coerceInteger(value);
 };
 
 export const maybeCoerceFloat = (value: unknown): number | undefined => {
-  if (value === undefined) {
+  if (value == null) {
     return undefined;
   }
   return coerceFloat(value);
 };
 
 export const maybeCoerceBoolean = (value: unknown): boolean | undefined => {
-  if (value === undefined) {
+  if (value == null) {
     return undefined;
   }
   return coerceBoolean(value);

From cc7ce47f67ff2fcaf59d4e203b75d6c50a522f5a Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Fri, 12 Sep 2025 05:07:27 +0000
Subject: [PATCH 4/4] release: 5.20.2

---
 .release-please-manifest.json |  2 +-
 CHANGELOG.md                  | 13 +++++++++++++
 jsr.json                      |  2 +-
 package.json                  |  2 +-
 src/version.ts                |  2 +-
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 83fac5c78..afa75b89f 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
 {
-  ".": "5.20.1"
+  ".": "5.20.2"
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b0daffdad..80c50ea85 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## 5.20.2 (2025-09-12)
+
+Full Changelog: [v5.20.1...v5.20.2](https://github.com/openai/openai-node/compare/v5.20.1...v5.20.2)
+
+### Bug Fixes
+
+* coerce nullable values to undefined ([836d1b4](https://github.com/openai/openai-node/commit/836d1b4cdd077c206e1c647c762f4c16e9db444c))
+
+
+### Chores
+
+* **api:** Minor docs and type updates for realtime ([ccb00dc](https://github.com/openai/openai-node/commit/ccb00dcbd1466976045aafee152cbc038bb293b9))
+
 ## 5.20.1 (2025-09-10)
 
 Full Changelog: [v5.20.0...v5.20.1](https://github.com/openai/openai-node/compare/v5.20.0...v5.20.1)
diff --git a/jsr.json b/jsr.json
index af3e71220..961bdcde2 100644
--- a/jsr.json
+++ b/jsr.json
@@ -1,6 +1,6 @@
 {
   "name": "@openai/openai",
-  "version": "5.20.1",
+  "version": "5.20.2",
   "exports": {
     ".": "./index.ts",
     "./helpers/zod": "./helpers/zod.ts",
diff --git a/package.json b/package.json
index 7dac7c900..11e6d6361 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "openai",
-  "version": "5.20.1",
+  "version": "5.20.2",
   "description": "The official TypeScript library for the OpenAI API",
   "author": "OpenAI <support@openai.com>",
   "types": "dist/index.d.ts",
diff --git a/src/version.ts b/src/version.ts
index 95318e579..da9ff343d 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1 +1 @@
-export const VERSION = '5.20.1'; // x-release-please-version
+export const VERSION = '5.20.2'; // x-release-please-version