Commit 63ad148

chore(api): Minor docs and type updates for realtime
Parent: cdf1a47

30 files changed (+4364 −2805 lines)

.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
-openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
+openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
 config_hash: 930dac3aa861344867e4ac84f037b5df

openai-java-core/src/main/kotlin/com/openai/models/realtime/InputAudioBufferTimeoutTriggered.kt

Lines changed: 27 additions & 5 deletions
@@ -15,7 +15,21 @@ import com.openai.errors.OpenAIInvalidDataException
 import java.util.Collections
 import java.util.Objects
 
-/** Returned when the server VAD timeout is triggered for the input audio buffer. */
+/**
+ * Returned when the Server VAD timeout is triggered for the input audio buffer. This is configured
+ * with `idle_timeout_ms` in the `turn_detection` settings of the session, and it indicates that
+ * there hasn't been any speech detected for the configured duration.
+ *
+ * The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio after the last model
+ * response up to the triggering time, as an offset from the beginning of audio written to the input
+ * audio buffer. This means it demarcates the segment of audio that was silent and the difference
+ * between the start and end values will roughly match the configured timeout.
+ *
+ * The empty audio will be committed to the conversation as an `input_audio` item (there will be a
+ * `input_audio_buffer.committed` event) and a model response will be generated. There may be speech
+ * that didn't trigger VAD but is still detected by the model, so the model may respond with
+ * something relevant to the conversation or a prompt to continue speaking.
+ */
 class InputAudioBufferTimeoutTriggered
 private constructor(
     private val audioEndMs: JsonField<Long>,
@@ -40,15 +54,17 @@ private constructor(
 ) : this(audioEndMs, audioStartMs, eventId, itemId, type, mutableMapOf())
 
 /**
- * Millisecond offset where speech ended within the buffered audio.
+ * Millisecond offset of audio written to the input audio buffer at the time the timeout was
+ * triggered.
  *
  * @throws OpenAIInvalidDataException if the JSON field has an unexpected type or is
  * unexpectedly missing or null (e.g. if the server responded with an unexpected value).
  */
 fun audioEndMs(): Long = audioEndMs.getRequired("audio_end_ms")
 
 /**
- * Millisecond offset where speech started within the buffered audio.
+ * Millisecond offset of audio written to the input audio buffer that was after the playback
+ * time of the last model response.
  *
  * @throws OpenAIInvalidDataException if the JSON field has an unexpected type or is
  * unexpectedly missing or null (e.g. if the server responded with an unexpected value).
@@ -165,7 +181,10 @@ private constructor(
     inputAudioBufferTimeoutTriggered.additionalProperties.toMutableMap()
 }
 
-/** Millisecond offset where speech ended within the buffered audio. */
+/**
+ * Millisecond offset of audio written to the input audio buffer at the time the timeout was
+ * triggered.
+ */
 fun audioEndMs(audioEndMs: Long) = audioEndMs(JsonField.of(audioEndMs))
 
 /**
@@ -176,7 +195,10 @@ private constructor(
 */
 fun audioEndMs(audioEndMs: JsonField<Long>) = apply { this.audioEndMs = audioEndMs }
 
-/** Millisecond offset where speech started within the buffered audio. */
+/**
+ * Millisecond offset of audio written to the input audio buffer that was after the playback
+ * time of the last model response.
+ */
 fun audioStartMs(audioStartMs: Long) = audioStartMs(JsonField.of(audioStartMs))
 
 /**
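
The updated Javadoc pins down the arithmetic: `audio_end_ms − audio_start_ms` is the silent span, and it should roughly match the configured `idle_timeout_ms`. A minimal Kotlin sketch of a consumer of this event; the handler function and logging are illustrative, and only the model class and its accessors (`audioStartMs`, `audioEndMs`, `itemId`) come from this diff:

```kotlin
import com.openai.models.realtime.InputAudioBufferTimeoutTriggered

// Hypothetical handler; the function name and wiring are not part of this diff.
fun onTimeoutTriggered(event: InputAudioBufferTimeoutTriggered) {
    // Both offsets are measured from the start of audio written to the input
    // audio buffer, and cover the span after the last model response.
    val silentMs = event.audioEndMs() - event.audioStartMs()
    // Per the new docs, silentMs should roughly equal idle_timeout_ms; the
    // silent segment is committed as an input_audio item and a model
    // response is generated automatically.
    println("No speech for ~${silentMs}ms; committed as item ${event.itemId()}")
}
```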

openai-java-core/src/main/kotlin/com/openai/models/realtime/RealtimeAudioConfigInput.kt

Lines changed: 41 additions & 18 deletions
@@ -76,14 +76,16 @@ private constructor(
 
 /**
  * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null`
- * to turn off, in which case the client must manually trigger model response. Server VAD means
- * that the model will detect the start and end of speech based on audio volume and respond at
- * the end of user speech. Semantic VAD is more advanced and uses a turn detection model (in
- * conjunction with VAD) to semantically estimate whether the user has finished speaking, then
- * dynamically sets a timeout based on this probability. For example, if user audio trails off
- * with "uhhm", the model will score a low probability of turn end and wait longer for the user
- * to continue speaking. This can be useful for more natural conversations, but may have a
- * higher latency.
+ * to turn off, in which case the client must manually trigger model response.
+ *
+ * Server VAD means that the model will detect the start and end of speech based on audio volume
+ * and respond at the end of user speech.
+ *
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically sets a timeout
+ * based on this probability. For example, if user audio trails off with "uhhm", the model will
+ * score a low probability of turn end and wait longer for the user to continue speaking. This
+ * can be useful for more natural conversations, but may have a higher latency.
  *
  * @throws OpenAIInvalidDataException if the JSON field has an unexpected type (e.g. if the
  * server responded with an unexpected value).
@@ -230,17 +232,24 @@ private constructor(
 
 /**
  * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to
- * `null` to turn off, in which case the client must manually trigger model response. Server
- * VAD means that the model will detect the start and end of speech based on audio volume
- * and respond at the end of user speech. Semantic VAD is more advanced and uses a turn
- * detection model (in conjunction with VAD) to semantically estimate whether the user has
- * finished speaking, then dynamically sets a timeout based on this probability. For
- * example, if user audio trails off with "uhhm", the model will score a low probability of
- * turn end and wait longer for the user to continue speaking. This can be useful for more
- * natural conversations, but may have a higher latency.
+ * `null` to turn off, in which case the client must manually trigger model response.
+ *
+ * Server VAD means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ *
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD)
+ * to semantically estimate whether the user has finished speaking, then dynamically sets a
+ * timeout based on this probability. For example, if user audio trails off with "uhhm", the
+ * model will score a low probability of turn end and wait longer for the user to continue
+ * speaking. This can be useful for more natural conversations, but may have a higher
+ * latency.
  */
-fun turnDetection(turnDetection: RealtimeAudioInputTurnDetection) =
-    turnDetection(JsonField.of(turnDetection))
+fun turnDetection(turnDetection: RealtimeAudioInputTurnDetection?) =
+    turnDetection(JsonField.ofNullable(turnDetection))
+
+/** Alias for calling [Builder.turnDetection] with `turnDetection.orElse(null)`. */
+fun turnDetection(turnDetection: Optional<RealtimeAudioInputTurnDetection>) =
+    turnDetection(turnDetection.getOrNull())
 
 /**
  * Sets [Builder.turnDetection] to an arbitrary JSON value.
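
This signature change is the functional part of the commit: the setter now accepts a nullable value and stores it with `JsonField.ofNullable`, so callers can send an explicit `"turn_detection": null` to disable turn detection, as the doc comment describes. A hedged sketch of both new call shapes, assuming the SDK's usual `builder()`/`build()` pattern on `RealtimeAudioConfigInput`:

```kotlin
import java.util.Optional
import com.openai.models.realtime.RealtimeAudioConfigInput
import com.openai.models.realtime.RealtimeAudioInputTurnDetection

// Passing null now serializes an explicit `"turn_detection": null`, turning
// detection off; the client must then trigger model responses manually.
val manualTurns: RealtimeAudioConfigInput =
    RealtimeAudioConfigInput.builder()
        .turnDetection(null as RealtimeAudioInputTurnDetection?) // cast selects the nullable overload
        .build()

// Equivalent via the Optional alias added in this diff.
val manualTurnsViaOptional: RealtimeAudioConfigInput =
    RealtimeAudioConfigInput.builder()
        .turnDetection(Optional.empty<RealtimeAudioInputTurnDetection>())
        .build()
```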
@@ -253,6 +262,20 @@ private constructor(
     this.turnDetection = turnDetection
 }
 
+/**
+ * Alias for calling [turnDetection] with
+ * `RealtimeAudioInputTurnDetection.ofServerVad(serverVad)`.
+ */
+fun turnDetection(serverVad: RealtimeAudioInputTurnDetection.ServerVad) =
+    turnDetection(RealtimeAudioInputTurnDetection.ofServerVad(serverVad))
+
+/**
+ * Alias for calling [turnDetection] with
+ * `RealtimeAudioInputTurnDetection.ofSemanticVad(semanticVad)`.
+ */
+fun turnDetection(semanticVad: RealtimeAudioInputTurnDetection.SemanticVad) =
+    turnDetection(RealtimeAudioInputTurnDetection.ofSemanticVad(semanticVad))
+
 fun additionalProperties(additionalProperties: Map<String, JsonValue>) = apply {
     this.additionalProperties.clear()
     putAllAdditionalProperties(additionalProperties)
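
These two overloads let callers pass a `ServerVad` or `SemanticVad` config directly instead of wrapping it in `RealtimeAudioInputTurnDetection.ofServerVad(...)` themselves. A sketch of the server-VAD variant; note that `ServerVad.builder()` and the `idleTimeoutMs` setter are assumed names, inferred from the `idle_timeout_ms` setting described in `InputAudioBufferTimeoutTriggered` rather than shown in this diff:

```kotlin
import com.openai.models.realtime.RealtimeAudioConfigInput
import com.openai.models.realtime.RealtimeAudioInputTurnDetection

// The ofServerVad() wrapping now happens inside the builder overload.
// ServerVad.builder() and idleTimeoutMs(...) are assumptions, not from the diff.
val serverVadInput: RealtimeAudioConfigInput =
    RealtimeAudioConfigInput.builder()
        .turnDetection(
            RealtimeAudioInputTurnDetection.ServerVad.builder()
                .idleTimeoutMs(5_000L) // timeout event after ~5 s without speech
                .build()
        )
        .build()
```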
