Commit 63ad148

chore(api): Minor docs and type updates for realtime
Parent: cdf1a47

30 files changed (+4364 −2805 lines)

.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
-openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
+openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
 config_hash: 930dac3aa861344867e4ac84f037b5df

openai-java-core/src/main/kotlin/com/openai/models/realtime/InputAudioBufferTimeoutTriggered.kt

Lines changed: 27 additions & 5 deletions
@@ -15,7 +15,21 @@ import com.openai.errors.OpenAIInvalidDataException
 import java.util.Collections
 import java.util.Objects
 
-/** Returned when the server VAD timeout is triggered for the input audio buffer. */
+/**
+ * Returned when the Server VAD timeout is triggered for the input audio buffer. This is configured
+ * with `idle_timeout_ms` in the `turn_detection` settings of the session, and it indicates that
+ * there hasn't been any speech detected for the configured duration.
+ *
+ * The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio after the last model
+ * response up to the triggering time, as an offset from the beginning of audio written to the input
+ * audio buffer. This means it demarcates the segment of audio that was silent and the difference
+ * between the start and end values will roughly match the configured timeout.
+ *
+ * The empty audio will be committed to the conversation as an `input_audio` item (there will be a
+ * `input_audio_buffer.committed` event) and a model response will be generated. There may be speech
+ * that didn't trigger VAD but is still detected by the model, so the model may respond with
+ * something relevant to the conversation or a prompt to continue speaking.
+ */
 class InputAudioBufferTimeoutTriggered
 private constructor(
     private val audioEndMs: JsonField<Long>,
@@ -40,15 +54,17 @@ private constructor(
 ) : this(audioEndMs, audioStartMs, eventId, itemId, type, mutableMapOf())
 
 /**
- * Millisecond offset where speech ended within the buffered audio.
+ * Millisecond offset of audio written to the input audio buffer at the time the timeout was
+ * triggered.
  *
  * @throws OpenAIInvalidDataException if the JSON field has an unexpected type or is
  * unexpectedly missing or null (e.g. if the server responded with an unexpected value).
  */
 fun audioEndMs(): Long = audioEndMs.getRequired("audio_end_ms")
 
 /**
- * Millisecond offset where speech started within the buffered audio.
+ * Millisecond offset of audio written to the input audio buffer that was after the playback
+ * time of the last model response.
  *
  * @throws OpenAIInvalidDataException if the JSON field has an unexpected type or is
  * unexpectedly missing or null (e.g. if the server responded with an unexpected value).
@@ -165,7 +181,10 @@ private constructor(
     inputAudioBufferTimeoutTriggered.additionalProperties.toMutableMap()
 }
 
-/** Millisecond offset where speech ended within the buffered audio. */
+/**
+ * Millisecond offset of audio written to the input audio buffer at the time the timeout was
+ * triggered.
+ */
 fun audioEndMs(audioEndMs: Long) = audioEndMs(JsonField.of(audioEndMs))
 
 /**
@@ -176,7 +195,10 @@ private constructor(
 */
 fun audioEndMs(audioEndMs: JsonField<Long>) = apply { this.audioEndMs = audioEndMs }
 
-/** Millisecond offset where speech started within the buffered audio. */
+/**
+ * Millisecond offset of audio written to the input audio buffer that was after the playback
+ * time of the last model response.
+ */
 fun audioStartMs(audioStartMs: Long) = audioStartMs(JsonField.of(audioStartMs))
 
 /**
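
The updated Javadoc pins down the arithmetic: `audio_end_ms − audio_start_ms` is the silent span, and it should roughly match the configured `idle_timeout_ms`. A minimal Kotlin sketch of a consumer of this event; the handler function and logging are illustrative, and only the model class and its accessors (`audioStartMs`, `audioEndMs`, `itemId`) come from this diff:

```kotlin
import com.openai.models.realtime.InputAudioBufferTimeoutTriggered

// Hypothetical handler; the function name and wiring are not part of this diff.
fun onTimeoutTriggered(event: InputAudioBufferTimeoutTriggered) {
    // Both offsets are measured from the start of audio written to the input
    // audio buffer, and cover the span after the last model response.
    val silentMs = event.audioEndMs() - event.audioStartMs()
    // Per the new docs, silentMs should roughly equal idle_timeout_ms; the
    // silent segment is committed as an input_audio item and a model
    // response is generated automatically.
    println("No speech for ~${silentMs}ms; committed as item ${event.itemId()}")
}
```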

openai-java-core/src/main/kotlin/com/openai/models/realtime/RealtimeAudioConfigInput.kt

Lines changed: 41 additions & 18 deletions
@@ -76,14 +76,16 @@ private constructor(
 
 /**
  * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null`
- * to turn off, in which case the client must manually trigger model response. Server VAD means
- * that the model will detect the start and end of speech based on audio volume and respond at
- * the end of user speech. Semantic VAD is more advanced and uses a turn detection model (in
- * conjunction with VAD) to semantically estimate whether the user has finished speaking, then
- * dynamically sets a timeout based on this probability. For example, if user audio trails off
- * with "uhhm", the model will score a low probability of turn end and wait longer for the user
- * to continue speaking. This can be useful for more natural conversations, but may have a
- * higher latency.
+ * to turn off, in which case the client must manually trigger model response.
+ *
+ * Server VAD means that the model will detect the start and end of speech based on audio volume
+ * and respond at the end of user speech.
+ *
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to
+ * semantically estimate whether the user has finished speaking, then dynamically sets a timeout
+ * based on this probability. For example, if user audio trails off with "uhhm", the model will
+ * score a low probability of turn end and wait longer for the user to continue speaking. This
+ * can be useful for more natural conversations, but may have a higher latency.
  *
  * @throws OpenAIInvalidDataException if the JSON field has an unexpected type (e.g. if the
  * server responded with an unexpected value).
@@ -230,17 +232,24 @@ private constructor(
 
 /**
  * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to
- * `null` to turn off, in which case the client must manually trigger model response. Server
- * VAD means that the model will detect the start and end of speech based on audio volume
- * and respond at the end of user speech. Semantic VAD is more advanced and uses a turn
- * detection model (in conjunction with VAD) to semantically estimate whether the user has
- * finished speaking, then dynamically sets a timeout based on this probability. For
- * example, if user audio trails off with "uhhm", the model will score a low probability of
- * turn end and wait longer for the user to continue speaking. This can be useful for more
- * natural conversations, but may have a higher latency.
+ * `null` to turn off, in which case the client must manually trigger model response.
+ *
+ * Server VAD means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ *
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD)
+ * to semantically estimate whether the user has finished speaking, then dynamically sets a
+ * timeout based on this probability. For example, if user audio trails off with "uhhm", the
+ * model will score a low probability of turn end and wait longer for the user to continue
+ * speaking. This can be useful for more natural conversations, but may have a higher
+ * latency.
  */
-fun turnDetection(turnDetection: RealtimeAudioInputTurnDetection) =
-    turnDetection(JsonField.of(turnDetection))
+fun turnDetection(turnDetection: RealtimeAudioInputTurnDetection?) =
+    turnDetection(JsonField.ofNullable(turnDetection))
+
+/** Alias for calling [Builder.turnDetection] with `turnDetection.orElse(null)`. */
+fun turnDetection(turnDetection: Optional<RealtimeAudioInputTurnDetection>) =
+    turnDetection(turnDetection.getOrNull())
 
 /**
  * Sets [Builder.turnDetection] to an arbitrary JSON value.
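
This signature change is the functional part of the commit: the setter now accepts a nullable value and stores it with `JsonField.ofNullable`, so callers can send an explicit `"turn_detection": null` to disable turn detection, as the doc comment describes. A hedged sketch of both new call shapes, assuming the SDK's usual `builder()`/`build()` pattern on `RealtimeAudioConfigInput`:

```kotlin
import java.util.Optional
import com.openai.models.realtime.RealtimeAudioConfigInput
import com.openai.models.realtime.RealtimeAudioInputTurnDetection

// Passing null now serializes an explicit `"turn_detection": null`, turning
// detection off; the client must then trigger model responses manually.
val manualTurns: RealtimeAudioConfigInput =
    RealtimeAudioConfigInput.builder()
        .turnDetection(null as RealtimeAudioInputTurnDetection?) // cast selects the nullable overload
        .build()

// Equivalent via the Optional alias added in this diff.
val manualTurnsViaOptional: RealtimeAudioConfigInput =
    RealtimeAudioConfigInput.builder()
        .turnDetection(Optional.empty<RealtimeAudioInputTurnDetection>())
        .build()
```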
@@ -253,6 +262,20 @@ private constructor(
     this.turnDetection = turnDetection
 }
 
+/**
+ * Alias for calling [turnDetection] with
+ * `RealtimeAudioInputTurnDetection.ofServerVad(serverVad)`.
+ */
+fun turnDetection(serverVad: RealtimeAudioInputTurnDetection.ServerVad) =
+    turnDetection(RealtimeAudioInputTurnDetection.ofServerVad(serverVad))
+
+/**
+ * Alias for calling [turnDetection] with
+ * `RealtimeAudioInputTurnDetection.ofSemanticVad(semanticVad)`.
+ */
+fun turnDetection(semanticVad: RealtimeAudioInputTurnDetection.SemanticVad) =
+    turnDetection(RealtimeAudioInputTurnDetection.ofSemanticVad(semanticVad))
+
 fun additionalProperties(additionalProperties: Map<String, JsonValue>) = apply {
     this.additionalProperties.clear()
     putAllAdditionalProperties(additionalProperties)
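
These two overloads let callers pass a `ServerVad` or `SemanticVad` config directly instead of wrapping it in `RealtimeAudioInputTurnDetection.ofServerVad(...)` themselves. A sketch of the server-VAD variant; note that `ServerVad.builder()` and the `idleTimeoutMs` setter are assumed names, inferred from the `idle_timeout_ms` setting described in `InputAudioBufferTimeoutTriggered` rather than shown in this diff:

```kotlin
import com.openai.models.realtime.RealtimeAudioConfigInput
import com.openai.models.realtime.RealtimeAudioInputTurnDetection

// The ofServerVad() wrapping now happens inside the builder overload.
// ServerVad.builder() and idleTimeoutMs(...) are assumptions, not from the diff.
val serverVadInput: RealtimeAudioConfigInput =
    RealtimeAudioConfigInput.builder()
        .turnDetection(
            RealtimeAudioInputTurnDetection.ServerVad.builder()
                .idleTimeoutMs(5_000L) // timeout event after ~5 s without speech
                .build()
        )
        .build()
```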
