Skip to content

Commit 24829f6

Browse files
chore(api): Minor docs and type updates for realtime
1 parent c50a697 commit 24829f6

File tree

4 files changed

+371
-194
lines changed

4 files changed

+371
-194
lines changed

.stats.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 118
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
3-
openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
3+
openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
44
config_hash: 930dac3aa861344867e4ac84f037b5df

src/resources/realtime/client-secrets.ts

Lines changed: 61 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -181,16 +181,19 @@ export namespace RealtimeSessionCreateResponse {
181181
/**
182182
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
183183
* set to `null` to turn off, in which case the client must manually trigger model
184-
* response. Server VAD means that the model will detect the start and end of
185-
* speech based on audio volume and respond at the end of user speech. Semantic VAD
186-
* is more advanced and uses a turn detection model (in conjunction with VAD) to
187-
* semantically estimate whether the user has finished speaking, then dynamically
188-
* sets a timeout based on this probability. For example, if user audio trails off
189-
* with "uhhm", the model will score a low probability of turn end and wait longer
190-
* for the user to continue speaking. This can be useful for more natural
191-
* conversations, but may have a higher latency.
184+
* response.
185+
*
186+
* Server VAD means that the model will detect the start and end of speech based on
187+
* audio volume and respond at the end of user speech.
188+
*
189+
* Semantic VAD is more advanced and uses a turn detection model (in conjunction
190+
* with VAD) to semantically estimate whether the user has finished speaking, then
191+
* dynamically sets a timeout based on this probability. For example, if user audio
192+
* trails off with "uhhm", the model will score a low probability of turn end and
193+
* wait longer for the user to continue speaking. This can be useful for more
194+
* natural conversations, but may have a higher latency.
192195
*/
193-
turn_detection?: Input.TurnDetection;
196+
turn_detection?: Input.ServerVad | Input.SemanticVad | null;
194197
}
195198

196199
export namespace Input {
@@ -211,35 +214,34 @@ export namespace RealtimeSessionCreateResponse {
211214
}
212215

213216
/**
214-
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
215-
* set to `null` to turn off, in which case the client must manually trigger model
216-
* response. Server VAD means that the model will detect the start and end of
217-
* speech based on audio volume and respond at the end of user speech. Semantic VAD
218-
* is more advanced and uses a turn detection model (in conjunction with VAD) to
219-
* semantically estimate whether the user has finished speaking, then dynamically
220-
* sets a timeout based on this probability. For example, if user audio trails off
221-
* with "uhhm", the model will score a low probability of turn end and wait longer
222-
* for the user to continue speaking. This can be useful for more natural
223-
* conversations, but may have a higher latency.
217+
* Server-side voice activity detection (VAD) which flips on when user speech is
218+
* detected and off after a period of silence.
224219
*/
225-
export interface TurnDetection {
220+
export interface ServerVad {
226221
/**
227-
* Whether or not to automatically generate a response when a VAD stop event
228-
* occurs.
222+
* Type of turn detection, `server_vad` to turn on simple Server VAD.
229223
*/
230-
create_response?: boolean;
224+
type: 'server_vad';
231225

232226
/**
233-
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
234-
* will wait longer for the user to continue speaking, `high` will respond more
235-
* quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
236-
* and `high` have max timeouts of 8s, 4s, and 2s respectively.
227+
* Whether or not to automatically generate a response when a VAD stop event
228+
* occurs.
237229
*/
238-
eagerness?: 'low' | 'medium' | 'high' | 'auto';
230+
create_response?: boolean;
239231

240232
/**
241-
* Optional idle timeout after which turn detection will auto-timeout when no
242-
* additional audio is received and emits a `timeout_triggered` event.
233+
* Optional timeout after which a model response will be triggered automatically.
234+
* This is useful for situations in which a long pause from the user is unexpected,
235+
* such as a phone call. The model will effectively prompt the user to continue the
236+
* conversation based on the current context.
237+
*
238+
* The timeout value will be applied after the last model response's audio has
239+
* finished playing, i.e. it's set to the `response.done` time plus audio playback
240+
* duration.
241+
*
242+
* An `input_audio_buffer.timeout_triggered` event (plus events associated with the
243+
* Response) will be emitted when the timeout is reached. Idle timeout is currently
244+
* only supported for `server_vad` mode.
243245
*/
244246
idle_timeout_ms?: number | null;
245247

@@ -269,11 +271,38 @@ export namespace RealtimeSessionCreateResponse {
269271
* model, and thus might perform better in noisy environments.
270272
*/
271273
threshold?: number;
274+
}
272275

276+
/**
277+
* Server-side semantic turn detection which uses a model to determine when the
278+
* user has finished speaking.
279+
*/
280+
export interface SemanticVad {
273281
/**
274-
* Type of turn detection.
282+
* Type of turn detection, `semantic_vad` to turn on Semantic VAD.
275283
*/
276-
type?: 'server_vad' | 'semantic_vad';
284+
type: 'semantic_vad';
285+
286+
/**
287+
* Whether or not to automatically generate a response when a VAD stop event
288+
* occurs.
289+
*/
290+
create_response?: boolean;
291+
292+
/**
293+
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
294+
* will wait longer for the user to continue speaking, `high` will respond more
295+
* quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
296+
* and `high` have max timeouts of 8s, 4s, and 2s respectively.
297+
*/
298+
eagerness?: 'low' | 'medium' | 'high' | 'auto';
299+
300+
/**
301+
* Whether or not to automatically interrupt any ongoing response with output to
302+
* the default conversation (i.e. `conversation` of `auto`) when a VAD start event
303+
* occurs.
304+
*/
305+
interrupt_response?: boolean;
277306
}
278307
}
279308

0 commit comments

Comments
 (0)