@@ -181,16 +181,19 @@ export namespace RealtimeSessionCreateResponse {
181
181
/**
182
182
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
183
183
* set to `null` to turn off, in which case the client must manually trigger model
184
- * response. Server VAD means that the model will detect the start and end of
185
- * speech based on audio volume and respond at the end of user speech. Semantic VAD
186
- * is more advanced and uses a turn detection model (in conjunction with VAD) to
187
- * semantically estimate whether the user has finished speaking, then dynamically
188
- * sets a timeout based on this probability. For example, if user audio trails off
189
- * with "uhhm", the model will score a low probability of turn end and wait longer
190
- * for the user to continue speaking. This can be useful for more natural
191
- * conversations, but may have a higher latency.
184
+ * response.
185
+ *
186
+ * Server VAD means that the model will detect the start and end of speech based on
187
+ * audio volume and respond at the end of user speech.
188
+ *
189
+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction
190
+ * with VAD) to semantically estimate whether the user has finished speaking, then
191
+ * dynamically sets a timeout based on this probability. For example, if user audio
192
+ * trails off with "uhhm", the model will score a low probability of turn end and
193
+ * wait longer for the user to continue speaking. This can be useful for more
194
+ * natural conversations, but may have a higher latency.
192
195
*/
193
- turn_detection ?: Input . TurnDetection ;
196
+ turn_detection ?: Input . ServerVad | Input . SemanticVad | null ;
194
197
}
195
198
196
199
export namespace Input {
@@ -211,35 +214,34 @@ export namespace RealtimeSessionCreateResponse {
211
214
}
212
215
213
216
/**
214
- * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
215
- * set to `null` to turn off, in which case the client must manually trigger model
216
- * response. Server VAD means that the model will detect the start and end of
217
- * speech based on audio volume and respond at the end of user speech. Semantic VAD
218
- * is more advanced and uses a turn detection model (in conjunction with VAD) to
219
- * semantically estimate whether the user has finished speaking, then dynamically
220
- * sets a timeout based on this probability. For example, if user audio trails off
221
- * with "uhhm", the model will score a low probability of turn end and wait longer
222
- * for the user to continue speaking. This can be useful for more natural
223
- * conversations, but may have a higher latency.
217
+ * Server-side voice activity detection (VAD) which flips on when user speech is
218
+ * detected and off after a period of silence.
224
219
*/
225
- export interface TurnDetection {
220
+ export interface ServerVad {
226
221
/**
227
- * Whether or not to automatically generate a response when a VAD stop event
228
- * occurs.
222
+ * Type of turn detection, `server_vad` to turn on simple Server VAD.
229
223
*/
230
- create_response ?: boolean ;
224
+ type : 'server_vad' ;
231
225
232
226
/**
233
- * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
234
- * will wait longer for the user to continue speaking, `high` will respond more
235
- * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
236
- * and `high` have max timeouts of 8s, 4s, and 2s respectively.
227
+ * Whether or not to automatically generate a response when a VAD stop event
228
+ * occurs.
237
229
*/
238
- eagerness ?: 'low' | 'medium' | 'high' | 'auto' ;
230
+ create_response ?: boolean ;
239
231
240
232
/**
241
- * Optional idle timeout after which turn detection will auto-timeout when no
242
- * additional audio is received and emits a `timeout_triggered` event.
233
+ * Optional timeout after which a model response will be triggered automatically.
234
+ * This is useful for situations in which a long pause from the user is unexpected,
235
+ * such as a phone call. The model will effectively prompt the user to continue the
236
+ * conversation based on the current context.
237
+ *
238
+ * The timeout value will be applied after the last model response's audio has
239
+ * finished playing, i.e. it's set to the `response.done` time plus audio playback
240
+ * duration.
241
+ *
242
+ * An `input_audio_buffer.timeout_triggered` event (plus events associated with the
243
+ * Response) will be emitted when the timeout is reached. Idle timeout is currently
244
+ * only supported for `server_vad` mode.
243
245
*/
244
246
idle_timeout_ms ?: number | null ;
245
247
@@ -269,11 +271,38 @@ export namespace RealtimeSessionCreateResponse {
269
271
* model, and thus might perform better in noisy environments.
270
272
*/
271
273
threshold ?: number ;
274
+ }
272
275
276
+ /**
277
+ * Server-side semantic turn detection which uses a model to determine when the
278
+ * user has finished speaking.
279
+ */
280
+ export interface SemanticVad {
273
281
/**
274
- * Type of turn detection.
282
+ * Type of turn detection, `semantic_vad` to turn on Semantic VAD.
275
283
*/
276
- type ?: 'server_vad' | 'semantic_vad' ;
284
+ type : 'semantic_vad' ;
285
+
286
+ /**
287
+ * Whether or not to automatically generate a response when a VAD stop event
288
+ * occurs.
289
+ */
290
+ create_response ?: boolean ;
291
+
292
+ /**
293
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
294
+ * will wait longer for the user to continue speaking, `high` will respond more
295
+ * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
296
+ * and `high` have max timeouts of 8s, 4s, and 2s respectively.
297
+ */
298
+ eagerness ?: 'low' | 'medium' | 'high' | 'auto' ;
299
+
300
+ /**
301
+ * Whether or not to automatically interrupt any ongoing response with output to
302
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
303
+ * occurs.
304
+ */
305
+ interrupt_response ?: boolean ;
277
306
}
278
307
}
279
308
0 commit comments