diff --git a/README.md b/README.md index 7f7d539..e521f28 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,11 @@ One of the key session-wide settings is `turn_detection`, which controls how dat - `server_vad` will evaluate incoming user audio (as sent via `input_audio_buffer.append`) using a voice activity detector (VAD) component and automatically use that audio to initiate response generation on applicable conversations when an end of speech is detected. Silence detection for the VAD can be configured when specifying `server_vad` detection mode. - `none` will rely on caller-initiated `input_audio_buffer.commit` and `response.create` commands to progress conversations and produce output. This is useful for push-to-talk applications or situations that have external audio flow control (such as caller-side VAD component). Note that these manual signals can be still be used in `server_vad` mode to supplement VAD-initiated response generation. -Transcription of user input audio is opted into via the `input_audio_transcription` property; specifying a transcription model (`whisper-1`) in this configuration will enable the delivery of `conversation.item.audio_transcription.completed` events. +Transcription of user input audio is opted into via the `input_audio_transcription` property; specifying a transcription model (`whisper-1`, `gpt-4o-mini-transcribe`, or `gpt-4o-transcribe`) in this configuration will enable the delivery of `conversation.item.audio_transcription.completed` events. + +Additionally, noise reduction can be configured using the `input_audio_noise_reduction` property. Specify the type of noise reduction using the `type` field: +- Use `near_field` for close-talking microphones such as headphones. +- Use `far_field` for far-field microphones such as laptop or conference room microphones. An example `session.update` that configures several aspects of the session, including tools, follows. 
Note that all session parameters are optional; not everything needs to be configured! @@ -114,6 +118,7 @@ An example `session.update` that configures several aspects of the session, incl "silence_duration_ms": 600, "type": "server_vad" }, + "input_audio_noise_reduction": { "type": "near_field" }, "tools": [ { "type": "function", diff --git a/javascript/standalone/src/models.ts b/javascript/standalone/src/models.ts index 27a1fc1..d25dc16 100644 --- a/javascript/standalone/src/models.ts +++ b/javascript/standalone/src/models.ts @@ -38,7 +38,9 @@ export type MessageRole = "system" | "assistant" | "user"; export interface InputAudioTranscription { model: "whisper-1"; } - +export interface NoiseReduction { + type: "none" | "near_field" | "far_field"; +} export interface ClientMessageBase { event_id?: string; } @@ -58,6 +60,7 @@ export interface SessionUpdateParams { tool_choice?: ToolChoice; temperature?: number; max_response_output_tokens?: number; + input_audio_noise_reduction?: NoiseReduction; } export interface SessionUpdateMessage extends ClientMessageBase { @@ -182,6 +185,7 @@ export interface ResponseCreateParams { tools?: ToolsDefinition; tool_choice?: ToolChoice; output_audio_format?: AudioFormat; + input_audio_noise_reduction?: NoiseReduction; } export interface ResponseCreateMessage extends ClientMessageBase { @@ -224,6 +228,7 @@ export interface Session { tool_choice: ToolChoice; temperature: number; max_response_output_tokens?: number; + input_audio_noise_reduction?: NoiseReduction; } export interface SessionCreatedMessage extends ServerMessageBase { diff --git a/python/rtclient/models.py b/python/rtclient/models.py index cdef1ce..5fe2547 100644 --- a/python/rtclient/models.py +++ b/python/rtclient/models.py @@ -17,7 +17,7 @@ Voice = Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"] AudioFormat = Literal["pcm16", "g711-ulaw", "g711-alaw"] Modality = Literal["text", "audio"] - +NoiseReduction = Literal["none", "near_field", "far_field"] class 
NoTurnDetection(ModelWithDefaults): type: Literal["none"] = "none" @@ -71,7 +71,7 @@ class SessionUpdateParams(BaseModel): tool_choice: Optional[ToolChoice] = None temperature: Optional[Temperature] = None max_response_output_tokens: Optional[MaxTokensType] = None - + input_audio_noise_reduction: Optional[NoiseReduction] = None class SessionUpdateMessage(ClientMessageBase): """ @@ -226,7 +226,7 @@ class ResponseCreateParams(BaseModel): tools: Optional[ToolsDefinition] = None tool_choice: Optional[ToolChoice] = None output_audio_format: Optional[AudioFormat] = None - + input_audio_noise_reduction: Optional[NoiseReduction] = None class ResponseCreateMessage(ClientMessageBase): """ @@ -272,7 +272,7 @@ class Session(BaseModel): tool_choice: ToolChoice temperature: Temperature max_response_output_tokens: Optional[MaxTokensType] - + input_audio_noise_reduction: Optional[NoiseReduction] = None class SessionCreatedMessage(ServerMessageBase): type: Literal["session.created"] = "session.created"