Skip to content

Commit 5a56b7a

Browse files
Fix OpenAI Realtime API transcription test (#2127)
Fix OpenAI Realtime API transcription test (#2127)

* Fix OpenAI Realtime API transcription test
  - Add intent=transcription to WebSocket URL for transcription-only sessions
  - Add session.type = transcription in session.update payload
  - Implement audio_to_message method to wrap audio in base64-encoded JSON events
  - Add InputAudioBufferAppend struct for proper audio event serialization
  - Update live.rs to transform audio stream before passing to WebSocket client
  - Add configurable sample rate support (OpenAI requires 24kHz PCM)
  - Add speech_started and speech_stopped event handlers for better debugging
  - Add base64 dependency for audio encoding

  Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>

* Remove unused interleave_audio call in ListenClientDualIO::to_input

  Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 34f8b16 commit 5a56b7a

File tree

7 files changed

+146
-62
lines changed

7 files changed

+146
-62
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

owhisper/owhisper-client/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ tokio = { workspace = true }
1717
tokio-stream = { workspace = true }
1818
ureq = { version = "2", features = ["json"] }
1919

20+
base64 = "0.22.1"
2021
bytes = { workspace = true }
2122
serde = { workspace = true }
2223
serde_json = { workspace = true }

owhisper/owhisper-client/src/adapter/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ pub trait RealtimeSttAdapter: Clone + Default + Send + Sync + 'static {
5656

5757
fn finalize_message(&self) -> Message;
5858

59+
fn audio_to_message(&self, audio: bytes::Bytes) -> Message {
60+
Message::Binary(audio)
61+
}
62+
5963
fn initial_message(
6064
&self,
6165
_api_key: Option<&str>,

owhisper/owhisper-client/src/adapter/openai/live.rs

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ impl RealtimeSttAdapter for OpenAIAdapter {
1616
false
1717
}
1818

19-
fn build_ws_url(&self, api_base: &str, params: &ListenParams, _channels: u8) -> url::Url {
20-
let (mut url, existing_params) =
21-
Self::build_ws_url_from_base(api_base, params.model.as_deref());
19+
fn build_ws_url(&self, api_base: &str, _params: &ListenParams, _channels: u8) -> url::Url {
20+
let (mut url, existing_params) = Self::build_ws_url_from_base(api_base);
2221

2322
if !existing_params.is_empty() {
2423
let mut query_pairs = url.query_pairs_mut();
@@ -38,6 +37,16 @@ impl RealtimeSttAdapter for OpenAIAdapter {
3837
None
3938
}
4039

40+
fn audio_to_message(&self, audio: bytes::Bytes) -> Message {
41+
use base64::Engine;
42+
let base64_audio = base64::engine::general_purpose::STANDARD.encode(&audio);
43+
let event = InputAudioBufferAppend {
44+
event_type: "input_audio_buffer.append".to_string(),
45+
audio: base64_audio,
46+
};
47+
Message::Text(serde_json::to_string(&event).unwrap().into())
48+
}
49+
4150
fn initial_message(
4251
&self,
4352
_api_key: Option<&str>,
@@ -49,7 +58,10 @@ impl RealtimeSttAdapter for OpenAIAdapter {
4958
.first()
5059
.map(|l| l.iso639().code().to_string());
5160

52-
let model = params.model.as_deref().unwrap_or(super::DEFAULT_MODEL);
61+
let model = params
62+
.model
63+
.as_deref()
64+
.unwrap_or(super::DEFAULT_TRANSCRIPTION_MODEL);
5365

5466
let session_config = SessionUpdateEvent {
5567
event_type: "session.update".to_string(),
@@ -59,7 +71,7 @@ impl RealtimeSttAdapter for OpenAIAdapter {
5971
input: Some(AudioInputConfig {
6072
format: Some(AudioFormat {
6173
format_type: "audio/pcm".to_string(),
62-
rate: 24000,
74+
rate: params.sample_rate,
6375
}),
6476
transcription: Some(TranscriptionConfig {
6577
model: model.to_string(),
@@ -78,6 +90,7 @@ impl RealtimeSttAdapter for OpenAIAdapter {
7890
};
7991

8092
let json = serde_json::to_string(&session_config).ok()?;
93+
tracing::debug!(payload = %json, "openai_session_update_payload");
8194
Some(Message::Text(json.into()))
8295
}
8396

@@ -114,6 +127,14 @@ impl RealtimeSttAdapter for OpenAIAdapter {
114127
tracing::debug!("openai_audio_buffer_cleared");
115128
vec![]
116129
}
130+
OpenAIEvent::InputAudioBufferSpeechStarted { item_id } => {
131+
tracing::debug!(item_id = %item_id, "openai_speech_started");
132+
vec![]
133+
}
134+
OpenAIEvent::InputAudioBufferSpeechStopped { item_id } => {
135+
tracing::debug!(item_id = %item_id, "openai_speech_stopped");
136+
vec![]
137+
}
117138
OpenAIEvent::ConversationItemInputAudioTranscriptionCompleted {
118139
item_id,
119140
content_index,
@@ -226,6 +247,13 @@ struct TurnDetection {
226247
silence_duration_ms: Option<u32>,
227248
}
228249

250+
#[derive(Debug, Serialize)]
251+
struct InputAudioBufferAppend {
252+
#[serde(rename = "type")]
253+
event_type: String,
254+
audio: String,
255+
}
256+
229257
#[derive(Debug, Serialize)]
230258
struct InputAudioBufferCommit {
231259
#[serde(rename = "type")]
@@ -243,6 +271,10 @@ enum OpenAIEvent {
243271
InputAudioBufferCommitted { item_id: String },
244272
#[serde(rename = "input_audio_buffer.cleared")]
245273
InputAudioBufferCleared,
274+
#[serde(rename = "input_audio_buffer.speech_started")]
275+
InputAudioBufferSpeechStarted { item_id: String },
276+
#[serde(rename = "input_audio_buffer.speech_stopped")]
277+
InputAudioBufferSpeechStopped { item_id: String },
246278
#[serde(rename = "conversation.item.input_audio_transcription.completed")]
247279
ConversationItemInputAudioTranscriptionCompleted {
248280
item_id: String,
@@ -321,9 +353,11 @@ impl OpenAIAdapter {
321353
#[cfg(test)]
322354
mod tests {
323355
use super::OpenAIAdapter;
324-
use crate::test_utils::{run_dual_test, run_single_test};
356+
use crate::test_utils::{run_dual_test_with_rate, run_single_test_with_rate};
325357
use crate::ListenClient;
326358

359+
const OPENAI_SAMPLE_RATE: u32 = 24000;
360+
327361
#[tokio::test]
328362
#[ignore]
329363
async fn test_build_single() {
@@ -334,11 +368,12 @@ mod tests {
334368
.params(owhisper_interface::ListenParams {
335369
model: Some("gpt-4o-transcribe".to_string()),
336370
languages: vec![hypr_language::ISO639::En.into()],
371+
sample_rate: OPENAI_SAMPLE_RATE,
337372
..Default::default()
338373
})
339374
.build_single();
340375

341-
run_single_test(client, "openai").await;
376+
run_single_test_with_rate(client, "openai", OPENAI_SAMPLE_RATE).await;
342377
}
343378

344379
#[tokio::test]
@@ -351,10 +386,11 @@ mod tests {
351386
.params(owhisper_interface::ListenParams {
352387
model: Some("gpt-4o-transcribe".to_string()),
353388
languages: vec![hypr_language::ISO639::En.into()],
389+
sample_rate: OPENAI_SAMPLE_RATE,
354390
..Default::default()
355391
})
356392
.build_dual();
357393

358-
run_dual_test(client, "openai").await;
394+
run_dual_test_with_rate(client, "openai", OPENAI_SAMPLE_RATE).await;
359395
}
360396
}

owhisper/owhisper-client/src/adapter/openai/mod.rs

Lines changed: 9 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ mod live;
33

44
pub(crate) const DEFAULT_WS_HOST: &str = "api.openai.com";
55
pub(crate) const WS_PATH: &str = "/v1/realtime";
6-
pub(crate) const DEFAULT_MODEL: &str = "gpt-4o-transcribe";
6+
pub(crate) const DEFAULT_TRANSCRIPTION_MODEL: &str = "gpt-4o-transcribe";
77

88
#[derive(Clone, Default)]
99
pub struct OpenAIAdapter;
@@ -21,17 +21,13 @@ impl OpenAIAdapter {
2121
host.contains("openai.com")
2222
}
2323

24-
pub(crate) fn build_ws_url_from_base(
25-
api_base: &str,
26-
model: Option<&str>,
27-
) -> (url::Url, Vec<(String, String)>) {
24+
pub(crate) fn build_ws_url_from_base(api_base: &str) -> (url::Url, Vec<(String, String)>) {
2825
if api_base.is_empty() {
29-
let model = model.unwrap_or(DEFAULT_MODEL);
3026
return (
3127
format!("wss://{}{}", DEFAULT_WS_HOST, WS_PATH)
3228
.parse()
3329
.expect("invalid_default_ws_url"),
34-
vec![("model".to_string(), model.to_string())],
30+
vec![("intent".to_string(), "transcription".to_string())],
3531
);
3632
}
3733

@@ -42,9 +38,8 @@ impl OpenAIAdapter {
4238
let parsed: url::Url = api_base.parse().expect("invalid_api_base");
4339
let mut existing_params = super::extract_query_params(&parsed);
4440

45-
if !existing_params.iter().any(|(k, _)| k == "model") {
46-
let model = model.unwrap_or(DEFAULT_MODEL);
47-
existing_params.push(("model".to_string(), model.to_string()));
41+
if !existing_params.iter().any(|(k, _)| k == "intent") {
42+
existing_params.push(("intent".to_string(), "transcription".to_string()));
4843
}
4944

5045
let host = parsed.host_str().unwrap_or(DEFAULT_WS_HOST);
@@ -64,40 +59,26 @@ mod tests {
6459

6560
#[test]
6661
fn test_build_ws_url_from_base_empty() {
67-
let (url, params) = OpenAIAdapter::build_ws_url_from_base("", None);
62+
let (url, params) = OpenAIAdapter::build_ws_url_from_base("");
6863
assert_eq!(url.as_str(), "wss://api.openai.com/v1/realtime");
6964
assert_eq!(
7065
params,
71-
vec![("model".to_string(), "gpt-4o-transcribe".to_string())]
72-
);
73-
}
74-
75-
#[test]
76-
fn test_build_ws_url_from_base_with_model() {
77-
let (url, params) =
78-
OpenAIAdapter::build_ws_url_from_base("", Some("gpt-4o-mini-realtime-preview"));
79-
assert_eq!(url.as_str(), "wss://api.openai.com/v1/realtime");
80-
assert_eq!(
81-
params,
82-
vec![(
83-
"model".to_string(),
84-
"gpt-4o-mini-realtime-preview".to_string()
85-
)]
66+
vec![("intent".to_string(), "transcription".to_string())]
8667
);
8768
}
8869

8970
#[test]
9071
fn test_build_ws_url_from_base_proxy() {
9172
let (url, params) =
92-
OpenAIAdapter::build_ws_url_from_base("https://api.hyprnote.com?provider=openai", None);
73+
OpenAIAdapter::build_ws_url_from_base("https://api.hyprnote.com?provider=openai");
9374
assert_eq!(url.as_str(), "wss://api.hyprnote.com/listen");
9475
assert_eq!(params, vec![("provider".to_string(), "openai".to_string())]);
9576
}
9677

9778
#[test]
9879
fn test_build_ws_url_from_base_localhost() {
9980
let (url, params) =
100-
OpenAIAdapter::build_ws_url_from_base("http://localhost:8787?provider=openai", None);
81+
OpenAIAdapter::build_ws_url_from_base("http://localhost:8787?provider=openai");
10182
assert_eq!(url.as_str(), "ws://localhost:8787/listen");
10283
assert_eq!(params, vec![("provider".to_string(), "openai".to_string())]);
10384
}

0 commit comments

Comments (0)