
Commit 1bd6144

feat(owhisper): add OpenAI Realtime API adapter (#2126)
* feat(owhisper): add OpenAI Realtime API adapter

  Add OpenAI adapter for real-time speech-to-text transcription using the OpenAI Realtime API. The adapter implements the RealtimeSttAdapter trait and supports:

  - WebSocket connection to wss://api.openai.com/v1/realtime
  - Session configuration for transcription mode
  - Parsing of transcription events (completed, delta, failed)
  - Server-side VAD for turn detection

  Note: The API configuration is still being finalized as there are two session types (realtime vs transcription) with different schemas.

  Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>

* style: apply dprint formatting to OpenAI adapter

  Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>

* fix(owhisper): use params.model for TranscriptionConfig instead of hardcoded value

  - Add DEFAULT_MODEL constant to avoid drift between URL and session config
  - Use params.model with fallback to DEFAULT_MODEL in initial_message
  - Ensures WebSocket URL model and TranscriptionConfig model stay consistent

  Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 301ae71 commit 1bd6144
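For orientation: the session configuration built in initial_message (see the adapter diff below) serializes to a session.update frame shaped roughly like the sketch that follows. This is an illustration written for this summary, not code from the commit; the model and language values are example inputs (the tests in the diff use gpt-4o-transcribe and English), and the exact JSON is produced by the Serialize structs added in the new adapter file.

fn main() {
    // Illustrative sketch of the initial session.update frame the adapter sends
    // over the Realtime WebSocket, mirroring SessionUpdateEvent/SessionConfig below.
    let session_update = serde_json::json!({
        "type": "session.update",
        "session": {
            "type": "transcription",
            "audio": {
                "input": {
                    "format": { "type": "audio/pcm", "rate": 24000 },
                    "transcription": { "model": "gpt-4o-transcribe", "language": "en" },
                    "turn_detection": {
                        "type": "server_vad",
                        "threshold": 0.5,
                        "prefix_padding_ms": 300,
                        "silence_duration_ms": 500
                    }
                }
            },
            "include": ["item.input_audio_transcription.logprobs"]
        }
    });
    println!("{session_update}");
}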

File tree

3 files changed: +475 -0 lines changed


owhisper/owhisper-client/src/adapter/mod.rs

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,7 @@ mod deepgram;
 mod deepgram_compat;
 mod fireworks;
 mod gladia;
+mod openai;
 mod owhisper;
 pub mod parsing;
 mod soniox;
@@ -16,6 +17,7 @@ pub use assemblyai::*;
 pub use deepgram::*;
 pub use fireworks::*;
 pub use gladia::*;
+pub use openai::*;
 pub use soniox::*;
 
 use std::future::Future;
@@ -164,6 +166,7 @@ pub enum AdapterKind {
     Fireworks,
     Deepgram,
     AssemblyAI,
+    OpenAI,
 }
 
 impl AdapterKind {
@@ -182,6 +185,8 @@ impl AdapterKind {
             Self::Soniox
         } else if FireworksAdapter::is_host(base_url) {
            Self::Fireworks
+        } else if OpenAIAdapter::is_host(base_url) {
+            Self::OpenAI
         } else {
             Self::Deepgram
         }
Lines changed: 360 additions & 0 deletions
@@ -0,0 +1,360 @@
use hypr_ws::client::Message;
use owhisper_interface::stream::{Alternatives, Channel, Metadata, StreamResponse};
use owhisper_interface::ListenParams;
use serde::{Deserialize, Serialize};

use super::OpenAIAdapter;
use crate::adapter::parsing::{calculate_time_span, WordBuilder};
use crate::adapter::RealtimeSttAdapter;

impl RealtimeSttAdapter for OpenAIAdapter {
    fn provider_name(&self) -> &'static str {
        "openai"
    }

    fn supports_native_multichannel(&self) -> bool {
        false
    }

    fn build_ws_url(&self, api_base: &str, params: &ListenParams, _channels: u8) -> url::Url {
        let (mut url, existing_params) =
            Self::build_ws_url_from_base(api_base, params.model.as_deref());

        if !existing_params.is_empty() {
            let mut query_pairs = url.query_pairs_mut();
            for (key, value) in &existing_params {
                query_pairs.append_pair(key, value);
            }
        }

        url
    }

    fn build_auth_header(&self, api_key: Option<&str>) -> Option<(&'static str, String)> {
        api_key.map(|key| ("Authorization", format!("Bearer {}", key)))
    }

    fn keep_alive_message(&self) -> Option<Message> {
        None
    }

    fn initial_message(
        &self,
        _api_key: Option<&str>,
        params: &ListenParams,
        _channels: u8,
    ) -> Option<Message> {
        let language = params
            .languages
            .first()
            .map(|l| l.iso639().code().to_string());

        let model = params.model.as_deref().unwrap_or(super::DEFAULT_MODEL);

        let session_config = SessionUpdateEvent {
            event_type: "session.update".to_string(),
            session: SessionConfig {
                session_type: "transcription".to_string(),
                audio: Some(AudioConfig {
                    input: Some(AudioInputConfig {
                        format: Some(AudioFormat {
                            format_type: "audio/pcm".to_string(),
                            rate: 24000,
                        }),
                        transcription: Some(TranscriptionConfig {
                            model: model.to_string(),
                            language,
                        }),
                        turn_detection: Some(TurnDetection {
                            detection_type: "server_vad".to_string(),
                            threshold: Some(0.5),
                            prefix_padding_ms: Some(300),
                            silence_duration_ms: Some(500),
                        }),
                    }),
                }),
                include: Some(vec!["item.input_audio_transcription.logprobs".to_string()]),
            },
        };

        let json = serde_json::to_string(&session_config).ok()?;
        Some(Message::Text(json.into()))
    }

    fn finalize_message(&self) -> Message {
        let commit = InputAudioBufferCommit {
            event_type: "input_audio_buffer.commit".to_string(),
        };
        Message::Text(serde_json::to_string(&commit).unwrap().into())
    }

    fn parse_response(&self, raw: &str) -> Vec<StreamResponse> {
        let event: OpenAIEvent = match serde_json::from_str(raw) {
            Ok(e) => e,
            Err(e) => {
                tracing::warn!(error = ?e, raw = raw, "openai_json_parse_failed");
                return vec![];
            }
        };

        match event {
            OpenAIEvent::SessionCreated { session } => {
                tracing::debug!(session_id = %session.id, "openai_session_created");
                vec![]
            }
            OpenAIEvent::SessionUpdated { session } => {
                tracing::debug!(session_id = %session.id, "openai_session_updated");
                vec![]
            }
            OpenAIEvent::InputAudioBufferCommitted { item_id } => {
                tracing::debug!(item_id = %item_id, "openai_audio_buffer_committed");
                vec![]
            }
            OpenAIEvent::InputAudioBufferCleared => {
                tracing::debug!("openai_audio_buffer_cleared");
                vec![]
            }
            OpenAIEvent::ConversationItemInputAudioTranscriptionCompleted {
                item_id,
                content_index,
                transcript,
            } => {
                tracing::debug!(
                    item_id = %item_id,
                    content_index = content_index,
                    transcript = %transcript,
                    "openai_transcription_completed"
                );
                Self::build_transcript_response(&transcript, true, true)
            }
            OpenAIEvent::ConversationItemInputAudioTranscriptionDelta {
                item_id,
                content_index,
                delta,
            } => {
                tracing::debug!(
                    item_id = %item_id,
                    content_index = content_index,
                    delta = %delta,
                    "openai_transcription_delta"
                );
                Self::build_transcript_response(&delta, false, false)
            }
            OpenAIEvent::ConversationItemInputAudioTranscriptionFailed {
                item_id, error, ..
            } => {
                tracing::error!(
                    item_id = %item_id,
                    error_type = %error.error_type,
                    error_message = %error.message,
                    "openai_transcription_failed"
                );
                vec![]
            }
            OpenAIEvent::Error { error } => {
                tracing::error!(
                    error_type = %error.error_type,
                    error_message = %error.message,
                    "openai_error"
                );
                vec![]
            }
            OpenAIEvent::Unknown => {
                tracing::debug!(raw = raw, "openai_unknown_event");
                vec![]
            }
        }
    }
}

#[derive(Debug, Serialize)]
struct SessionUpdateEvent {
    #[serde(rename = "type")]
    event_type: String,
    session: SessionConfig,
}

#[derive(Debug, Serialize)]
struct SessionConfig {
    #[serde(rename = "type")]
    session_type: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    audio: Option<AudioConfig>,
    #[serde(skip_serializing_if = "Option::is_none")]
    include: Option<Vec<String>>,
}

#[derive(Debug, Serialize)]
struct AudioConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    input: Option<AudioInputConfig>,
}

#[derive(Debug, Serialize)]
struct AudioInputConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    format: Option<AudioFormat>,
    #[serde(skip_serializing_if = "Option::is_none")]
    transcription: Option<TranscriptionConfig>,
    #[serde(skip_serializing_if = "Option::is_none")]
    turn_detection: Option<TurnDetection>,
}

#[derive(Debug, Serialize)]
struct AudioFormat {
    #[serde(rename = "type")]
    format_type: String,
    rate: u32,
}

#[derive(Debug, Serialize)]
struct TranscriptionConfig {
    model: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    language: Option<String>,
}

#[derive(Debug, Serialize)]
struct TurnDetection {
    #[serde(rename = "type")]
    detection_type: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    threshold: Option<f32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    prefix_padding_ms: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    silence_duration_ms: Option<u32>,
}

#[derive(Debug, Serialize)]
struct InputAudioBufferCommit {
    #[serde(rename = "type")]
    event_type: String,
}

#[derive(Debug, Deserialize)]
#[serde(tag = "type")]
enum OpenAIEvent {
    #[serde(rename = "session.created")]
    SessionCreated { session: SessionInfo },
    #[serde(rename = "session.updated")]
    SessionUpdated { session: SessionInfo },
    #[serde(rename = "input_audio_buffer.committed")]
    InputAudioBufferCommitted { item_id: String },
    #[serde(rename = "input_audio_buffer.cleared")]
    InputAudioBufferCleared,
    #[serde(rename = "conversation.item.input_audio_transcription.completed")]
    ConversationItemInputAudioTranscriptionCompleted {
        item_id: String,
        content_index: u32,
        transcript: String,
    },
    #[serde(rename = "conversation.item.input_audio_transcription.delta")]
    ConversationItemInputAudioTranscriptionDelta {
        item_id: String,
        content_index: u32,
        delta: String,
    },
    #[serde(rename = "conversation.item.input_audio_transcription.failed")]
    ConversationItemInputAudioTranscriptionFailed {
        item_id: String,
        content_index: u32,
        error: OpenAIError,
    },
    #[serde(rename = "error")]
    Error { error: OpenAIError },
    #[serde(other)]
    Unknown,
}

#[derive(Debug, Deserialize)]
struct SessionInfo {
    id: String,
}

#[derive(Debug, Deserialize)]
struct OpenAIError {
    #[serde(rename = "type")]
    error_type: String,
    message: String,
}

impl OpenAIAdapter {
    fn build_transcript_response(
        transcript: &str,
        is_final: bool,
        speech_final: bool,
    ) -> Vec<StreamResponse> {
        if transcript.is_empty() {
            return vec![];
        }

        let words: Vec<_> = transcript
            .split_whitespace()
            .map(|word| WordBuilder::new(word).confidence(1.0).build())
            .collect();

        let (start, duration) = calculate_time_span(&words);

        let channel = Channel {
            alternatives: vec![Alternatives {
                transcript: transcript.to_string(),
                words,
                confidence: 1.0,
                languages: vec![],
            }],
        };

        vec![StreamResponse::TranscriptResponse {
            is_final,
            speech_final,
            from_finalize: false,
            start,
            duration,
            channel,
            metadata: Metadata::default(),
            channel_index: vec![0, 1],
        }]
    }
}

#[cfg(test)]
mod tests {
    use super::OpenAIAdapter;
    use crate::test_utils::{run_dual_test, run_single_test};
    use crate::ListenClient;

    #[tokio::test]
    #[ignore]
    async fn test_build_single() {
        let client = ListenClient::builder()
            .adapter::<OpenAIAdapter>()
            .api_base("wss://api.openai.com")
            .api_key(std::env::var("OPENAI_API_KEY").expect("OPENAI_API_KEY not set"))
            .params(owhisper_interface::ListenParams {
                model: Some("gpt-4o-transcribe".to_string()),
                languages: vec![hypr_language::ISO639::En.into()],
                ..Default::default()
            })
            .build_single();

        run_single_test(client, "openai").await;
    }

    #[tokio::test]
    #[ignore]
    async fn test_build_dual() {
        let client = ListenClient::builder()
            .adapter::<OpenAIAdapter>()
            .api_base("wss://api.openai.com")
            .api_key(std::env::var("OPENAI_API_KEY").expect("OPENAI_API_KEY not set"))
            .params(owhisper_interface::ListenParams {
                model: Some("gpt-4o-transcribe".to_string()),
                languages: vec![hypr_language::ISO639::En.into()],
                ..Default::default()
            })
            .build_dual();

        run_dual_test(client, "openai").await;
    }
}
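As a usage note (again an illustration written for this summary, not part of the commit), an incoming delta event from the server looks roughly like the payload below; the field names follow the OpenAIEvent enum above, and the item_id value is made up. parse_response maps such a payload to a single interim transcript.

fn main() {
    // Hypothetical example of a server-sent delta event handled by parse_response.
    let raw = r#"{
        "type": "conversation.item.input_audio_transcription.delta",
        "item_id": "item_001",
        "content_index": 0,
        "delta": "hello world"
    }"#;
    // The adapter deserializes this into
    // OpenAIEvent::ConversationItemInputAudioTranscriptionDelta and returns one
    // TranscriptResponse with is_final = false and speech_final = false, whose
    // words come from splitting the delta on whitespace (confidence 1.0 each).
    println!("{raw}");
}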
