
Commit 497a78d

experimental assemblyai support

1 parent 9034f2a

File tree

5 files changed: +100 -62 lines changed

apps/desktop/src/components/settings/ai/stt/configure.tsx

Lines changed: 7 additions & 5 deletions
@@ -463,11 +463,13 @@ function ProviderContext({ providerId }: { providerId: ProviderId }) {
       you can do that in the **advanced** section.`
     : providerId === "soniox"
       ? `Use [Soniox](https://soniox.com) for transcriptions.`
-    : providerId === "fireworks"
-      ? `Use [Fireworks AI](https://fireworks.ai) for transcriptions.`
-    : providerId === "custom"
-      ? `We only support **Deepgram compatible** endpoints for now.`
-    : "";
+    : providerId === "assemblyai"
+      ? `Use [AssemblyAI](https://www.assemblyai.com) for transcriptions.`
+    : providerId === "fireworks"
+      ? `Use [Fireworks AI](https://fireworks.ai) for transcriptions.`
+    : providerId === "custom"
+      ? `We only support **Deepgram compatible** endpoints for now.`
+    : "";
 
   if (!content.trim()) {
     return null;

apps/desktop/src/components/settings/ai/stt/shared.tsx

Lines changed: 14 additions & 1 deletion
@@ -1,5 +1,5 @@
 import { Icon } from "@iconify-icon/react";
-import { Fireworks } from "@lobehub/icons";
+import { AssemblyAI, Fireworks } from "@lobehub/icons";
 import { queryOptions } from "@tanstack/react-query";
 import type { ReactNode } from "react";
 
@@ -32,6 +32,10 @@ export const displayModelId = (model: string) => {
     return "Soniox v3";
   }
 
+  if (model === "universal") {
+    return "Universal";
+  }
+
   if (model.startsWith("am-")) {
     const am = model as AmModel;
     if (am == "am-parakeet-v2") {
@@ -109,6 +113,15 @@ export const PROVIDERS = [
     models: ["stt-v3"],
     requiresPro: false,
   },
+  {
+    disabled: false,
+    id: "assemblyai",
+    displayName: "AssemblyAI",
+    icon: <AssemblyAI size={20} />,
+    baseUrl: "https://api.assemblyai.com",
+    models: ["universal"],
+    requiresPro: false,
+  },
   {
     disabled: false,
     id: "custom",

owhisper/owhisper-client/src/adapter/assemblyai/batch.rs

Lines changed: 6 additions & 0 deletions
@@ -13,6 +13,12 @@ use super::AssemblyAIAdapter;
 use crate::adapter::{BatchFuture, BatchSttAdapter};
 use crate::error::Error;
 
+// API
+// https://www.assemblyai.com/docs/api-reference/transcripts/submit.md
+// https://www.assemblyai.com/docs/api-reference/transcripts/get.md
+// Model & Language
+// https://www.assemblyai.com/docs/pre-recorded-audio/select-the-speech-model.md
+// https://www.assemblyai.com/docs/pre-recorded-audio/supported-languages.md
 impl BatchSttAdapter for AssemblyAIAdapter {
     fn transcribe_file<'a, P: AsRef<Path> + Send + 'a>(
         &'a self,
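
The submit/get doc links above correspond to a simple submit-then-poll flow against the REST API. A minimal standalone sketch of that flow (not the adapter's implementation), assuming `reqwest` with the `blocking` and `json` features plus `serde_json`; the `speech_model: "universal"` field mirrors the model exposed in the provider list:

```rust
use std::{thread::sleep, time::Duration};

// Sketch only: upload raw audio, submit a transcript job, poll until it finishes.
fn transcribe_batch(api_key: &str, audio: Vec<u8>) -> Result<String, Box<dyn std::error::Error>> {
    let http = reqwest::blocking::Client::new();

    // 1. Upload the audio bytes; the response carries a private upload_url.
    let upload: serde_json::Value = http
        .post("https://api.assemblyai.com/v2/upload")
        .header("authorization", api_key)
        .body(audio)
        .send()?
        .json()?;
    let upload_url = upload["upload_url"].as_str().ok_or("missing upload_url")?;

    // 2. Submit the transcription job for the uploaded file.
    let job: serde_json::Value = http
        .post("https://api.assemblyai.com/v2/transcript")
        .header("authorization", api_key)
        .json(&serde_json::json!({
            "audio_url": upload_url,
            "speech_model": "universal",
        }))
        .send()?
        .json()?;
    let id = job["id"].as_str().ok_or("missing transcript id")?.to_string();

    // 3. Poll GET /v2/transcript/{id} until status is `completed` or `error`.
    loop {
        let status: serde_json::Value = http
            .get(format!("https://api.assemblyai.com/v2/transcript/{id}"))
            .header("authorization", api_key)
            .send()?
            .json()?;
        match status["status"].as_str() {
            Some("completed") => {
                return Ok(status["text"].as_str().unwrap_or_default().to_string())
            }
            Some("error") => return Err(status["error"].to_string().into()),
            _ => sleep(Duration::from_secs(3)),
        }
    }
}
```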

owhisper/owhisper-client/src/adapter/assemblyai/live.rs

Lines changed: 67 additions & 56 deletions
@@ -6,8 +6,10 @@ use serde::Deserialize;
 use super::AssemblyAIAdapter;
 use crate::adapter::RealtimeSttAdapter;
 
+// https://www.assemblyai.com/docs/api-reference/streaming-api/streaming-api.md
 impl RealtimeSttAdapter for AssemblyAIAdapter {
     fn supports_native_multichannel(&self) -> bool {
+        // https://www.assemblyai.com/docs/universal-streaming/multichannel-streams.md
         false
     }
 
@@ -22,38 +24,25 @@ impl RealtimeSttAdapter for AssemblyAIAdapter {
         query_pairs.append_pair("encoding", "pcm_s16le");
         query_pairs.append_pair("format_turns", "true");
 
-        // Compute final speech_model and language_detection values
         let model = params
             .model
             .as_deref()
             .unwrap_or("universal-streaming-english");
-        let mut speech_model_final = match model {
-            "multilingual" | "universal-streaming-multilingual" => {
-                "universal-streaming-multilingual"
-            }
-            _ => "universal-streaming-english",
-        };
-        let mut language_detection = false;
-
-        if !params.languages.is_empty() {
-            if params.languages.len() > 1
-                || speech_model_final == "universal-streaming-multilingual"
-            {
-                language_detection = true;
-            } else if let Some(lang) = params.languages.first() {
-                let code = lang.iso639().code();
-                if code != "en" {
-                    speech_model_final = "universal-streaming-multilingual";
-                    language_detection = true;
-                }
-            }
-        }
 
-        query_pairs.append_pair("speech_model", speech_model_final);
+        let (speech_model, language, language_detection) =
+            Self::resolve_language_config(model, params);
+
+        query_pairs.append_pair("speech_model", speech_model);
+        query_pairs.append_pair("language", language);
         if language_detection {
             query_pairs.append_pair("language_detection", "true");
         }
 
+        if let Some(redemption_time) = params.redemption_time_ms {
+            let max_silence = redemption_time.to_string();
+            query_pairs.append_pair("max_turn_silence", &max_silence);
+        }
+
         if !params.keywords.is_empty() {
             let keyterms_json = serde_json::to_string(&params.keywords).unwrap_or_default();
             query_pairs.append_pair("keyterms_prompt", &keyterms_json);
@@ -64,8 +53,7 @@ impl RealtimeSttAdapter for AssemblyAIAdapter {
     }
 
     fn build_auth_header(&self, api_key: Option<&str>) -> Option<(&'static str, String)> {
-        // AssemblyAI accepts the API key directly in the Authorization header (no Bearer prefix)
-        api_key.map(|key| ("authorization", key.to_string()))
+        api_key.map(|key| ("Authorization", key.to_string()))
     }
 
     fn keep_alive_message(&self) -> Option<Message> {
@@ -86,8 +74,8 @@ impl RealtimeSttAdapter for AssemblyAIAdapter {
         };
 
         match msg {
-            AssemblyAIMessage::Begin { .. } => {
-                tracing::debug!("assemblyai_session_began");
+            AssemblyAIMessage::Begin { id, expires_at } => {
+                tracing::debug!(session_id = %id, expires_at = %expires_at, "assemblyai_session_began");
                 vec![]
             }
             AssemblyAIMessage::Turn(turn) => Self::parse_turn(turn),
@@ -107,6 +95,10 @@ impl RealtimeSttAdapter for AssemblyAIAdapter {
                     channels: 1,
                 }]
             }
+            AssemblyAIMessage::Error { error } => {
+                tracing::error!(error = %error, "assemblyai_error");
+                vec![]
+            }
             AssemblyAIMessage::Unknown => {
                 tracing::debug!(raw = raw, "assemblyai_unknown_message");
                 vec![]
@@ -119,16 +111,17 @@ impl RealtimeSttAdapter for AssemblyAIAdapter {
 #[serde(tag = "type")]
 enum AssemblyAIMessage {
     Begin {
-        #[allow(dead_code)]
         id: String,
-        #[allow(dead_code)]
         expires_at: u64,
     },
     Turn(TurnMessage),
     Termination {
        audio_duration_seconds: u64,
        session_duration_seconds: u64,
     },
+    Error {
+        error: String,
+    },
     #[serde(other)]
     Unknown,
 }
@@ -172,6 +165,28 @@ struct AssemblyAIWord {
 }
 
 impl AssemblyAIAdapter {
+    fn resolve_language_config(
+        model: &str,
+        params: &ListenParams,
+    ) -> (&'static str, &'static str, bool) {
+        let is_multilingual_model =
+            matches!(model, "multilingual" | "universal-streaming-multilingual");
+
+        let needs_multilingual = is_multilingual_model
+            || params.languages.len() > 1
+            || params
+                .languages
+                .first()
+                .map(|l| l.iso639().code() != "en")
+                .unwrap_or(false);
+
+        if needs_multilingual {
+            ("universal-streaming-multilingual", "multi", true)
+        } else {
+            ("universal-streaming-english", "en", false)
+        }
+    }
+
     fn parse_turn(turn: TurnMessage) -> Vec<StreamResponse> {
         tracing::debug!(
             transcript = %turn.transcript,
@@ -217,8 +232,26 @@ impl AssemblyAIAdapter {
 
         let transcript = if turn.turn_is_formatted {
             turn.transcript.clone()
+        } else if let Some(ref utt) = turn.utterance {
+            if !utt.is_empty() {
+                utt.clone()
+            } else if !turn.transcript.is_empty() {
+                turn.transcript.clone()
+            } else {
+                words
+                    .iter()
+                    .map(|w| w.word.as_str())
+                    .collect::<Vec<_>>()
+                    .join(" ")
+            }
+        } else if !turn.transcript.is_empty() {
+            turn.transcript.clone()
         } else {
-            turn.utterance.clone().unwrap_or(turn.transcript.clone())
+            words
+                .iter()
+                .map(|w| w.word.as_str())
+                .collect::<Vec<_>>()
+                .join(" ")
         };
 
         let channel = Channel {
@@ -249,11 +282,9 @@ mod tests {
     use hypr_audio_utils::AudioFormatExt;
 
     use super::AssemblyAIAdapter;
-    use crate::live::{FinalizeHandle, ListenClientInput};
+    use crate::live::ListenClientInput;
     use crate::ListenClient;
 
-    // Integration test that makes real network calls to AssemblyAI.
-    // Run explicitly with: cargo test -p owhisper-client test_client -- --ignored
     #[tokio::test]
     #[ignore]
     async fn test_client() {
@@ -283,42 +314,22 @@ mod tests {
         })
         .build_single();
 
-        let (stream, handle) = client.from_realtime_audio(input).await.unwrap();
+        let (stream, _handle) = client.from_realtime_audio(input).await.unwrap();
         futures_util::pin_mut!(stream);
 
-        let mut saw_transcript = false;
         while let Some(result) = stream.next().await {
             match result {
                 Ok(response) => match response {
                     owhisper_interface::stream::StreamResponse::TranscriptResponse {
                         channel,
-                        speech_final,
                         ..
                     } => {
-                        let transcript = &channel.alternatives.first().unwrap().transcript;
-                        println!(
-                            "Transcript (speech_final={}): {:?}",
-                            speech_final, transcript
-                        );
-                        if !transcript.is_empty() {
-                            saw_transcript = true;
-                            break;
-                        }
+                        println!("{:?}", channel.alternatives.first().unwrap().transcript);
                     }
                     _ => {}
                 },
-                Err(e) => {
-                    println!("Error: {:?}", e);
-                    break;
-                }
+                _ => {}
            }
        }
-
-        handle.finalize().await;
-
-        assert!(
-            saw_transcript,
-            "expected at least one non-empty transcript from AssemblyAI"
-        );
     }
 }
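
For reference, the decision the new `resolve_language_config` helper makes can be restated standalone, with the `ListenParams` language lookup flattened to plain ISO-639-1 codes (illustrative only; the adapter's real helper takes `&ListenParams`):

```rust
// Mirrors resolve_language_config: English-only stays on the English streaming
// model; a non-English or multi-language request switches to the multilingual
// model with language=multi and language_detection enabled.
fn resolve(model: &str, languages: &[&str]) -> (&'static str, &'static str, bool) {
    let is_multilingual_model =
        matches!(model, "multilingual" | "universal-streaming-multilingual");

    let needs_multilingual = is_multilingual_model
        || languages.len() > 1
        || languages.first().map(|l| *l != "en").unwrap_or(false);

    if needs_multilingual {
        ("universal-streaming-multilingual", "multi", true)
    } else {
        ("universal-streaming-english", "en", false)
    }
}

fn main() {
    // English-only requests keep the English streaming model, no detection.
    assert_eq!(
        resolve("universal-streaming-english", &["en"]),
        ("universal-streaming-english", "en", false)
    );
    // A single non-English language switches to the multilingual model.
    assert_eq!(
        resolve("universal-streaming-english", &["ko"]),
        ("universal-streaming-multilingual", "multi", true)
    );
    // More than one language also forces the multilingual path.
    assert_eq!(
        resolve("universal-streaming-english", &["en", "de"]),
        ("universal-streaming-multilingual", "multi", true)
    );
}
```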

owhisper/owhisper-client/src/adapter/assemblyai/mod.rs

Lines changed: 6 additions & 0 deletions
@@ -12,6 +12,12 @@ impl AssemblyAIAdapter {
                 .expect("invalid_default_ws_url");
         }
 
+        if api_base.contains(".eu.") || api_base.ends_with("-eu") {
+            return "wss://streaming.eu.assemblyai.com/v3/ws"
+                .parse()
+                .expect("invalid_eu_ws_url");
+        }
+
         let mut url: url::Url = api_base.parse().expect("invalid_api_base");
 
         let mut path = url.path().to_string();
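
A rough sketch of the routing rule this hunk adds: EU-looking base URLs go straight to the EU streaming host, everything else falls through to the default streaming endpoint. The function name `streaming_ws_url` is hypothetical, and the real method also rewrites custom bases into a WebSocket URL (elided here); it assumes the `url` crate.

```rust
// Sketch under assumptions: picks the AssemblyAI streaming host from the
// configured HTTP API base, routing EU bases to the EU endpoint.
fn streaming_ws_url(api_base: &str) -> url::Url {
    if api_base.contains(".eu.") || api_base.ends_with("-eu") {
        return "wss://streaming.eu.assemblyai.com/v3/ws"
            .parse()
            .expect("invalid_eu_ws_url");
    }
    // Non-EU bases would continue into the adapter's URL rewriting; here we
    // just fall back to the documented default streaming endpoint.
    "wss://streaming.assemblyai.com/v3/ws"
        .parse()
        .expect("invalid_default_ws_url")
}

fn main() {
    assert_eq!(
        streaming_ws_url("https://api.eu.assemblyai.com").as_str(),
        "wss://streaming.eu.assemblyai.com/v3/ws"
    );
    assert_eq!(
        streaming_ws_url("https://api.assemblyai.com").as_str(),
        "wss://streaming.assemblyai.com/v3/ws"
    );
}
```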
