exclude rodio from apps/ai build

yujonglee · yujonglee · commit 54392f2853c1 · 2025-12-20T20:34:40.000+09:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/apps/ai/Dockerfile b/apps/ai/Dockerfile
@@ -13,7 +13,6 @@ RUN mkdir -p apps/ai/src && echo "fn main() {}" > apps/ai/src/main.rs
 RUN cargo chef prepare --recipe-path recipe.json
 
 FROM rust:${RUST_VERSION}-bookworm AS build
-RUN apt-get update && apt-get install -y pkg-config libasound2-dev && rm -rf /var/lib/apt/lists/*
 RUN cargo install cargo-chef sccache --locked
 ENV RUSTC_WRAPPER=sccache \
     SCCACHE_DIR=/sccache
diff --git a/crates/owhisper-client/Cargo.toml b/crates/owhisper-client/Cargo.toml
@@ -3,9 +3,12 @@ name = "owhisper-client"
 version = "0.0.1"
 edition = "2024"
 
+[features]
+default = []
+argmax = ["hypr-audio-utils"]
+
 [dependencies]
-hypr-audio = { workspace = true }
-hypr-audio-utils = { workspace = true }
+hypr-audio-utils = { workspace = true, optional = true }
 hypr-language = { workspace = true }
 hypr-ws-client = { workspace = true }
 
@@ -28,6 +31,7 @@ tracing = { workspace = true }
 url = { workspace = true }
 
 [dev-dependencies]
+hypr-audio-utils = { workspace = true }
 hypr-data = { workspace = true }
 
 deepgram = { workspace = true, features = ["listen"] }
diff --git a/crates/owhisper-client/src/adapter/argmax/mod.rs b/crates/owhisper-client/src/adapter/argmax/mod.rs
@@ -1,8 +1,12 @@
+#[cfg(feature = "argmax")]
 mod batch;
 mod keywords;
 mod language;
 mod live;
 
+#[cfg(feature = "argmax")]
+pub use batch::{StreamingBatchConfig, StreamingBatchEvent, StreamingBatchStream};
+
 #[derive(Clone, Default)]
 pub struct ArgmaxAdapter;
 
diff --git a/crates/owhisper-client/src/adapter/deepgram/batch.rs b/crates/owhisper-client/src/adapter/deepgram/batch.rs
@@ -1,6 +1,5 @@
 use std::path::{Path, PathBuf};
 
-use hypr_audio_utils::{Source, f32_to_i16_bytes, resample_audio, source_from_path};
 use owhisper_interface::ListenParams;
 use owhisper_interface::batch::Response as BatchResponse;
 
@@ -26,28 +25,38 @@ impl BatchSttAdapter for DeepgramAdapter {
     }
 }
 
+/// Returns the MIME type for `path`'s extension, ignoring ASCII case (e.g. `.MP3`, `.Wav`).
+fn mime_type_from_extension(path: &Path) -> &'static str {
+    match path.extension().and_then(|e| e.to_str()) {
+        Some(e) if e.eq_ignore_ascii_case("mp3") => "audio/mpeg",
+        Some(e) if e.eq_ignore_ascii_case("mp4") || e.eq_ignore_ascii_case("m4a") => "audio/mp4",
+        Some(e) if e.eq_ignore_ascii_case("wav") => "audio/wav",
+        Some(e) if e.eq_ignore_ascii_case("webm") => "audio/webm",
+        Some(e) if e.eq_ignore_ascii_case("ogg") => "audio/ogg",
+        Some(e) if e.eq_ignore_ascii_case("flac") => "audio/flac",
+        _ => "application/octet-stream", // missing, non-UTF-8, or unrecognised extension
+    }
+}
+
 async fn do_transcribe_file(
     client: &ClientWithMiddleware,
     api_base: &str,
     api_key: &str,
     params: &ListenParams,
     file_path: PathBuf,
 ) -> Result<BatchResponse, Error> {
-    let (audio_data, sample_rate) = decode_audio_to_linear16(file_path).await?;
+    let audio_data = tokio::fs::read(&file_path)
+        .await
+        .map_err(|e| Error::AudioProcessing(format!("failed to read file: {}", e)))?;
 
-    let url = {
-        let mut url = build_batch_url(
-            api_base,
-            params,
-            &DeepgramLanguageStrategy,
-            &DeepgramKeywordStrategy,
-        );
-        url.query_pairs_mut()
-            .append_pair("sample_rate", &sample_rate.to_string());
-        url
-    };
+    let content_type = mime_type_from_extension(&file_path);
 
-    let content_type = format!("audio/raw;encoding=linear16;rate={}", sample_rate);
+    let url = build_batch_url(
+        api_base,
+        params,
+        &DeepgramLanguageStrategy,
+        &DeepgramKeywordStrategy,
+    );
 
     let response = client
         .post(url)
@@ -69,45 +78,6 @@ async fn do_transcribe_file(
     }
 }
 
-async fn decode_audio_to_linear16(path: PathBuf) -> Result<(bytes::Bytes, u32), Error> {
-    tokio::task::spawn_blocking(move || -> Result<(bytes::Bytes, u32), Error> {
-        let decoder =
-            source_from_path(&path).map_err(|err| Error::AudioProcessing(err.to_string()))?;
-
-        let channels = decoder.channels().max(1);
-        let sample_rate = decoder.sample_rate();
-
-        let samples = resample_audio(decoder, sample_rate)
-            .map_err(|err| Error::AudioProcessing(err.to_string()))?;
-
-        let samples = if channels == 1 {
-            samples
-        } else {
-            let channels_usize = channels as usize;
-            let mut mono = Vec::with_capacity(samples.len() / channels_usize);
-            for frame in samples.chunks(channels_usize) {
-                if frame.is_empty() {
-                    continue;
-                }
-                let sum: f32 = frame.iter().copied().sum();
-                mono.push(sum / frame.len() as f32);
-            }
-            mono
-        };
-
-        if samples.is_empty() {
-            return Err(Error::AudioProcessing(
-                "audio file contains no samples".to_string(),
-            ));
-        }
-
-        let bytes = f32_to_i16_bytes(samples.into_iter());
-
-        Ok((bytes, sample_rate))
-    })
-    .await?
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/owhisper-client/src/lib.rs b/crates/owhisper-client/src/lib.rs
@@ -15,6 +15,9 @@ pub use adapter::{
     FireworksAdapter, GladiaAdapter, OpenAIAdapter, RealtimeSttAdapter, SonioxAdapter,
     append_provider_param, is_local_host,
 };
+#[cfg(feature = "argmax")]
+pub use adapter::{StreamingBatchConfig, StreamingBatchEvent, StreamingBatchStream};
+
 pub use batch::{BatchClient, BatchClientBuilder};
 pub use error::Error;
 pub use hypr_ws_client;
diff --git a/crates/transcribe-proxy/Cargo.toml b/crates/transcribe-proxy/Cargo.toml
@@ -32,7 +32,6 @@ url = { workspace = true }
 hypr-audio-utils = { workspace = true }
 hypr-data = { workspace = true }
 hypr-language = { workspace = true }
-owhisper-client = { workspace = true }
 owhisper-interface = { workspace = true }
 
 rodio = { workspace = true }
diff --git a/plugins/listener2/Cargo.toml b/plugins/listener2/Cargo.toml
@@ -17,7 +17,7 @@ specta-typescript = { workspace = true }
 hypr-audio-utils = { workspace = true }
 hypr-language = { workspace = true }
 
-owhisper-client = { workspace = true }
+owhisper-client = { workspace = true, features = ["argmax"] }
 owhisper-interface = { workspace = true }
 
 tauri = { workspace = true, features = ["specta", "test"] }