Skip to content

Commit 0ba52ce

Browse files
authored
feat(stt): add base_url and language support for local whisper servers (#616)
Add base_url and language fields to SttConfig, allowing the Whisper STT provider to target OpenAI-compatible local servers (e.g. whisper.cpp) without requiring an OpenAI API key. Pass language parameter in transcription requests for accurate non-English speech recognition. Preserve voice message attachments through drain_channel buffering and add configurable language support for the candle-whisper backend.
1 parent e3927e6 commit 0ba52ce

File tree

10 files changed

+180
-46
lines changed

10 files changed

+180
-46
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
77
## [Unreleased]
88

99
### Added
10+
- `base_url` and `language` fields in `[llm.stt]` config for OpenAI-compatible local whisper servers (e.g. whisper.cpp)
11+
- `ZEPH_STT_BASE_URL` and `ZEPH_STT_LANGUAGE` environment variable overrides
12+
- Whisper API provider now passes `language` parameter for accurate non-English transcription
13+
- Documentation for whisper.cpp server setup with Metal acceleration on macOS
1014
- Per-sub-provider `base_url` and `embedding_model` overrides in orchestrator config
1115
- Full orchestrator example with cloud + local + STT in default.toml
1216
- All previously undocumented config keys in default.toml (`agent.auto_update_check`, `llm.stt`, `llm.vision_model`, `skills.disambiguation_threshold`, `tools.filters.*`, `tools.permissions`, `a2a.auth_token`, `mcp.servers.env`)
@@ -17,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1721
- Vault age backend now falls back to default directory for key/path when `--vault-key`/`--vault-path` are not provided, matching `zeph vault init` behavior (#613)
1822

1923
### Changed
24+
- Whisper STT provider no longer requires an OpenAI API key when `base_url` points to a local server
2025
- Orchestrator sub-providers now resolve `base_url` and `embedding_model` via fallback chain: per-provider, parent section, global default
2126

2227
## [0.11.1] - 2026-02-19

crates/zeph-core/src/agent/mod.rs

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ struct QueuedMessage {
7373
text: String,
7474
received_at: Instant,
7575
image_parts: Vec<zeph_llm::provider::MessagePart>,
76+
raw_attachments: Vec<crate::channel::Attachment>,
7677
}
7778

7879
pub(super) struct MemoryState {
@@ -560,20 +561,23 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
560561
self.message_queue.pop_back();
561562
continue;
562563
}
563-
self.enqueue_or_merge(msg.text, vec![]);
564+
self.enqueue_or_merge(msg.text, vec![], msg.attachments);
564565
}
565566
}
566567

567568
fn enqueue_or_merge(
568569
&mut self,
569570
text: String,
570571
image_parts: Vec<zeph_llm::provider::MessagePart>,
572+
raw_attachments: Vec<crate::channel::Attachment>,
571573
) {
572574
let now = Instant::now();
573575
if let Some(last) = self.message_queue.back_mut()
574576
&& now.duration_since(last.received_at) < MESSAGE_MERGE_WINDOW
575577
&& last.image_parts.is_empty()
576578
&& image_parts.is_empty()
579+
&& last.raw_attachments.is_empty()
580+
&& raw_attachments.is_empty()
577581
{
578582
last.text.push('\n');
579583
last.text.push_str(&text);
@@ -584,6 +588,7 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
584588
text,
585589
received_at: now,
586590
image_parts,
591+
raw_attachments,
587592
});
588593
} else {
589594
tracing::warn!("message queue full, dropping message");
@@ -649,7 +654,15 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
649654

650655
let (text, image_parts) = if let Some(queued) = self.message_queue.pop_front() {
651656
self.notify_queue_count().await;
652-
(queued.text, queued.image_parts)
657+
if queued.raw_attachments.is_empty() {
658+
(queued.text, queued.image_parts)
659+
} else {
660+
let msg = crate::channel::ChannelMessage {
661+
text: queued.text,
662+
attachments: queued.raw_attachments,
663+
};
664+
self.resolve_message(msg).await
665+
}
653666
} else {
654667
let incoming = tokio::select! {
655668
result = self.channel.recv() => result?,
@@ -708,6 +721,12 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
708721
.into_iter()
709722
.partition(|a| a.kind == AttachmentKind::Audio);
710723

724+
tracing::debug!(
725+
audio = audio_attachments.len(),
726+
has_stt = self.stt.is_some(),
727+
"resolve_message attachments"
728+
);
729+
711730
let text = if !audio_attachments.is_empty()
712731
&& let Some(stt) = self.stt.as_ref()
713732
{
@@ -2029,7 +2048,7 @@ pub(super) mod agent_tests {
20292048
let executor = MockToolExecutor::no_tools();
20302049
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);
20312050

2032-
agent.enqueue_or_merge("hello".into(), vec![]);
2051+
agent.enqueue_or_merge("hello".into(), vec![], vec![]);
20332052
assert_eq!(agent.message_queue.len(), 1);
20342053
assert_eq!(agent.message_queue[0].text, "hello");
20352054
}
@@ -2042,8 +2061,8 @@ pub(super) mod agent_tests {
20422061
let executor = MockToolExecutor::no_tools();
20432062
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);
20442063

2045-
agent.enqueue_or_merge("first".into(), vec![]);
2046-
agent.enqueue_or_merge("second".into(), vec![]);
2064+
agent.enqueue_or_merge("first".into(), vec![], vec![]);
2065+
agent.enqueue_or_merge("second".into(), vec![], vec![]);
20472066
assert_eq!(agent.message_queue.len(), 1);
20482067
assert_eq!(agent.message_queue[0].text, "first\nsecond");
20492068
}
@@ -2060,8 +2079,9 @@ pub(super) mod agent_tests {
20602079
text: "old".into(),
20612080
received_at: Instant::now() - Duration::from_secs(2),
20622081
image_parts: vec![],
2082+
raw_attachments: vec![],
20632083
});
2064-
agent.enqueue_or_merge("new".into(), vec![]);
2084+
agent.enqueue_or_merge("new".into(), vec![], vec![]);
20652085
assert_eq!(agent.message_queue.len(), 2);
20662086
assert_eq!(agent.message_queue[0].text, "old");
20672087
assert_eq!(agent.message_queue[1].text, "new");
@@ -2080,9 +2100,10 @@ pub(super) mod agent_tests {
20802100
text: format!("msg{i}"),
20812101
received_at: Instant::now() - Duration::from_secs(2),
20822102
image_parts: vec![],
2103+
raw_attachments: vec![],
20832104
});
20842105
}
2085-
agent.enqueue_or_merge("overflow".into(), vec![]);
2106+
agent.enqueue_or_merge("overflow".into(), vec![], vec![]);
20862107
assert_eq!(agent.message_queue.len(), MAX_QUEUE_SIZE);
20872108
}
20882109

@@ -2094,11 +2115,11 @@ pub(super) mod agent_tests {
20942115
let executor = MockToolExecutor::no_tools();
20952116
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);
20962117

2097-
agent.enqueue_or_merge("a".into(), vec![]);
2118+
agent.enqueue_or_merge("a".into(), vec![], vec![]);
20982119
// Wait past merge window
20992120
agent.message_queue.back_mut().unwrap().received_at =
21002121
Instant::now() - Duration::from_secs(1);
2101-
agent.enqueue_or_merge("b".into(), vec![]);
2122+
agent.enqueue_or_merge("b".into(), vec![], vec![]);
21022123
assert_eq!(agent.message_queue.len(), 2);
21032124

21042125
let count = agent.clear_queue();
@@ -2137,6 +2158,7 @@ pub(super) mod agent_tests {
21372158
text: format!("pre{i}"),
21382159
received_at: Instant::now() - Duration::from_secs(2),
21392160
image_parts: vec![],
2161+
raw_attachments: vec![],
21402162
});
21412163
}
21422164
agent.drain_channel();
@@ -2157,6 +2179,7 @@ pub(super) mod agent_tests {
21572179
text: format!("msg{i}"),
21582180
received_at: Instant::now() - Duration::from_secs(2),
21592181
image_parts: vec![],
2182+
raw_attachments: vec![],
21602183
});
21612184
}
21622185

crates/zeph-core/src/bootstrap.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,8 @@ pub fn create_provider(config: &Config) -> anyhow::Result<AnyProvider> {
555555
providers,
556556
))))
557557
}
558-
other => bail!("LLM provider {other} not available"),
558+
#[cfg(not(feature = "candle"))]
559+
ProviderKind::Candle => bail!("candle feature is not enabled"),
559560
}
560561
}
561562

crates/zeph-core/src/config/env.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use super::{Config, SttConfig, default_stt_model, default_stt_provider};
1+
use super::{Config, SttConfig, default_stt_language, default_stt_model, default_stt_provider};
22

33
impl Config {
44
pub(crate) fn apply_env_overrides(&mut self) {
@@ -141,16 +141,38 @@ impl Config {
141141
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
142142
provider: default_stt_provider(),
143143
model: default_stt_model(),
144+
language: default_stt_language(),
145+
base_url: None,
144146
});
145147
stt.provider = v;
146148
}
147149
if let Ok(v) = std::env::var("ZEPH_STT_MODEL") {
148150
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
149151
provider: default_stt_provider(),
150152
model: default_stt_model(),
153+
language: default_stt_language(),
154+
base_url: None,
151155
});
152156
stt.model = v;
153157
}
158+
if let Ok(v) = std::env::var("ZEPH_STT_LANGUAGE") {
159+
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
160+
provider: default_stt_provider(),
161+
model: default_stt_model(),
162+
language: default_stt_language(),
163+
base_url: None,
164+
});
165+
stt.language = v;
166+
}
167+
if let Ok(v) = std::env::var("ZEPH_STT_BASE_URL") {
168+
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
169+
provider: default_stt_provider(),
170+
model: default_stt_model(),
171+
language: default_stt_language(),
172+
base_url: None,
173+
});
174+
stt.base_url = Some(v);
175+
}
154176
if let Ok(v) = std::env::var("ZEPH_AUTO_UPDATE_CHECK")
155177
&& let Ok(enabled) = v.parse::<bool>()
156178
{

crates/zeph-core/src/config/types.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ pub struct SttConfig {
135135
pub provider: String,
136136
#[serde(default = "default_stt_model")]
137137
pub model: String,
138+
#[serde(default = "default_stt_language")]
139+
pub language: String,
140+
#[serde(default)]
141+
pub base_url: Option<String>,
138142
}
139143

140144
pub(crate) fn default_stt_provider() -> String {
@@ -145,6 +149,10 @@ pub(crate) fn default_stt_model() -> String {
145149
"whisper-1".into()
146150
}
147151

152+
pub(crate) fn default_stt_language() -> String {
153+
"auto".into()
154+
}
155+
148156
#[derive(Debug, Deserialize, Serialize)]
149157
pub struct CloudLlmConfig {
150158
pub model: String,

crates/zeph-llm/src/candle_whisper.rs

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pub struct CandleWhisperProvider {
1818
mel_filters: Vec<f32>,
1919
tokenizer: Arc<Tokenizer>,
2020
device: Device,
21+
language: String,
2122
}
2223

2324
impl std::fmt::Debug for CandleWhisperProvider {
@@ -58,7 +59,7 @@ impl CandleWhisperProvider {
5859
/// # Errors
5960
///
6061
/// Returns `LlmError::ModelLoad` if downloading or loading fails.
61-
pub fn load(repo_id: &str, device: Option<Device>) -> Result<Self, LlmError> {
62+
pub fn load(repo_id: &str, device: Option<Device>, language: &str) -> Result<Self, LlmError> {
6263
let device = device.unwrap_or_else(detect_device);
6364
tracing::info!(
6465
repo = repo_id,
@@ -117,6 +118,7 @@ impl CandleWhisperProvider {
117118
mel_filters,
118119
tokenizer: Arc::new(tokenizer),
119120
device,
121+
language: language.to_string(),
120122
})
121123
}
122124

@@ -145,8 +147,15 @@ impl CandleWhisperProvider {
145147
.token_to_id(m::EOT_TOKEN)
146148
.ok_or_else(|| LlmError::TranscriptionFailed("missing EOT token".into()))?;
147149

148-
let language_token = self.tokenizer.token_to_id("<|en|>").ok_or_else(|| {
149-
LlmError::TranscriptionFailed("language token not found in tokenizer".into())
150+
let lang_tag = if self.language == "auto" {
151+
"<|en|>".to_string()
152+
} else {
153+
format!("<|{}|>", self.language)
154+
};
155+
let language_token = self.tokenizer.token_to_id(&lang_tag).ok_or_else(|| {
156+
LlmError::TranscriptionFailed(format!(
157+
"language token {lang_tag} not found in tokenizer"
158+
))
150159
})?;
151160

152161
let mut model = self
@@ -189,7 +198,14 @@ impl CandleWhisperProvider {
189198

190199
Ok(Transcription {
191200
text: text.trim().to_string(),
192-
language: Some("en".into()),
201+
language: Some(
202+
if self.language == "auto" {
203+
"en"
204+
} else {
205+
&self.language
206+
}
207+
.into(),
208+
),
193209
duration_secs: Some(pcm.len() as f32 / m::SAMPLE_RATE as f32),
194210
})
195211
}

crates/zeph-llm/src/whisper.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pub struct WhisperProvider {
99
api_key: String,
1010
base_url: String,
1111
model: String,
12+
language: Option<String>,
1213
}
1314

1415
impl WhisperProvider {
@@ -24,8 +25,18 @@ impl WhisperProvider {
2425
api_key: api_key.into(),
2526
base_url: base_url.into(),
2627
model: model.into(),
28+
language: None,
2729
}
2830
}
31+
32+
#[must_use]
33+
pub fn with_language(mut self, language: impl Into<String>) -> Self {
34+
let lang = language.into();
35+
if lang != "auto" && !lang.is_empty() {
36+
self.language = Some(lang);
37+
}
38+
self
39+
}
2940
}
3041

3142
impl std::fmt::Debug for WhisperProvider {
@@ -56,10 +67,13 @@ impl SpeechToText for WhisperProvider {
5667
.mime_str("application/octet-stream")
5768
.map_err(|e| LlmError::TranscriptionFailed(e.to_string()))?;
5869

59-
let form = reqwest::multipart::Form::new()
70+
let mut form = reqwest::multipart::Form::new()
6071
.text("model", self.model.clone())
6172
.text("response_format", "json")
6273
.part("file", part);
74+
if let Some(ref lang) = self.language {
75+
form = form.text("language", lang.clone());
76+
}
6377

6478
let url = format!(
6579
"{}/audio/transcriptions",

docs/src/advanced/multimodal.md

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,56 @@ provider = "whisper"
2020
model = "whisper-1"
2121
```
2222

23-
The Whisper provider inherits the OpenAI API key from `[llm.openai]` or `ZEPH_OPENAI_API_KEY`. Environment variable overrides: `ZEPH_STT_PROVIDER`, `ZEPH_STT_MODEL`.
23+
When `base_url` is omitted, the provider uses the OpenAI API key from `[llm.openai]` or `ZEPH_OPENAI_API_KEY`. Set `base_url` to point at any OpenAI-compatible server (no API key required for local servers). The `language` field accepts an [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) code (e.g. `ru`, `en`, `de`) or `auto` for automatic detection. Note: with the `whisper` provider, `auto` simply omits the `language` parameter so the server performs detection; the `candle-whisper` backend currently falls back to English when set to `auto`.
24+
25+
Environment variable overrides: `ZEPH_STT_PROVIDER`, `ZEPH_STT_MODEL`, `ZEPH_STT_LANGUAGE`, `ZEPH_STT_BASE_URL`.
2426

2527
### Backends
2628

2729
| Backend | Provider | Feature | Description |
2830
|---------|----------|---------|-------------|
2931
| OpenAI Whisper API | `whisper` | `stt` | Cloud-based transcription |
32+
| OpenAI-compatible server | `whisper` | `stt` | Any local server with `/v1/audio/transcriptions` |
3033
| Local Whisper | `candle-whisper` | `candle` | Fully offline via candle |
3134

35+
### Local Whisper Server (whisper.cpp)
36+
37+
This is the recommended setup for local speech-to-text: it uses Metal acceleration on Apple Silicon and handles all audio formats (including Telegram OGG/Opus) server-side.
38+
39+
**Install and run:**
40+
41+
```bash
42+
brew install whisper-cpp
43+
44+
# Download a model
45+
curl -L -o ~/.cache/whisper/ggml-large-v3.bin \
46+
https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin
47+
48+
# Start the server
49+
whisper-server \
50+
--model ~/.cache/whisper/ggml-large-v3.bin \
51+
--host 127.0.0.1 --port 8080 \
52+
--inference-path "/v1/audio/transcriptions" \
53+
--convert
54+
```
55+
56+
**Configure Zeph:**
57+
58+
```toml
59+
[llm.stt]
60+
provider = "whisper"
61+
model = "large-v3"
62+
base_url = "http://127.0.0.1:8080/v1"
63+
language = "en" # ISO-639-1 code or "auto"
64+
```
65+
66+
| Model | Parameters | Disk | Notes |
67+
|-------|------------|------|-------|
68+
| `ggml-tiny` | 39M | ~75 MB | Fastest, lower accuracy |
69+
| `ggml-base` | 74M | ~142 MB | Good balance |
70+
| `ggml-small` | 244M | ~466 MB | Better accuracy |
71+
| `ggml-large-v3` | 1.5B | ~2.9 GB | Best accuracy |
72+
3273
### Local Whisper (Candle)
3374

3475
```bash

0 commit comments

Comments
 (0)