
Commit ae2a949

Refactor OpenAI adapter and add model support (#2128)
- Remove dead code (OpenAISegment struct, segments field)
- Extract magic values to constants (VAD config, response formats)
- Add model-aware request building for batch API
  - whisper-1: uses verbose_json with word timestamps
  - gpt-4o-transcribe/gpt-4o-mini-transcribe: use json format
- Update UI to show all OpenAI STT models
- Add model documentation comments

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 5a56b7a commit ae2a949

4 files changed (+44 lines, −21 lines)

apps/desktop/src/components/settings/ai/stt/shared.tsx

Lines changed: 9 additions & 1 deletion
@@ -44,6 +44,14 @@ export const displayModelId = (model: string) => {
     return "Whisper 1";
   }
 
+  if (model === "gpt-4o-transcribe") {
+    return "GPT-4o Transcribe";
+  }
+
+  if (model === "gpt-4o-mini-transcribe") {
+    return "GPT-4o mini Transcribe";
+  }
+
   if (model.startsWith("am-")) {
     const am = model as AmModel;
     if (am == "am-parakeet-v2") {
@@ -153,7 +161,7 @@ export const PROVIDERS = [
     badge: "Beta",
     icon: <OpenAI size={16} />,
     baseUrl: "https://api.openai.com/v1",
-    models: ["whisper-1"],
+    models: ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"],
     requiresPro: false,
   },
   {

owhisper/owhisper-client/src/adapter/openai/batch.rs

Lines changed: 19 additions & 16 deletions
@@ -11,6 +11,14 @@ use super::OpenAIAdapter;
 
 const DEFAULT_API_BASE: &str = "https://api.openai.com/v1";
 const DEFAULT_MODEL: &str = "whisper-1";
+const RESPONSE_FORMAT_VERBOSE: &str = "verbose_json";
+const RESPONSE_FORMAT_JSON: &str = "json";
+const TIMESTAMP_GRANULARITY: &str = "word";
+
+// Models that support verbose_json with word-level timestamps
+fn supports_word_timestamps(model: &str) -> bool {
+    model == "whisper-1"
+}
 
 impl BatchSttAdapter for OpenAIAdapter {
     fn transcribe_file<'a, P: AsRef<Path> + Send + 'a>(
@@ -33,17 +41,6 @@ struct OpenAIWord {
     end: f64,
 }
 
-#[derive(Debug, serde::Deserialize)]
-struct OpenAISegment {
-    #[allow(dead_code)]
-    id: i32,
-    #[allow(dead_code)]
-    seek: i32,
-    start: f64,
-    end: f64,
-    text: String,
-}
-
 #[derive(Debug, serde::Deserialize)]
 struct OpenAIVerboseResponse {
     #[allow(dead_code)]
@@ -54,8 +51,6 @@ struct OpenAIVerboseResponse {
     text: String,
     #[serde(default)]
     words: Vec<OpenAIWord>,
-    #[serde(default)]
-    segments: Vec<OpenAISegment>,
 }
 
 async fn do_transcribe_file(
@@ -91,9 +86,17 @@ async fn do_transcribe_file(
 
     let mut form = Form::new()
         .part("file", file_part)
-        .text("model", model.to_string())
-        .text("response_format", "verbose_json")
-        .text("timestamp_granularities[]", "word");
+        .text("model", model.to_string());
+
+    // whisper-1 supports verbose_json with word-level timestamps
+    // gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text
+    if supports_word_timestamps(model) {
+        form = form
+            .text("response_format", RESPONSE_FORMAT_VERBOSE)
+            .text("timestamp_granularities[]", TIMESTAMP_GRANULARITY);
+    } else {
+        form = form.text("response_format", RESPONSE_FORMAT_JSON);
+    }
 
     if let Some(lang) = params.languages.first() {
         form = form.text("language", lang.iso639().code().to_string());

owhisper/owhisper-client/src/adapter/openai/live.rs

Lines changed: 10 additions & 4 deletions
@@ -7,6 +7,12 @@ use super::OpenAIAdapter;
 use crate::adapter::parsing::{calculate_time_span, WordBuilder};
 use crate::adapter::RealtimeSttAdapter;
 
+// Voice Activity Detection (VAD) configuration defaults
+const VAD_DETECTION_TYPE: &str = "server_vad";
+const VAD_THRESHOLD: f32 = 0.5;
+const VAD_PREFIX_PADDING_MS: u32 = 300;
+const VAD_SILENCE_DURATION_MS: u32 = 500;
+
 impl RealtimeSttAdapter for OpenAIAdapter {
     fn provider_name(&self) -> &'static str {
         "openai"
@@ -78,10 +84,10 @@ impl RealtimeSttAdapter for OpenAIAdapter {
                     language,
                 }),
                 turn_detection: Some(TurnDetection {
-                    detection_type: "server_vad".to_string(),
-                    threshold: Some(0.5),
-                    prefix_padding_ms: Some(300),
-                    silence_duration_ms: Some(500),
+                    detection_type: VAD_DETECTION_TYPE.to_string(),
+                    threshold: Some(VAD_THRESHOLD),
+                    prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
+                    silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
                 }),
             }),
         }),
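The extracted defaults correspond to OpenAI's realtime server-VAD turn-detection settings. A hypothetical, self-contained sketch of that mapping (field names mirror the diff, but the crate's actual TurnDetection type and its serde attributes may differ; serde with the derive feature and serde_json are assumed):

```rust
use serde::Serialize;

// Defaults extracted in the diff above.
const VAD_DETECTION_TYPE: &str = "server_vad";
const VAD_THRESHOLD: f32 = 0.5;
const VAD_PREFIX_PADDING_MS: u32 = 300;
const VAD_SILENCE_DURATION_MS: u32 = 500;

// Hypothetical stand-in for the crate's TurnDetection type.
#[derive(Serialize)]
struct TurnDetection {
    #[serde(rename = "type")]
    detection_type: String,
    threshold: Option<f32>,
    prefix_padding_ms: Option<u32>,
    silence_duration_ms: Option<u32>,
}

fn default_turn_detection() -> TurnDetection {
    TurnDetection {
        detection_type: VAD_DETECTION_TYPE.to_string(),
        threshold: Some(VAD_THRESHOLD),
        prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
        silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
    }
}

fn main() {
    // Prints roughly: {"type":"server_vad","threshold":0.5,"prefix_padding_ms":300,"silence_duration_ms":500}
    println!("{}", serde_json::to_string(&default_turn_detection()).unwrap());
}
```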

owhisper/owhisper-client/src/adapter/openai/mod.rs

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,12 @@ mod live;
 
 pub(crate) const DEFAULT_WS_HOST: &str = "api.openai.com";
 pub(crate) const WS_PATH: &str = "/v1/realtime";
+
+// OpenAI STT Models:
+// - whisper-1: Legacy model, supports verbose_json with word timestamps (batch only)
+// - gpt-4o-transcribe: High quality, supports both batch (json only) and realtime
+// - gpt-4o-mini-transcribe: Cost-efficient, supports both batch (json only) and realtime
+// - gpt-4o-transcribe-diarize: Speaker diarization (batch only, not yet supported here)
 pub(crate) const DEFAULT_TRANSCRIPTION_MODEL: &str = "gpt-4o-transcribe";
 
 #[derive(Clone, Default)]
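The comment block above doubles as a small capability matrix. A hypothetical helper (not part of the commit) that restates it in code, purely as an illustration:

```rust
// Capabilities implied by the model notes above; not the crate's actual API.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct OpenAiSttCaps {
    batch: bool,
    realtime: bool,
    word_timestamps: bool,
}

fn openai_stt_caps(model: &str) -> Option<OpenAiSttCaps> {
    match model {
        "whisper-1" => Some(OpenAiSttCaps { batch: true, realtime: false, word_timestamps: true }),
        "gpt-4o-transcribe" | "gpt-4o-mini-transcribe" => {
            Some(OpenAiSttCaps { batch: true, realtime: true, word_timestamps: false })
        }
        // "gpt-4o-transcribe-diarize" is batch-only and not wired up yet.
        _ => None,
    }
}
```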
