
Commit ae2a949

Refactor OpenAI adapter and add model support (#2128)
- Remove dead code (OpenAISegment struct, segments field)
- Extract magic values to constants (VAD config, response formats)
- Add model-aware request building for batch API
  - whisper-1: uses verbose_json with word timestamps
  - gpt-4o-transcribe/gpt-4o-mini-transcribe: use json format
- Update UI to show all OpenAI STT models
- Add model documentation comments

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 5a56b7a commit ae2a949

4 files changed (+44 lines, −21 lines)

apps/desktop/src/components/settings/ai/stt/shared.tsx

Lines changed: 9 additions & 1 deletion
@@ -44,6 +44,14 @@ export const displayModelId = (model: string) => {
     return "Whisper 1";
   }
 
+  if (model === "gpt-4o-transcribe") {
+    return "GPT-4o Transcribe";
+  }
+
+  if (model === "gpt-4o-mini-transcribe") {
+    return "GPT-4o mini Transcribe";
+  }
+
   if (model.startsWith("am-")) {
     const am = model as AmModel;
     if (am == "am-parakeet-v2") {
@@ -153,7 +161,7 @@ export const PROVIDERS = [
     badge: "Beta",
     icon: <OpenAI size={16} />,
     baseUrl: "https://api.openai.com/v1",
-    models: ["whisper-1"],
+    models: ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"],
     requiresPro: false,
   },
   {

owhisper/owhisper-client/src/adapter/openai/batch.rs

Lines changed: 19 additions & 16 deletions
@@ -11,6 +11,14 @@ use super::OpenAIAdapter;
 
 const DEFAULT_API_BASE: &str = "https://api.openai.com/v1";
 const DEFAULT_MODEL: &str = "whisper-1";
+const RESPONSE_FORMAT_VERBOSE: &str = "verbose_json";
+const RESPONSE_FORMAT_JSON: &str = "json";
+const TIMESTAMP_GRANULARITY: &str = "word";
+
+// Models that support verbose_json with word-level timestamps
+fn supports_word_timestamps(model: &str) -> bool {
+    model == "whisper-1"
+}
 
 impl BatchSttAdapter for OpenAIAdapter {
     fn transcribe_file<'a, P: AsRef<Path> + Send + 'a>(
@@ -33,17 +41,6 @@ struct OpenAIWord {
     end: f64,
 }
 
-#[derive(Debug, serde::Deserialize)]
-struct OpenAISegment {
-    #[allow(dead_code)]
-    id: i32,
-    #[allow(dead_code)]
-    seek: i32,
-    start: f64,
-    end: f64,
-    text: String,
-}
-
 #[derive(Debug, serde::Deserialize)]
 struct OpenAIVerboseResponse {
     #[allow(dead_code)]
@@ -54,8 +51,6 @@ struct OpenAIVerboseResponse {
     text: String,
     #[serde(default)]
     words: Vec<OpenAIWord>,
-    #[serde(default)]
-    segments: Vec<OpenAISegment>,
 }
 
 async fn do_transcribe_file(
@@ -91,9 +86,17 @@ async fn do_transcribe_file(
 
     let mut form = Form::new()
         .part("file", file_part)
-        .text("model", model.to_string())
-        .text("response_format", "verbose_json")
-        .text("timestamp_granularities[]", "word");
+        .text("model", model.to_string());
+
+    // whisper-1 supports verbose_json with word-level timestamps
+    // gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text
+    if supports_word_timestamps(model) {
+        form = form
+            .text("response_format", RESPONSE_FORMAT_VERBOSE)
+            .text("timestamp_granularities[]", TIMESTAMP_GRANULARITY);
+    } else {
+        form = form.text("response_format", RESPONSE_FORMAT_JSON);
+    }
 
     if let Some(lang) = params.languages.first() {
         form = form.text("language", lang.iso639().code().to_string());

owhisper/owhisper-client/src/adapter/openai/live.rs

Lines changed: 10 additions & 4 deletions
@@ -7,6 +7,12 @@ use super::OpenAIAdapter;
 use crate::adapter::parsing::{calculate_time_span, WordBuilder};
 use crate::adapter::RealtimeSttAdapter;
 
+// Voice Activity Detection (VAD) configuration defaults
+const VAD_DETECTION_TYPE: &str = "server_vad";
+const VAD_THRESHOLD: f32 = 0.5;
+const VAD_PREFIX_PADDING_MS: u32 = 300;
+const VAD_SILENCE_DURATION_MS: u32 = 500;
+
 impl RealtimeSttAdapter for OpenAIAdapter {
     fn provider_name(&self) -> &'static str {
         "openai"
@@ -78,10 +84,10 @@ impl RealtimeSttAdapter for OpenAIAdapter {
                     language,
                 }),
                 turn_detection: Some(TurnDetection {
-                    detection_type: "server_vad".to_string(),
-                    threshold: Some(0.5),
-                    prefix_padding_ms: Some(300),
-                    silence_duration_ms: Some(500),
+                    detection_type: VAD_DETECTION_TYPE.to_string(),
+                    threshold: Some(VAD_THRESHOLD),
+                    prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
+                    silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
                 }),
             }),
         }),
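The extracted defaults correspond to OpenAI's realtime server-VAD turn-detection settings. A hypothetical, self-contained sketch of that mapping (field names mirror the diff, but the crate's actual TurnDetection type and its serde attributes may differ; serde with the derive feature and serde_json are assumed):

```rust
use serde::Serialize;

// Defaults extracted in the diff above.
const VAD_DETECTION_TYPE: &str = "server_vad";
const VAD_THRESHOLD: f32 = 0.5;
const VAD_PREFIX_PADDING_MS: u32 = 300;
const VAD_SILENCE_DURATION_MS: u32 = 500;

// Hypothetical stand-in for the crate's TurnDetection type.
#[derive(Serialize)]
struct TurnDetection {
    #[serde(rename = "type")]
    detection_type: String,
    threshold: Option<f32>,
    prefix_padding_ms: Option<u32>,
    silence_duration_ms: Option<u32>,
}

fn default_turn_detection() -> TurnDetection {
    TurnDetection {
        detection_type: VAD_DETECTION_TYPE.to_string(),
        threshold: Some(VAD_THRESHOLD),
        prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
        silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
    }
}

fn main() {
    // Prints roughly: {"type":"server_vad","threshold":0.5,"prefix_padding_ms":300,"silence_duration_ms":500}
    println!("{}", serde_json::to_string(&default_turn_detection()).unwrap());
}
```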

owhisper/owhisper-client/src/adapter/openai/mod.rs

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,12 @@ mod live;
 
 pub(crate) const DEFAULT_WS_HOST: &str = "api.openai.com";
 pub(crate) const WS_PATH: &str = "/v1/realtime";
+
+// OpenAI STT Models:
+// - whisper-1: Legacy model, supports verbose_json with word timestamps (batch only)
+// - gpt-4o-transcribe: High quality, supports both batch (json only) and realtime
+// - gpt-4o-mini-transcribe: Cost-efficient, supports both batch (json only) and realtime
+// - gpt-4o-transcribe-diarize: Speaker diarization (batch only, not yet supported here)
 pub(crate) const DEFAULT_TRANSCRIPTION_MODEL: &str = "gpt-4o-transcribe";
 
 #[derive(Clone, Default)]
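The comment block above doubles as a small capability matrix. A hypothetical helper (not part of the commit) that restates it in code, purely as an illustration:

```rust
// Capabilities implied by the model notes above; not the crate's actual API.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct OpenAiSttCaps {
    batch: bool,
    realtime: bool,
    word_timestamps: bool,
}

fn openai_stt_caps(model: &str) -> Option<OpenAiSttCaps> {
    match model {
        "whisper-1" => Some(OpenAiSttCaps { batch: true, realtime: false, word_timestamps: true }),
        "gpt-4o-transcribe" | "gpt-4o-mini-transcribe" => {
            Some(OpenAiSttCaps { batch: true, realtime: true, word_timestamps: false })
        }
        // "gpt-4o-transcribe-diarize" is batch-only and not wired up yet.
        _ => None,
    }
}
```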
