Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion apps/desktop/src/components/settings/ai/stt/shared.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ export const displayModelId = (model: string) => {
return "Whisper 1";
}

if (model === "gpt-4o-transcribe") {
return "GPT-4o Transcribe";
}

if (model === "gpt-4o-mini-transcribe") {
return "GPT-4o mini Transcribe";
}

if (model.startsWith("am-")) {
const am = model as AmModel;
if (am == "am-parakeet-v2") {
Expand Down Expand Up @@ -153,7 +161,7 @@ export const PROVIDERS = [
badge: "Beta",
icon: <OpenAI size={16} />,
baseUrl: "https://api.openai.com/v1",
models: ["whisper-1"],
models: ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"],
requiresPro: false,
},
{
Expand Down
35 changes: 19 additions & 16 deletions owhisper/owhisper-client/src/adapter/openai/batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ use super::OpenAIAdapter;

// OpenAI REST API base URL.
// NOTE(review): presumably the fallback when no API base is configured — usage is not visible here.
const DEFAULT_API_BASE: &str = "https://api.openai.com/v1";
// Model name; presumably applied when the caller does not pick one — TODO confirm against the caller.
const DEFAULT_MODEL: &str = "whisper-1";
// Sent as the `response_format` form field when the model supports word timestamps.
const RESPONSE_FORMAT_VERBOSE: &str = "verbose_json";
// Sent as the `response_format` form field for every other model.
const RESPONSE_FORMAT_JSON: &str = "json";
// Sent as the `timestamp_granularities[]` form field together with the verbose format.
const TIMESTAMP_GRANULARITY: &str = "word";

/// Reports whether `model` can be asked for `verbose_json` output with
/// word-level timestamps.
///
/// The comparison is exact and case-sensitive: only `whisper-1` qualifies.
fn supports_word_timestamps(model: &str) -> bool {
    matches!(model, "whisper-1")
}

impl BatchSttAdapter for OpenAIAdapter {
fn transcribe_file<'a, P: AsRef<Path> + Send + 'a>(
Expand All @@ -33,17 +41,6 @@ struct OpenAIWord {
end: f64,
}

// One element of the `segments` list deserialized as part of `OpenAIVerboseResponse`.
// NOTE(review): field meanings below are inferred from their names — confirm against the
// OpenAI transcription response schema.
#[derive(Debug, serde::Deserialize)]
struct OpenAISegment {
// Required during deserialization; the dead_code allowance suggests it is never read afterwards.
#[allow(dead_code)]
id: i32,
// Required during deserialization; the dead_code allowance suggests it is never read afterwards.
#[allow(dead_code)]
seek: i32,
// Segment start time — presumably in seconds; TODO confirm.
start: f64,
// Segment end time — presumably in seconds; TODO confirm.
end: f64,
// Text for this segment.
text: String,
}

#[derive(Debug, serde::Deserialize)]
struct OpenAIVerboseResponse {
#[allow(dead_code)]
Expand All @@ -54,8 +51,6 @@ struct OpenAIVerboseResponse {
text: String,
#[serde(default)]
words: Vec<OpenAIWord>,
#[serde(default)]
segments: Vec<OpenAISegment>,
}

async fn do_transcribe_file(
Expand Down Expand Up @@ -91,9 +86,17 @@ async fn do_transcribe_file(

let mut form = Form::new()
.part("file", file_part)
.text("model", model.to_string())
.text("response_format", "verbose_json")
.text("timestamp_granularities[]", "word");
.text("model", model.to_string());

// whisper-1 supports verbose_json with word-level timestamps
// gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text
if supports_word_timestamps(model) {
form = form
.text("response_format", RESPONSE_FORMAT_VERBOSE)
.text("timestamp_granularities[]", TIMESTAMP_GRANULARITY);
} else {
form = form.text("response_format", RESPONSE_FORMAT_JSON);
}

if let Some(lang) = params.languages.first() {
form = form.text("language", lang.iso639().code().to_string());
Expand Down
14 changes: 10 additions & 4 deletions owhisper/owhisper-client/src/adapter/openai/live.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ use super::OpenAIAdapter;
use crate::adapter::parsing::{calculate_time_span, WordBuilder};
use crate::adapter::RealtimeSttAdapter;

// Voice Activity Detection (VAD) configuration defaults.
// Each value below is passed into the `TurnDetection` settings built for the realtime session.
// Supplied as `detection_type`.
const VAD_DETECTION_TYPE: &str = "server_vad";
// Supplied as `threshold`. NOTE(review): valid range and exact meaning are defined by the API — confirm.
const VAD_THRESHOLD: f32 = 0.5;
// Supplied as `prefix_padding_ms` (milliseconds, per the name).
const VAD_PREFIX_PADDING_MS: u32 = 300;
// Supplied as `silence_duration_ms` (milliseconds, per the name).
const VAD_SILENCE_DURATION_MS: u32 = 500;

impl RealtimeSttAdapter for OpenAIAdapter {
fn provider_name(&self) -> &'static str {
"openai"
Expand Down Expand Up @@ -78,10 +84,10 @@ impl RealtimeSttAdapter for OpenAIAdapter {
language,
}),
turn_detection: Some(TurnDetection {
detection_type: "server_vad".to_string(),
threshold: Some(0.5),
prefix_padding_ms: Some(300),
silence_duration_ms: Some(500),
detection_type: VAD_DETECTION_TYPE.to_string(),
threshold: Some(VAD_THRESHOLD),
prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
}),
}),
}),
Expand Down
6 changes: 6 additions & 0 deletions owhisper/owhisper-client/src/adapter/openai/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ mod live;

// Host name for the OpenAI WebSocket endpoint.
// NOTE(review): presumably joined with `WS_PATH` to form the realtime URL — usage is not visible here.
pub(crate) const DEFAULT_WS_HOST: &str = "api.openai.com";
// Request path of the realtime endpoint on that host.
pub(crate) const WS_PATH: &str = "/v1/realtime";

// OpenAI STT Models:
// - whisper-1: Legacy model, supports verbose_json with word timestamps (batch only)
// - gpt-4o-transcribe: High quality, supports both batch (json only) and realtime
// - gpt-4o-mini-transcribe: Cost-efficient, supports both batch (json only) and realtime
// - gpt-4o-transcribe-diarize: Speaker diarization (batch only, not yet supported here)
//
// Crate-wide default transcription model.
// NOTE(review): the batch adapter keeps its own `DEFAULT_MODEL` ("whisper-1"); presumably this
// constant serves the realtime path — confirm where it is consumed.
pub(crate) const DEFAULT_TRANSCRIPTION_MODEL: &str = "gpt-4o-transcribe";

#[derive(Clone, Default)]
Expand Down