Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions mingle-app/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ Default local endpoints:
- STT WS: `ws://127.0.0.1:3001`
- API: `http://127.0.0.1:3000`

The web client no longer hardcodes a specific STT provider.
`mingle-stt` selects the default provider through `STT_DEFAULT_MODEL`.

Default audio fixture path:

- `test-fixtures/audio/fixtures/`
Expand All @@ -51,6 +54,7 @@ You can override paths/endpoints with env vars:
MINGLE_TEST_AUDIO_FIXTURE=/absolute/path/to/file.wav
MINGLE_TEST_AUDIO_FIXTURE_DIR=/absolute/path/to/fixtures-dir
MINGLE_TEST_WS_URL=ws://127.0.0.1:3001
MINGLE_TEST_STT_MODEL=soniox
MINGLE_TEST_API_BASE_URL=http://127.0.0.1:3000
MINGLE_TEST_API_NAMESPACE=
MINGLE_TEST_EXPECTED_PHRASE="hello mingle"
Expand Down
10 changes: 9 additions & 1 deletion mingle-app/rn/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ const VERSION_POLICY_FALLBACK_COPY: Record<VersionPolicyLocale, {
type NativeSttStartPayload = {
wsUrl?: string;
sttModel?: string;
languages?: string[];
aecEnabled?: boolean;
sonioxLanguageHints?: string[];
sonioxManualFinalizeSilenceMs?: number;
Expand Down Expand Up @@ -1250,7 +1251,13 @@ function AppInner(): React.JSX.Element {
: DEFAULT_WS_URL;
const sttModel = typeof payload?.sttModel === 'string' && payload.sttModel.trim()
? payload.sttModel.trim()
: 'soniox';
: undefined;
const languages = Array.isArray(payload?.languages)
? payload.languages
.filter((language): language is string => typeof language === 'string')
.map(language => language.trim())
.filter(Boolean)
: [];
const aecEnabled = payload?.aecEnabled === true;
const sonioxLanguageHints = Array.isArray(payload?.sonioxLanguageHints)
? payload.sonioxLanguageHints
Expand All @@ -1267,6 +1274,7 @@ function AppInner(): React.JSX.Element {
await startNativeStt({
wsUrl,
sttModel,
languages,
aecEnabled,
sonioxLanguageHints,
sonioxManualFinalizeSilenceMs,
Expand Down
4 changes: 4 additions & 0 deletions mingle-app/rn/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ Regular builds that do not use devbox keep the default Xcode project values (pro

Android runtime URLs and the namespace are injected through Gradle `BuildConfig` and `NativeRuntimeConfigModule`.

The native STT bridge forwards the selected languages and lets `mingle-stt`
choose the default provider through `STT_DEFAULT_MODEL`, unless a specific
`sttModel` override is supplied for debugging.

This project was bootstrapped using [`@react-native-community/cli`](https://github.com/react-native-community/cli).

# Getting Started
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import okhttp3.Request
import okhttp3.Response
import okhttp3.WebSocket
import okhttp3.WebSocketListener
import org.json.JSONArray
import org.json.JSONObject
import java.util.concurrent.TimeUnit
import java.util.concurrent.Executors
Expand All @@ -46,7 +47,8 @@ class NativeSTTModule(

private data class StartOptions(
val wsUrl: String,
val sttModel: String,
val sttModel: String?,
val languages: List<String>,
val aecEnabled: Boolean,
val sonioxLanguageHints: List<String>,
val sonioxManualFinalizeSilenceMs: Int,
Expand Down Expand Up @@ -133,7 +135,8 @@ class NativeSTTModule(

val startOptions = StartOptions(
wsUrl = wsUrl,
sttModel = options.getString("sttModel")?.trim().orEmpty().ifEmpty { "soniox" },
sttModel = options.getString("sttModel")?.trim()?.takeIf { it.isNotEmpty() },
languages = normalizeStringArray(options.getArray("languages")),
aecEnabled = if (options.hasKey("aecEnabled")) options.getBoolean("aecEnabled") else false,
sonioxLanguageHints = normalizeStringArray(options.getArray("sonioxLanguageHints")),
sonioxManualFinalizeSilenceMs = normalizeSonioxManualFinalizeSilenceMs(
Expand Down Expand Up @@ -292,7 +295,10 @@ class NativeSTTModule(
webSocketReady = true
// Build the STT config payload. Required fields are set in one uninterrupted chain;
// optional fields are added afterwards as standalone statements. A `.put(...)` placed
// after an `if` block has no receiver and does not compile.
val config = JSONObject()
    .put("sample_rate", currentSampleRate)
    .put("languages", JSONArray(options.languages))
    .put("soniox_manual_finalize_silence_ms", options.sonioxManualFinalizeSilenceMs)
// Send `stt_model` only when an explicit override was supplied; when the key is absent
// the provider choice is left to the server.
options.sttModel?.let { config.put("stt_model", it) }
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Fix dangling .put call in Android STT config builder

After making `stt_model` optional, this block leaves a chained `.put(...)` call without a receiver: the `if` that now precedes it is a standalone statement, so the chain is broken and Kotlin compilation fails. Any RN Android build that compiles `NativeSTTModule.kt` will break here, so the config construction needs to be rewritten as separate `config.put(...)` statements (or as a single uninterrupted chain followed by the conditional).

Useful? React with 👍 / 👎.

if (options.sonioxLanguageHints.isNotEmpty()) {
config.put("soniox_language_hints", options.sonioxLanguageHints)
Expand Down
17 changes: 14 additions & 3 deletions mingle-app/rn/ios/mingle/NativeSTTModule.swift
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,8 @@ class NativeSTTModule: RCTEventEmitter {
private func startSession(
wsUrl: URL,
wsUrlString: String,
sttModel: String,
sttModel: String?,
languages: [String],
aecEnabled: Bool,
sonioxLanguageHints: [String],
sonioxManualFinalizeSilenceMs: Int,
Expand Down Expand Up @@ -842,9 +843,12 @@ class NativeSTTModule: RCTEventEmitter {

var configPayload: [String: Any] = [
"sample_rate": sampleRate,
"stt_model": sttModel,
"languages": languages,
"soniox_manual_finalize_silence_ms": sonioxManualFinalizeSilenceMs,
]
if let sttModel, !sttModel.isEmpty {
configPayload["stt_model"] = sttModel
}
if !sonioxLanguageHints.isEmpty {
configPayload["soniox_language_hints"] = sonioxLanguageHints
}
Expand Down Expand Up @@ -875,7 +879,12 @@ class NativeSTTModule: RCTEventEmitter {
return
}

let sttModel = options["sttModel"] as? String ?? "soniox"
let trimmedSttModel = (options["sttModel"] as? String)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let sttModel = trimmedSttModel?.isEmpty == false ? trimmedSttModel : nil
let languages = (options["languages"] as? [String] ?? [])
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
.filter { !$0.isEmpty }
let aecEnabled = options["aecEnabled"] as? Bool ?? false
let sonioxLanguageHints = (options["sonioxLanguageHints"] as? [String] ?? [])
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
Expand All @@ -891,6 +900,7 @@ class NativeSTTModule: RCTEventEmitter {
wsUrl: wsUrl,
wsUrlString: wsUrlString,
sttModel: sttModel,
languages: languages,
aecEnabled: aecEnabled,
sonioxLanguageHints: sonioxLanguageHints,
sonioxManualFinalizeSilenceMs: sonioxManualFinalizeSilenceMs,
Expand All @@ -909,6 +919,7 @@ class NativeSTTModule: RCTEventEmitter {
wsUrl: wsUrl,
wsUrlString: wsUrlString,
sttModel: sttModel,
languages: languages,
aecEnabled: aecEnabled,
sonioxLanguageHints: sonioxLanguageHints,
sonioxManualFinalizeSilenceMs: sonioxManualFinalizeSilenceMs,
Expand Down
1 change: 1 addition & 0 deletions mingle-app/rn/src/nativeStt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { NativeEventEmitter, NativeModules, Platform } from 'react-native';
type NativeSttStartOptions = {
wsUrl: string;
sttModel?: string;
languages?: string[];
aecEnabled?: boolean;
sonioxLanguageHints?: string[];
sonioxManualFinalizeSilenceMs?: number;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,20 +165,31 @@ describe('use-realtime-stt pure logic', () => {
previousSelectionSignature: buildLanguageSelectionSignature(['en', 'ko']),
nextSelectionSignature: buildLanguageSelectionSignature(['en', 'ja']),
connectionStatus: 'ready',
activeSttModel: 'soniox',
sonioxLanguageHintsEnabled: true,
})).toBe(true)

expect(shouldRestartSttForLanguageHintChange({
previousSelectionSignature: buildLanguageSelectionSignature(['en', 'ko']),
nextSelectionSignature: buildLanguageSelectionSignature(['en', 'ja']),
connectionStatus: 'ready',
activeSttModel: 'gladia',
sonioxLanguageHintsEnabled: true,
})).toBe(false)

expect(shouldRestartSttForLanguageHintChange({
previousSelectionSignature: buildLanguageSelectionSignature(['en', 'ko']),
nextSelectionSignature: buildLanguageSelectionSignature(['en', 'ja']),
connectionStatus: 'ready',
activeSttModel: 'soniox',
sonioxLanguageHintsEnabled: false,
})).toBe(false)

expect(shouldRestartSttForLanguageHintChange({
previousSelectionSignature: buildLanguageSelectionSignature(['en', 'ko']),
nextSelectionSignature: buildLanguageSelectionSignature(['en', 'ko']),
connectionStatus: 'ready',
activeSttModel: 'soniox',
sonioxLanguageHintsEnabled: true,
})).toBe(false)
})
Expand Down
23 changes: 20 additions & 3 deletions mingle-app/src/components/LivePhoneDemo/use-realtime-stt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,10 @@ export function shouldRestartSttForLanguageHintChange(input: {
previousSelectionSignature: string
nextSelectionSignature: string
connectionStatus: ConnectionStatus
activeSttModel: string | null
sonioxLanguageHintsEnabled: boolean
}): boolean {
if (input.activeSttModel !== 'soniox') return false
if (!input.sonioxLanguageHintsEnabled) return false
if (input.previousSelectionSignature === input.nextSelectionSignature) return false
return input.connectionStatus === 'ready'
Expand All @@ -133,7 +135,8 @@ type NativeSttStartCommand = {
type: 'native_stt_start'
payload: {
wsUrl: string
sttModel: string
sttModel?: string
languages: string[]
aecEnabled: boolean
sonioxLanguageHints: string[]
sonioxManualFinalizeSilenceMs: number
Expand Down Expand Up @@ -1598,6 +1601,7 @@ export default function useRealtimeSTT({
const previousLanguageSelectionSignatureRef = useRef(buildLanguageSelectionSignature(languages))
const languageChangeRestartTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
const pendingLanguageChangeRestartRef = useRef(false)
const activeSttModelRef = useRef<string | null>(null)
const sonioxLanguageHintsEnabledRef = useRef(false)

const getCurrentTargetLanguages = useCallback(() => targetLanguagesRef.current, [])
Expand Down Expand Up @@ -1876,6 +1880,8 @@ export default function useRealtimeSTT({
isStoppingRef.current = false
hasActiveSessionRef.current = false
pendingLanguageChangeRestartRef.current = false
activeSttModelRef.current = null
sonioxLanguageHintsEnabledRef.current = false
clearSpeakerAvatarSession()
cleanup()
setConnectionStatus('idle')
Expand Down Expand Up @@ -2715,8 +2721,13 @@ export default function useRealtimeSTT({

const handleSttServerMessage = useCallback((message: Record<string, unknown>) => {
if (message.status === 'ready') {
const activeSttModel = typeof message.stt_model === 'string' && message.stt_model.trim()
? message.stt_model.trim()
: null
activeSttModelRef.current = activeSttModel
sonioxLanguageHintsEnabledRef.current = message.soniox_language_hints_enabled === true
logSttDebug('transport.ready', {
activeSttModel,
sonioxLanguageHintsEnabled: sonioxLanguageHintsEnabledRef.current,
})
setConnectionStatus('ready')
Expand Down Expand Up @@ -2989,6 +3000,8 @@ export default function useRealtimeSTT({
turnStartedAtRef.current = null
recentFinalizedUtteranceRef.current = null
hasActiveSessionRef.current = false
activeSttModelRef.current = null
sonioxLanguageHintsEnabledRef.current = false
pendingTurnsBySpeakerRef.current = {}
clearAllPendingTurnTranslationRuntime()
activePartialSpeakerRef.current = null
Expand All @@ -3007,7 +3020,7 @@ export default function useRealtimeSTT({
type: 'native_stt_start',
payload: {
wsUrl: getWsUrl(),
sttModel: 'soniox',
languages: targetLanguages,
aecEnabled: enableAec,
sonioxLanguageHints,
sonioxManualFinalizeSilenceMs,
Expand Down Expand Up @@ -3054,7 +3067,7 @@ export default function useRealtimeSTT({
const sonioxLanguageHints = buildSonioxLanguageHints(targetLanguages)
const config = {
sample_rate: context.sampleRate,
stt_model: 'soniox',
languages: targetLanguages,
soniox_language_hints: sonioxLanguageHints,
soniox_manual_finalize_silence_ms: sonioxManualFinalizeSilenceMs,
}
Expand Down Expand Up @@ -3163,6 +3176,7 @@ export default function useRealtimeSTT({
previousSelectionSignature: previousSignature,
nextSelectionSignature: currentSignature,
connectionStatus: connectionStatusRef.current,
activeSttModel: activeSttModelRef.current,
sonioxLanguageHintsEnabled: sonioxLanguageHintsEnabledRef.current,
})) {
return
Expand All @@ -3171,15 +3185,18 @@ export default function useRealtimeSTT({
logSttDebug('recording.languages.restart_scheduled', {
previousSignature,
currentSignature,
activeSttModel: activeSttModelRef.current,
sonioxLanguageHintsEnabled: sonioxLanguageHintsEnabledRef.current,
})
clearLanguageChangeRestartTimer()
languageChangeRestartTimerRef.current = setTimeout(() => {
languageChangeRestartTimerRef.current = null
if (activeSttModelRef.current !== 'soniox') return
if (!sonioxLanguageHintsEnabledRef.current) return
if (connectionStatusRef.current !== 'ready') return
pendingLanguageChangeRestartRef.current = true
logSttDebug('recording.languages.restart_begin', {
activeSttModel: activeSttModelRef.current,
currentSignature: buildLanguageSelectionSignature(targetLanguagesRef.current),
})
void stopRecordingGracefully(false, 'language_hint_change')
Expand Down
2 changes: 1 addition & 1 deletion mingle-ios/MingleIOS/Core/Network/STTWebSocketClient.swift
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import Foundation
/// Configuration payload encoded and handed to the STT socket client on connect.
///
/// Property names are deliberately snake_case: the synthesized `Encodable`
/// conformance uses them verbatim as the JSON keys.
struct STTConfigPayload: Encodable {
    /// Sample rate of the captured audio.
    let sample_rate: Double
    /// Languages selected for the session.
    let languages: [String]
    /// Optional STT provider override.
    ///
    /// Declared as `var` with a default so the memberwise initializer exposes it
    /// as an optional, defaulted parameter (a `let` with an initial value is left
    /// out of the initializer and could never be anything but `nil`). When `nil`,
    /// the synthesized encoder omits the key entirely.
    var stt_model: String? = nil
    /// Forwarded as-is to the server as `lang_hints_strict`.
    let lang_hints_strict: Bool
}

Expand Down
1 change: 0 additions & 1 deletion mingle-ios/MingleIOS/ViewModels/AppViewModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ final class AppViewModel: ObservableObject {
let config = STTConfigPayload(
sample_rate: audioCaptureService.sampleRate,
languages: languages,
stt_model: "soniox",
lang_hints_strict: true
)
sttSocketClient.connect(url: socketURL, config: config)
Expand Down
10 changes: 10 additions & 0 deletions mingle-stt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Standalone STT relay server for Mingle.
## Environment Variables

- `PORT` (default: `3001`)
- `STT_DEFAULT_MODEL` (optional, default: `soniox`)
- `SONIOX_API_KEY`
- `GLADIA_API_KEY` (optional, for gladia modes)
- `DEEPGRAM_API_KEY` (optional, for deepgram modes)
Expand All @@ -26,6 +27,15 @@ server requests a manual finalize.
`mingle-stt` loads `.env.local` first, then `.env` in this directory.
If these variables are missing, it safely falls back to the defaults above.

Supported values for `STT_DEFAULT_MODEL`:

- `soniox`
- `gladia`
- `gladia-stt`
- `deepgram`
- `deepgram-multi`
- `fireworks`

## Railway

This folder includes `railway.json` and is intended to be used as the Railway
Expand Down
21 changes: 21 additions & 0 deletions mingle-stt/stt-model.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Canonical (lowercase) STT model identifiers recognised by this module.
 * Declared `as const` so the `SttModel` union below is derived from the list.
 */
export const SUPPORTED_STT_MODELS = [
  'gladia',
  'gladia-stt',
  'deepgram',
  'deepgram-multi',
  'fireworks',
  'soniox',
] as const;

/** Union of every identifier in `SUPPORTED_STT_MODELS`. */
export type SttModel = (typeof SUPPORTED_STT_MODELS)[number];

/**
 * Type guard: true only when `value` is exactly one of the canonical
 * identifiers. The match is strict (no trimming or case folding) so that a
 * narrowed value is always a real member of `SttModel`.
 */
export function isSttModel(value: string): value is SttModel {
  return (SUPPORTED_STT_MODELS as readonly string[]).includes(value);
}

/**
 * Resolves an untrusted value (env var, client config field, ...) to a
 * supported model.
 *
 * The input is trimmed and lowercased before lookup, so values such as
 * `"Soniox"` or `" DEEPGRAM "` resolve to their canonical identifier instead
 * of silently falling back.
 *
 * @param input    Candidate value of any type.
 * @param fallback Model returned when `input` is not a string, is blank, or
 *                 does not name a supported model.
 * @returns The canonical model identifier, or `fallback`.
 */
export function resolveSttModel(input: unknown, fallback: SttModel): SttModel {
  if (typeof input !== 'string') return fallback;
  // All canonical identifiers are lowercase, so folding case cannot turn a
  // previously valid value into an invalid one. A blank string fails the
  // guard and falls through to `fallback`.
  const normalized = input.trim().toLowerCase();
  return isSttModel(normalized) ? normalized : fallback;
}
Loading
Loading