Commit 39831d9

feat: implement whisper-streaming (#517)
## Description

Changes:
- Implement whisper-streaming algorithm
- Change SpeechToTextModule API and useSpeechToText API
- Add more whisper models
- Drop moonshine support
- Rename exported SpeechToText const objects
- Update example apps

### Introduces a breaking change?

- [x] Yes
- [ ] No

### Type of change

- [x] Bug fix (change which fixes an issue)
- [x] New feature (change which adds functionality)
- [x] Documentation update (improves or adds clarity to existing documentation)
- [x] Other (chores, tests, code style improvements etc.)

### Tested on

- [x] iOS
- [x] Android

### Checklist

- [x] I have performed a self-review of my code
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have updated the documentation accordingly
- [x] My changes generate no new warnings
1 parent b6c5884 commit 39831d9

25 files changed: +1307 −1359 lines changed
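For reviewers, the breaking API change boils down to this: the single `streamingTranscribe(STREAMING_ACTION, …)` entry point is replaced by three calls — `stream()` starts a session and resolves with the final transcription, `streamInsert()` feeds audio chunks into it, and `streamStop()` ends it — and the `sequence` field gives way to `committedTranscription`/`nonCommittedTranscription`. A minimal sketch wiring these together, assuming only the hook surface visible in the diffs below; `useStreamingTranscription` and `subscribeToAudio` are hypothetical scaffolding, not library APIs:

```tsx
import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';

// `subscribeToAudio` is a stand-in for whatever delivers PCM chunks
// (LiveAudioStream, AudioRecorder, ...) — not part of the library.
export function useStreamingTranscription(
  subscribeToAudio: (onChunk: (chunk: number[]) => void) => void
) {
  const stt = useSpeechToText({ model: WHISPER_TINY_EN });

  const start = async () => {
    // Feed audio into the active session as chunks arrive.
    subscribeToAudio((chunk) => stt.streamInsert(chunk));
    // Pending until streamStop() is called; resolves with the
    // final transcription of the whole session.
    const transcription = await stt.stream();
    return transcription;
  };

  // Ends the session and lets the stream() promise above resolve.
  const stop = () => stt.streamStop();

  return { start, stop, stt };
}
```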

.cspell-wordlist.txt

Lines changed: 6 additions & 1 deletion

@@ -55,4 +55,9 @@ wordlist
 jitpack
 coreml
 mobilenetv
-flac
+flac
+startoftranscript
+endoftext
+softmax
+logit
+logits

apps/llm/app/voice_chat/index.tsx

Lines changed: 11 additions & 15 deletions

@@ -12,11 +12,10 @@ import {
 import SWMIcon from '../../assets/icons/swm_icon.svg';
 import Spinner from 'react-native-loading-spinner-overlay';
 import {
-  STREAMING_ACTION,
   useSpeechToText,
   useLLM,
   QWEN3_0_6B_QUANTIZED,
-  MOONSHINE_TINY,
+  WHISPER_TINY_EN,
 } from 'react-native-executorch';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
 import MicIcon from '../../assets/icons/mic_icon.svg';
@@ -69,9 +68,7 @@ function VoiceChatScreen() {
 
   const llm = useLLM({ model: QWEN3_0_6B_QUANTIZED });
   const speechToText = useSpeechToText({
-    model: MOONSHINE_TINY,
-    windowSize: 3,
-    overlapSeconds: 1.2,
+    model: WHISPER_TINY_EN,
   });
 
   useEffect(() => {
@@ -80,24 +77,20 @@
 
   const onChunk = (data: string) => {
     const float32Chunk = float32ArrayFromPCMBinaryBuffer(data);
-    speechToText.streamingTranscribe(
-      STREAMING_ACTION.DATA,
-      Array.from(float32Chunk)
-    );
+    speechToText.streamInsert(Array.from(float32Chunk));
   };
 
   const handleRecordPress = async () => {
     if (isRecording) {
       setIsRecording(false);
       LiveAudioStream.stop();
       messageRecorded.current = true;
-      await llm.sendMessage(
-        await speechToText.streamingTranscribe(STREAMING_ACTION.STOP)
-      );
+      speechToText.streamStop();
     } else {
       setIsRecording(true);
       startStreamingAudio(audioStreamOptions, onChunk);
-      await speechToText.streamingTranscribe(STREAMING_ACTION.START);
+      const transcription = await speechToText.stream();
+      await llm.sendMessage(transcription);
     }
   };
 
@@ -117,14 +110,17 @@
         <SWMIcon width={45} height={45} />
         <Text style={styles.textModelName}>Qwen 3 x Moonshine</Text>
       </View>
-      {llm.messageHistory.length || speechToText.sequence ? (
+      {llm.messageHistory.length || speechToText.committedTranscription ? (
         <View style={styles.chatContainer}>
           <Messages
             chatHistory={
               speechToText.isGenerating
                 ? [
                     ...llm.messageHistory,
-                    { role: 'user', content: speechToText.sequence },
+                    {
+                      role: 'user',
+                      content: speechToText.committedTranscription,
+                    },
                  ]
                : llm.messageHistory
            }
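One behavioral change worth noting in this screen: the transcription is no longer obtained from the STOP action. Instead, `stream()` is awaited in the branch that *starts* recording, stays pending while `streamInsert()` feeds it audio, and resolves when `streamStop()` fires in the stop branch — only then does the result reach `llm.sendMessage`. A condensed restatement of that lifecycle; the `SpeechToText`/`Llm` type shapes are assumptions inferred from the calls in this diff:

```tsx
// Assumed shapes, inferred from the calls visible above.
type SpeechToText = { stream: () => Promise<string>; streamStop: () => void };
type Llm = { sendMessage: (msg: string) => Promise<void> };

async function onRecordToggle(
  isRecording: boolean,
  speechToText: SpeechToText,
  llm: Llm
) {
  if (isRecording) {
    // Stop branch: synchronously ends the session, which resolves
    // the stream() promise pending in the start branch below.
    speechToText.streamStop();
  } else {
    // Start branch: stream() stays pending while audio chunks are
    // inserted elsewhere, then resolves with the transcription.
    const transcription = await speechToText.stream();
    await llm.sendMessage(transcription);
  }
}
```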

apps/speech-to-text/screens/SpeechToTextScreen.tsx

Lines changed: 40 additions & 51 deletions

@@ -1,4 +1,4 @@
-import React, { useRef } from 'react';
+import React, { useEffect, useRef, useState } from 'react';
 import {
   Text,
   View,
@@ -10,11 +10,7 @@ import {
   Platform,
 } from 'react-native';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
-import {
-  STREAMING_ACTION,
-  useSpeechToText,
-  WHISPER_TINY,
-} from 'react-native-executorch';
+import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import {
   AudioManager,
@@ -27,28 +23,33 @@ import DeviceInfo from 'react-native-device-info';
 
 const isSimulator = DeviceInfo.isEmulatorSync();
 
-const SAMPLE_RATE = 16000;
-const AUDIO_LENGTH_SECONDS = 1;
-const BUFFER_LENGTH = SAMPLE_RATE * AUDIO_LENGTH_SECONDS;
-
 export const SpeechToTextScreen = () => {
   const model = useSpeechToText({
-    model: WHISPER_TINY,
-    windowSize: 3,
-    overlapSeconds: 1.2,
+    model: WHISPER_TINY_EN,
   });
 
-  const [audioURL, setAudioURL] = React.useState('');
-  const [liveTranscribing, setLiveTranscribing] = React.useState(false);
+  const [transcription, setTranscription] = useState('');
+  const [audioURL, setAudioURL] = useState('');
+  const [liveTranscribing, setLiveTranscribing] = useState(false);
   const scrollViewRef = useRef<ScrollView>(null);
 
-  const recorder = useRef(
-    new AudioRecorder({
-      sampleRate: SAMPLE_RATE,
-      bufferLengthInSamples: BUFFER_LENGTH,
-    })
+  const [recorder] = useState(
+    () =>
+      new AudioRecorder({
+        sampleRate: 16000,
+        bufferLengthInSamples: 1600,
+      })
   );
 
+  useEffect(() => {
+    AudioManager.setAudioSessionOptions({
+      iosCategory: 'playAndRecord',
+      iosMode: 'spokenAudio',
+      iosOptions: ['allowBluetooth', 'defaultToSpeaker'],
+    });
+    AudioManager.requestRecordingPermissions();
+  }, []);
+
   const handleTranscribeFromURL = async () => {
     if (!audioURL.trim()) {
       console.warn('Please provide a valid audio file URL');
@@ -60,13 +61,13 @@ export const SpeechToTextScreen = () => {
       FileSystem.cacheDirectory + 'audio_file'
     );
 
-    const audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });
+    const audioContext = new AudioContext({ sampleRate: 16000 });
 
     try {
       const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
       const audioBuffer = decodedAudioData.getChannelData(0);
       const audioArray = Array.from(audioBuffer);
-      await model.transcribe(audioArray);
+      setTranscription(await model.transcribe(audioArray));
     } catch (error) {
       console.error('Error decoding audio data', error);
       console.warn('Note: Supported file formats: mp3, wav, flac');
@@ -76,40 +77,24 @@
 
   const handleStartTranscribeFromMicrophone = async () => {
     setLiveTranscribing(true);
+    setTranscription('');
+    recorder.onAudioReady(async ({ buffer }) => {
+      const bufferArray = Array.from(buffer.getChannelData(0));
+      model.streamInsert(bufferArray);
+    });
+    recorder.start();
 
     try {
-      await model.streamingTranscribe(STREAMING_ACTION.START);
-      console.log('Live transcription started');
+      await model.stream();
     } catch (error) {
-      console.error('Error starting live transcription:', error);
+      console.error('Error during live transcription:', error);
     }
-
-    AudioManager.setAudioSessionOptions({
-      iosCategory: 'playAndRecord',
-      iosMode: 'spokenAudio',
-      iosOptions: ['allowBluetooth', 'defaultToSpeaker'],
-    });
-
-    recorder.current.onAudioReady(async ({ buffer }) => {
-      const bufferArray = Array.from(buffer.getChannelData(0));
-      try {
-        model.streamingTranscribe(STREAMING_ACTION.DATA, bufferArray);
-      } catch (error) {
-        console.error('Error during live transcription:', error);
-      }
-    });
-
-    recorder.current.start();
   };
 
   const handleStopTranscribeFromMicrophone = async () => {
-    recorder.current.stop();
-    try {
-      await model.streamingTranscribe(STREAMING_ACTION.STOP);
-      console.log('Live transcription stopped');
-    } catch (error) {
-      console.error('Error stopping transcription:', error);
-    }
+    recorder.stop();
+    model.streamStop();
+    console.log('Live transcription stopped');
    setLiveTranscribing(false);
   };
 
@@ -137,7 +122,6 @@ export const SpeechToTextScreen = () => {
       </View>
 
       <View style={styles.statusContainer}>
-        <Text>Model: {WHISPER_TINY.modelName}</Text>
         <Text>Status: {getModelStatus()}</Text>
       </View>
 
@@ -150,7 +134,12 @@
             scrollViewRef.current?.scrollToEnd({ animated: true })
           }
         >
-          <Text>{model.sequence}</Text>
+          <Text>
+            {transcription !== ''
+              ? transcription
+              : model.committedTranscription +
+                model.nonCommittedTranscription}
+          </Text>
         </ScrollView>
       </View>
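The render change at the bottom leans on the two new fields the hook exposes: `committedTranscription` and `nonCommittedTranscription`. The naming suggests text the streaming algorithm has finalized versus a provisional tail that later audio may still revise; the diff does not show the internals, so treat that reading as an assumption. A minimal display sketch using only the fields that appear in this screen:

```tsx
import React from 'react';
import { Text } from 'react-native';

type Props = {
  committedTranscription: string;
  nonCommittedTranscription: string;
};

// Committed text is stable; the non-committed tail may still change
// between model passes, so it is simply appended after it.
export function LiveTranscript({
  committedTranscription,
  nonCommittedTranscription,
}: Props) {
  return <Text>{committedTranscription + nonCommittedTranscription}</Text>;
}
```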
