bug: Allow passing options to useSpeechToText model.stream() (#648)

mkopcins · michaldudek · web-flow · commit 0cfc2e5d204e · 2025-10-15T12:21:45.000+02:00
## Description When using a multilingual transcription model (e.g. `WHISPER_TINY`), a language is expected, but there's no way to pass it to the `model.stream()` method. Encountered error: ``` [Error: Model is multilingual, provide a language] ``` IDE: <img width="578" height="220" alt="Screenshot 2025-10-14 at 22 21 32" src="https://github.com/user-attachments/assets/9f9ff6d1-ea4a-4571-8a64-4aed7a4200a5" /> It looks like `SpeechToTextModule.stream()` does accept options with a language, but the wrapper function in `useSpeechToText()` does not, making it impossible to use the multilingual model. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [ ] Android ### Testing instructions  ### Screenshots  ### Related issues  ### Checklist - [x] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings ### Additional notes  --------- Co-authored-by: Michał Pałys-Dudek <michal@pnd.io>
diff --git a/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/02-hooks/01-natural-language-processing/useSpeechToText.md
@@ -78,7 +78,7 @@ For more information on loading resources, take a look at [loading models](../..
 | Field                       | Type                                                                                                 | Description                                                                                                                                                                                                                                                                                              |
 | --------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `transcribe`                | `(waveform: Float32Array \| number[], options?: DecodingOptions \| undefined) => Promise<string>`    | Starts a transcription process for a given input array, which should be a waveform at 16kHz. The second argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Resolves a promise with the output transcription when the model is finished. Passing `number[]` is deprecated. |
-| `stream`                    | `() => Promise<string>`                                                                              | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses.                                                                  |
+| `stream`                    | `(options?: DecodingOptions \| undefined) => Promise<string>`                                                                              | Starts a streaming transcription process. Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream. The argument is an options object, e.g. `{ language: 'es' }` for multilingual models. Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses.                                                                  |
 | `streamInsert`              | `(waveform: Float32Array \| number[]) => void`                                                       | Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. Call this repeatedly as new audio data becomes available. Passing `number[]` is deprecated.                                                                                                                   |
 | `streamStop`                | `() => void`                                                                                         | Stops the ongoing streaming transcription process.                                                                                                                                                                                                                                                       |
 | `encode`                    | `(waveform: Float32Array \| number[]) => Promise<Float32Array>`                                      | Runs the encoding part of the model on the provided waveform. Passing `number[]` is deprecated.                                                                                                                                                                                                          |
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -1,7 +1,7 @@
 import { useEffect, useCallback, useState } from 'react';
 import { ETError, getError } from '../../Error';
 import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule';
-import { SpeechToTextModelConfig } from '../../types/stt';
+import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
 
 export const useSpeechToText = ({
   model,
@@ -65,24 +65,29 @@ export const useSpeechToText = ({
     [isReady, isGenerating, modelInstance]
   );
 
-  const stream = useCallback(async () => {
-    if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
-    if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
-    setIsGenerating(true);
-    setCommittedTranscription('');
-    setNonCommittedTranscription('');
-    let transcription = '';
-    try {
-      for await (const { committed, nonCommitted } of modelInstance.stream()) {
-        setCommittedTranscription((prev) => prev + committed);
-        setNonCommittedTranscription(nonCommitted);
-        transcription += committed;
+  const stream = useCallback(
+    async (options?: DecodingOptions) => {
+      if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+      if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
+      setIsGenerating(true);
+      setCommittedTranscription('');
+      setNonCommittedTranscription('');
+      let transcription = '';
+      try {
+        for await (const { committed, nonCommitted } of modelInstance.stream(
+          options
+        )) {
+          setCommittedTranscription((prev) => prev + committed);
+          setNonCommittedTranscription(nonCommitted);
+          transcription += committed;
+        }
+      } finally {
+        setIsGenerating(false);
       }
-    } finally {
-      setIsGenerating(false);
-    }
-    return transcription;
-  }, [isReady, isGenerating, modelInstance]);
+      return transcription;
+    },
+    [isReady, isGenerating, modelInstance]
+  );
 
   const wrapper = useCallback(
     <T extends (...args: any[]) => any>(fn: T) => {