This guide covers how to use speech recognition at runtime in your Unity project. For model configuration and import, see ASR Models Setup.
The ASR system provides two modes — offline (file recognition) and online (streaming):
IAsrService (offline)
|
+-- AsrService (POCO, no MonoBehaviour)
|
+-- AsrEngine (native OfflineRecognizer pool)
IOnlineAsrService (streaming)
|
+-- OnlineAsrService (POCO, no MonoBehaviour)
|
+-- OnlineAsrEngine (native OnlineRecognizer)
MicrophoneSource (POCO, microphone capture with circular buffer)
| Approach | When to use |
|---|---|
| AsrOrchestrator / OnlineAsrOrchestrator (MonoBehaviour) | Prototyping, no DI framework |
| AsrService / OnlineAsrService + VContainer | Production with VContainer DI |
| AsrService / OnlineAsrService + Zenject | Production with Zenject DI |
| AsrService / OnlineAsrService manual | Custom lifecycle management |
The simplest way to recognize speech from an AudioClip. Add AsrOrchestrator to any GameObject:
using UnityEngine;
using PonyuDev.SherpaOnnx.Asr.Offline;
public class AsrExample : MonoBehaviour
{
    [SerializeField] private AsrOrchestrator _orchestrator;
    [SerializeField] private AudioClip _clip;

    private void Start()
    {
        // Recognize immediately when the orchestrator is ready; otherwise
        // wait for its one-shot Initialized event.
        if (!_orchestrator.IsInitialized)
        {
            _orchestrator.Initialized += Recognize;
            return;
        }

        Recognize();
    }

    // async void is acceptable here because the method doubles as an event handler.
    private async void Recognize()
    {
        // Synchronous path: PCM extraction and recognition on the calling thread.
        AsrResult syncResult = _orchestrator.RecognizeFromClip(_clip);
        Debug.Log($"Recognized: {syncResult?.Text}");

        // Asynchronous path: extraction stays on the main thread,
        // recognition moves to a background thread.
        AsrResult asyncResult = await _orchestrator.RecognizeFromClipAsync(_clip);
        Debug.Log($"Async: {asyncResult?.Text}");
    }

    private void OnDestroy()
    {
        // Unsubscribing is harmless even if we never subscribed.
        _orchestrator.Initialized -= Recognize;
    }
}

AsrOrchestrator initializes asynchronously on Awake. On Android, this includes
extracting model files from APK to persistentDataPath. Use IsInitialized or the
Initialized event to wait for completion.
using UnityEngine;
using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Offline.Engine;
public class ManualAsrExample : MonoBehaviour
{
    [SerializeField] private AudioClip _clip;

    private IAsrService _asr;

    private async void Awake()
    {
        // NOTE(review): async void Awake means initialization failures are not
        // awaitable by callers; acceptable for a sample, guard in production.
        _asr = new AsrService();
        await _asr.InitializeAsync();
    }

    public async void Recognize()
    {
        if (_asr == null || !_asr.IsReady || _clip == null)
            return;

        // Pull the raw interleaved PCM out of the clip.
        // NOTE(review): multi-channel clips are passed interleaved as-is here;
        // downmix to mono first if the model expects mono — confirm.
        var pcm = new float[_clip.samples * _clip.channels];
        _clip.GetData(pcm, 0);

        AsrResult result = await _asr.RecognizeAsync(pcm, _clip.frequency);
        Debug.Log($"Result: {result?.Text}");
    }

    private void OnDestroy()
    {
        _asr?.Dispose();
    }
}

Real-time speech recognition with microphone. Add OnlineAsrOrchestrator to a GameObject:
using UnityEngine;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Asr.Online.Engine;
using PonyuDev.SherpaOnnx.Common.Audio;
using PonyuDev.SherpaOnnx.Common.Audio.Config;
public class StreamingAsrExample : MonoBehaviour
{
    [SerializeField] private OnlineAsrOrchestrator _orchestrator;

    private MicrophoneSource _mic;

    private void Start()
    {
        // Set up the microphone once the orchestrator is ready.
        if (_orchestrator.IsInitialized)
            SetupMicrophone();
        else
            _orchestrator.Initialized += SetupMicrophone;

        // Result events only fire once a microphone is connected
        // and a session is running.
        _orchestrator.PartialResultReady += OnPartial;
        _orchestrator.FinalResultReady += OnFinal;
        _orchestrator.EndpointDetected += OnEndpoint;
    }

    private async void SetupMicrophone()
    {
        var settings = await MicrophoneSettingsLoader.LoadAsync();

        _mic = new MicrophoneSource(settings);
        _mic.SilenceDetected += OnSilenceDetected;

        if (await _mic.StartRecordingAsync())
            _orchestrator.ConnectMicrophone(_mic);
    }

    public void StopRecording()
    {
        _orchestrator.DisconnectMicrophone();
        _mic?.StopRecording();
    }

    private void OnPartial(OnlineAsrResult result) => Debug.Log($"Partial: {result.Text}");

    private void OnFinal(OnlineAsrResult result) => Debug.Log($"Final: {result.Text}");

    private void OnEndpoint() => Debug.Log("Endpoint detected — stream reset.");

    private void OnSilenceDetected(string diagnosis)
    {
        // The microphone produced silence on every capture path it tried:
        // shut down and surface the diagnostics.
        _orchestrator.DisconnectMicrophone();
        _mic?.StopRecording();
        Debug.LogWarning(
            "Voice capture unavailable on this device. " +
            "Diag: " + diagnosis);
    }

    private void OnDestroy()
    {
        _orchestrator.Initialized -= SetupMicrophone;
        _orchestrator.PartialResultReady -= OnPartial;
        _orchestrator.FinalResultReady -= OnFinal;
        _orchestrator.EndpointDetected -= OnEndpoint;

        if (_mic != null)
            _mic.SilenceDetected -= OnSilenceDetected;

        _orchestrator.DisconnectMicrophone();
        _mic?.Dispose();
    }
}

OnlineAsrOrchestrator automatically:
- Subscribes to MicrophoneSource.SamplesAvailable via ConnectMicrophone()
- Feeds audio to the engine and calls ProcessAvailableFrames() each frame
- Forwards PartialResultReady, FinalResultReady, EndpointDetected events
- Resets the stream on endpoint detection when _autoResetOnEndpoint is enabled (default)
using UnityEngine;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Asr.Online.Engine;
using PonyuDev.SherpaOnnx.Common.Audio;
using PonyuDev.SherpaOnnx.Common.Audio.Config;
/// <summary>
/// Streaming recognition wired by hand: service and microphone are created and
/// connected directly, without OnlineAsrOrchestrator.
/// </summary>
public class ManualStreamingExample : MonoBehaviour
{
    private IOnlineAsrService _asr;
    private MicrophoneSource _mic;

    private async void Awake()
    {
        _asr = new OnlineAsrService();
        await _asr.InitializeAsync();

        _asr.PartialResultReady += OnPartial;
        _asr.FinalResultReady += OnFinal;
        _asr.EndpointDetected += OnEndpoint;

        var micSettings = await MicrophoneSettingsLoader.LoadAsync();
        _mic = new MicrophoneSource(micSettings);
        _mic.SilenceDetected += OnSilenceDetected;

        bool started = await _mic.StartRecordingAsync();
        if (started)
        {
            _mic.SamplesAvailable += OnMicSamples;
            _asr.StartSession();
        }
    }

    // Push-model bridge: feed new PCM into the engine, then pump its decoder
    // so result events fire.
    private void OnMicSamples(float[] samples)
    {
        if (!_asr.IsSessionActive)
            return;
        _asr.AcceptSamples(samples, _mic.SampleRate);
        _asr.ProcessAvailableFrames();
    }

    private void OnPartial(OnlineAsrResult r) => Debug.Log($"Partial: {r.Text}");
    private void OnFinal(OnlineAsrResult r) => Debug.Log($"Final: {r.Text}");

    private void OnEndpoint()
    {
        // Endpoint means "utterance finished": reset decoder state for the next one.
        _asr.ResetStream();
    }

    private void OnSilenceDetected(string diagnosis)
    {
        _mic.SamplesAvailable -= OnMicSamples;
        _mic.StopRecording();
        _asr.StopSession();
        Debug.LogWarning(
            "Voice capture unavailable. Diag: " + diagnosis);
    }

    private void OnDestroy()
    {
        // FIX: _mic is assigned only after two awaits in the async Awake, so it
        // may still be null if this object is destroyed before initialization
        // finishes. The original dereferenced _mic unconditionally here and
        // could throw NullReferenceException during teardown.
        if (_mic != null)
        {
            _mic.SamplesAvailable -= OnMicSamples;
            _mic.SilenceDetected -= OnSilenceDetected;
            _mic.Dispose();
        }

        if (_asr != null)
        {
            _asr.PartialResultReady -= OnPartial;
            _asr.FinalResultReady -= OnFinal;
            _asr.EndpointDetected -= OnEndpoint;
            _asr.Dispose();
        }
    }
}

// By name
_asr.SwitchProfile("sherpa-onnx-zipformer-small-en-2023-06-26");
// By index
_asr.SwitchProfile(0);
// Recognize with new profile
AsrResult result = _asr.Recognize(samples, sampleRate);

Multiple native recognizer instances allow concurrent recognition:

_asr.EnginePoolSize = 4; // up to 4 parallel recognitions

If your audio sample rate differs from the model's expected rate (usually 16 kHz):
using PonyuDev.SherpaOnnx.Common.Audio;
// Resample mono audio
float[] resampled = AudioResampler.Resample(samples, fromRate: 44100, toRate: 16000);
// Downmix stereo to mono + resample
float[] mono16k = AudioResampler.ResampleMono(
stereoSamples, channels: 2, fromRate: 48000, toRate: 16000);

using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Common.Audio;
using PonyuDev.SherpaOnnx.Common.Audio.Config;
using VContainer;
using VContainer.Unity;
public class AsrLifetimeScope : LifetimeScope
{
    protected override void Configure(IContainerBuilder builder)
    {
        // Microphone settings singleton (populated by AsrInitializer from the
        // StreamingAssets JSON) and the shared capture source.
        builder.Register<MicrophoneSettingsData>(Lifetime.Singleton);
        builder.Register<MicrophoneSource>(Lifetime.Singleton);

        // File-based (offline) recognition.
        builder.Register<AsrService>(Lifetime.Singleton)
            .As<IAsrService>();

        // Streaming (online) recognition.
        builder.Register<OnlineAsrService>(Lifetime.Singleton)
            .As<IOnlineAsrService>();

        // Kicks off async initialization once the container is built.
        builder.RegisterEntryPoint<AsrInitializer>();
    }
}

using System.Threading;
using Cysharp.Threading.Tasks;
using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Common.Audio.Config;
using VContainer.Unity;
public class AsrInitializer : IAsyncStartable
{
    private readonly IAsrService _offline;
    private readonly IOnlineAsrService _online;
    private readonly MicrophoneSettingsData _micSettings;

    public AsrInitializer(
        IAsrService offline,
        IOnlineAsrService online,
        MicrophoneSettingsData micSettings)
    {
        _offline = offline;
        _online = online;
        _micSettings = micSettings;
    }

    public async UniTask StartAsync(CancellationToken ct)
    {
        // Read microphone settings from the StreamingAssets JSON, then copy
        // them field-by-field so the container-managed singleton (already
        // injected elsewhere) keeps its identity.
        var loaded = await MicrophoneSettingsLoader.LoadAsync(ct);

        // Capture parameters.
        _micSettings.sampleRate = loaded.sampleRate;
        _micSettings.clipLengthSec = loaded.clipLengthSec;
        _micSettings.micStartTimeoutSec = loaded.micStartTimeoutSec;

        // Silence detection / diagnostics.
        _micSettings.silenceThreshold = loaded.silenceThreshold;
        _micSettings.silenceFrameLimit = loaded.silenceFrameLimit;
        _micSettings.diagFrameCount = loaded.diagFrameCount;

        // Bring both engines up after settings are in place.
        await _offline.InitializeAsync(ct: ct);
        await _online.InitializeAsync(ct: ct);
    }
}

using System;
using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Offline.Engine;
using VContainer;
using VContainer.Unity;
public class DictationPresenter : IStartable, IDisposable
{
    private readonly IAsrService _asr;

    [Inject]
    public DictationPresenter(IAsrService asr)
    {
        _asr = asr;
    }

    public void Start() { }

    // NOTE(review): async void means callers cannot await or observe failures;
    // fine for fire-and-forget UI commands, which is the intent here.
    public async void RecognizeClip(float[] samples, int sampleRate)
    {
        AsrResult result = await _asr.RecognizeAsync(samples, sampleRate);
        if (result?.IsValid == true)
            OnRecognized(result.Text);
    }

    private void OnRecognized(string text)
    {
        // Update UI or model
    }

    public void Dispose()
    {
        // IAsrService lifetime is managed by the container — nothing to release here.
    }
}

using System;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Asr.Online.Engine;
using PonyuDev.SherpaOnnx.Common.Audio;
using VContainer;
using VContainer.Unity;
public class LiveCaptionPresenter : IStartable, IDisposable
{
    private readonly IOnlineAsrService _asr;
    private readonly MicrophoneSource _mic;

    [Inject]
    public LiveCaptionPresenter(
        IOnlineAsrService asr,
        MicrophoneSource mic)
    {
        _asr = asr;
        _mic = mic;
    }

    public void Start()
    {
        // Result events stay wired for the presenter's whole lifetime;
        // audio only flows while a capture session is active.
        _asr.PartialResultReady += OnPartial;
        _asr.FinalResultReady += OnFinal;
        _asr.EndpointDetected += OnEndpoint;
    }

    public async void StartCapture()
    {
        if (!await _mic.StartRecordingAsync())
            return;

        _mic.SamplesAvailable += OnMicSamples;
        _asr.StartSession();
    }

    public void StopCapture()
    {
        _mic.SamplesAvailable -= OnMicSamples;
        _mic.StopRecording();
        _asr.StopSession();
    }

    // Bridge: forward fresh PCM into the engine and pump its decoder.
    private void OnMicSamples(float[] samples)
    {
        if (!_asr.IsSessionActive)
            return;

        _asr.AcceptSamples(samples, _mic.SampleRate);
        _asr.ProcessAvailableFrames();
    }

    private void OnPartial(OnlineAsrResult r)
    {
        // Update partial caption UI
    }

    private void OnFinal(OnlineAsrResult r)
    {
        // Append final text to transcript
    }

    private void OnEndpoint() => _asr.ResetStream();

    public void Dispose()
    {
        _asr.PartialResultReady -= OnPartial;
        _asr.FinalResultReady -= OnFinal;
        _asr.EndpointDetected -= OnEndpoint;
        StopCapture();
    }
}

using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Common.Audio;
using PonyuDev.SherpaOnnx.Common.Audio.Config;
using Zenject;
public class AsrInstaller : MonoInstaller
{
    public override void InstallBindings()
    {
        // Microphone settings singleton — populated from the StreamingAssets
        // JSON by AsrInitializer.
        Container.Bind<MicrophoneSettingsData>()
            .AsSingle();

        // Shared capture source.
        Container.Bind<MicrophoneSource>()
            .AsSingle();

        // File-based (offline) recognition.
        Container.Bind<IAsrService>()
            .To<AsrService>()
            .AsSingle();

        // Streaming (online) recognition.
        Container.Bind<IOnlineAsrService>()
            .To<OnlineAsrService>()
            .AsSingle();

        // Runs async initialization via the Zenject lifecycle interfaces.
        Container.BindInterfacesTo<AsrInitializer>()
            .AsSingle();
    }
}

using System;
using Cysharp.Threading.Tasks;
using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Common.Audio.Config;
using Zenject;
/// <summary>
/// Loads microphone settings and initializes both ASR services when the
/// Zenject container starts.
/// </summary>
public class AsrInitializer : IInitializable, IDisposable
{
    private readonly IAsrService _offline;
    private readonly IOnlineAsrService _online;
    private readonly MicrophoneSettingsData _micSettings;

    public AsrInitializer(
        IAsrService offline,
        IOnlineAsrService online,
        MicrophoneSettingsData micSettings)
    {
        _offline = offline;
        _online = online;
        _micSettings = micSettings;
    }

    // FIX: Zenject's IInitializable.Initialize must return void, and the
    // original implemented it as `async void`, which lets initialization
    // exceptions escape unobserved. Route the work through UniTaskVoid +
    // Forget() instead (consistent with the VContainer variant's UniTask
    // usage), so failures flow through UniTask's exception pipeline.
    public void Initialize()
    {
        InitializeInternalAsync().Forget();
    }

    private async UniTaskVoid InitializeInternalAsync()
    {
        // Load microphone settings from JSON and copy them into the bound
        // singleton so existing injections keep the same instance.
        var loaded = await MicrophoneSettingsLoader.LoadAsync();
        _micSettings.sampleRate = loaded.sampleRate;
        _micSettings.clipLengthSec = loaded.clipLengthSec;
        _micSettings.micStartTimeoutSec = loaded.micStartTimeoutSec;
        _micSettings.silenceThreshold = loaded.silenceThreshold;
        _micSettings.silenceFrameLimit = loaded.silenceFrameLimit;
        _micSettings.diagFrameCount = loaded.diagFrameCount;

        await _offline.InitializeAsync();
        await _online.InitializeAsync();
    }

    public void Dispose()
    {
        // NOTE(review): Dispose can run while initialization is still in
        // flight; the services are assumed to tolerate Dispose-during-init —
        // confirm against their implementations.
        _offline.Dispose();
        _online.Dispose();
    }
}

using UnityEngine;
using PonyuDev.SherpaOnnx.Asr.Offline;
using PonyuDev.SherpaOnnx.Asr.Offline.Engine;
using PonyuDev.SherpaOnnx.Asr.Online;
using PonyuDev.SherpaOnnx.Asr.Online.Engine;
using PonyuDev.SherpaOnnx.Common.Audio;
using Zenject;
public class SubtitleController : MonoBehaviour
{
    [Inject] private IOnlineAsrService _asr;
    [Inject] private MicrophoneSource _mic;

    private void OnEnable()
    {
        // Wire result events for as long as this component is enabled.
        _asr.PartialResultReady += OnPartial;
        _asr.FinalResultReady += OnFinal;
        _asr.EndpointDetected += OnEndpoint;
    }

    public async void StartListening()
    {
        if (!await _mic.StartRecordingAsync())
            return;

        _mic.SamplesAvailable += OnMicSamples;
        _asr.StartSession();
    }

    public void StopListening()
    {
        _mic.SamplesAvailable -= OnMicSamples;
        _mic.StopRecording();
        _asr.StopSession();
    }

    // Feed captured PCM to the engine and let it emit result events.
    private void OnMicSamples(float[] samples)
    {
        if (!_asr.IsSessionActive)
            return;

        _asr.AcceptSamples(samples, _mic.SampleRate);
        _asr.ProcessAvailableFrames();
    }

    private void OnPartial(OnlineAsrResult r) { /* update subtitle */ }
    private void OnFinal(OnlineAsrResult r) { /* finalize subtitle */ }
    private void OnEndpoint() => _asr.ResetStream();

    private void OnDisable()
    {
        _asr.PartialResultReady -= OnPartial;
        _asr.FinalResultReady -= OnFinal;
        _asr.EndpointDetected -= OnEndpoint;
        StopListening();
    }
}

On Android, StreamingAssets files live inside the APK archive and are not
accessible via System.IO.File. The package handles this automatically:
- At build time, a manifest of all SherpaOnnx files is generated
- On first launch, files are extracted from APK to persistentDataPath
- Subsequent launches skip extraction (version marker check)
You must use InitializeAsync() on Android. The synchronous Initialize()
cannot extract files from the APK.
MicrophoneSource requests microphone permission automatically on Android
(configurable via constructor parameter requestPermission). If permission is
denied, StartRecordingAsync() returns false.
On some Android devices (e.g. certain Samsung models), Unity's Microphone API
returns silence. MicrophoneSource automatically detects this and falls back to
a native AudioRecord implementation via JNI. The fallback tries multiple audio
sources: VOICE_RECOGNITION → VOICE_COMMUNICATION → MIC.
Audio processing effects (NoiseSuppressor, AGC, AcousticEchoCanceler) are disabled on the native path to get a clean signal.
Configure microphone behavior via SherpaOnnx/microphone-settings.json in
StreamingAssets:
{
"sampleRate": 16000,
"clipLengthSec": 10,
"micStartTimeoutSec": 2.0,
"silenceThreshold": 0.05,
"silenceFrameLimit": 90,
"diagFrameCount": 5
}

| Setting | Default | Description |
|---|---|---|
sampleRate |
16000 | Capture sample rate in Hz |
clipLengthSec |
10 | Circular buffer length in seconds |
micStartTimeoutSec |
2.0 | Max wait for microphone to start producing samples |
silenceThreshold |
0.05 | Amplitude below this is treated as silence |
silenceFrameLimit |
90 | Silent frames before fallback triggers (~3s at 30fps) |
diagFrameCount |
5 | Diagnostic log frames at recording start |
Monitor extraction progress on Android (useful for loading screens):
var progress = new Progress<float>(p =>
Debug.Log($"Extracting: {p:P0}"));
await _asr.InitializeAsync(progress);

| Category | Method | Description |
|---|---|---|
| Lifecycle | Initialize() |
Sync init (Desktop only) |
InitializeAsync(progress, ct) |
Async init (all platforms, required on Android) | |
LoadProfile(profile) |
Load a specific profile | |
SwitchProfile(index) |
Switch by index | |
SwitchProfile(name) |
Switch by name | |
| Properties | IsReady |
true when engine is loaded |
ActiveProfile |
Current AsrProfile |
|
Settings |
All loaded AsrSettingsData |
|
EnginePoolSize |
Get/set concurrent native instances | |
| Recognition | Recognize(samples, sampleRate) |
Sync recognition, returns AsrResult |
RecognizeAsync(samples, sampleRate) |
Background thread recognition |
| Category | Method | Description |
|---|---|---|
| Lifecycle | Initialize() |
Sync init (Desktop only) |
InitializeAsync(progress, ct) |
Async init (all platforms, required on Android) | |
LoadProfile(profile) |
Load a specific profile | |
SwitchProfile(index) |
Switch by index | |
SwitchProfile(name) |
Switch by name | |
| Properties | IsReady |
true when engine is loaded |
IsSessionActive |
true during active recognition session |
|
ActiveProfile |
Current OnlineAsrProfile |
|
Settings |
All loaded OnlineAsrSettingsData |
|
| Session | StartSession() |
Begin streaming recognition |
StopSession() |
End streaming recognition | |
| Audio | AcceptSamples(samples, sampleRate) |
Feed PCM audio samples |
ProcessAvailableFrames() |
Process buffered audio, fire result events | |
ResetStream() |
Reset decoder state (call after endpoint) | |
| Events | PartialResultReady |
Fires with intermediate recognition text |
FinalResultReady |
Fires with final text when endpoint detected | |
EndpointDetected |
Fires when a speech endpoint is detected |
| Member | Type | Description |
|---|---|---|
Text |
string |
Recognized text |
Tokens |
string[] |
Individual tokens (may be null) |
Timestamps |
float[] |
Per-token timestamps in seconds (may be null) |
Durations |
float[] |
Per-token durations in seconds (may be null) |
IsValid |
bool |
true when Text is not empty |
| Member | Type | Description |
|---|---|---|
Text |
string |
Recognized text so far |
Tokens |
string[] |
Individual tokens (may be null) |
Timestamps |
float[] |
Per-token timestamps in seconds (may be null) |
IsFinal |
bool |
true when endpoint was detected |
IsValid |
bool |
true when Text is not empty |
Constructor: MicrophoneSource(MicrophoneSettingsData settings = null, string deviceName = null, bool requestPermission = true)
Settings are loaded from SherpaOnnx/microphone-settings.json in StreamingAssets via MicrophoneSettingsLoader.LoadAsync(). Falls back to defaults when file is missing.
| Category | Member | Description |
|---|---|---|
| Properties | IsRecording |
true while capturing |
DeviceName |
Microphone device name (null = default) | |
SampleRate |
Capture sample rate (from settings, default 16000) | |
| Methods | StartRecordingAsync(ct) |
Request permission and start capture |
StopRecording() |
Stop capture | |
ReadNewSamples() |
Pull model: get new samples since last call | |
ReadAllSamples() |
Get entire circular buffer | |
Dispose() |
Stop and release resources | |
| Events | SamplesAvailable |
Push model: fires each frame with new PCM samples |
RecordingStopped |
Fires when recording stops | |
SilenceDetected |
Fires with diagnostics when sustained silence is detected |
| Member | Description |
|---|---|
Service |
IAsrService — the underlying service |
IsInitialized |
true after async initialization completes |
Initialized |
Event: fires once after initialization |
RecognizeFromClip(clip) |
Extract PCM from AudioClip and recognize (sync) |
RecognizeFromClipAsync(clip) |
Same but recognition runs on background thread |
| Member | Description |
|---|---|
Service |
IOnlineAsrService — the underlying service |
IsInitialized |
true after async initialization completes |
Initialized |
Event: fires once after initialization |
ConnectMicrophone(mic) |
Wire MicrophoneSource to the streaming pipeline |
DisconnectMicrophone() |
Disconnect and stop forwarding audio |
PartialResultReady |
Forwarded from the service |
FinalResultReady |
Forwarded from the service |
EndpointDetected |
Forwarded from the service |
| Issue | Solution |
|---|---|
AsrService is not initialized |
Call Initialize() or InitializeAsync() before Recognize() |
No active profile found |
Set an active profile in Project Settings > Sherpa-ONNX > ASR |
| SIGSEGV crash on Android | Ensure you use InitializeAsync(), not Initialize() |
StartRecordingAsync returns false |
Microphone permission denied or no devices found |
| No partial results appear | Ensure StartSession() is called and SamplesAvailable is wired |
| Endpoint never fires | Check endpoint detection settings in the online profile |
Recognize() returns null |
Check logs for engine errors; verify model files exist |
| Audio sounds too slow/fast | Use AudioResampler to match the model's expected sample rate |
| Android mic silence (SilenceDetected fires) | Device HAL issue; native fallback activates automatically. Check logcat for diagnostics |
| Fallback triggers during speech pauses | Increase silenceFrameLimit in microphone-settings.json |