Skip to content

Commit 8137ac9

Browse files
authored
Add Pascal API for Dolphin CTC models (#2096)
1 parent 07a5701 commit 8137ac9

File tree

11 files changed

+343
-7
lines changed

11 files changed

+343
-7
lines changed

.github/workflows/pascal.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ jobs:
149149
cd ./pascal-api-examples
150150
151151
pushd non-streaming-asr
152+
153+
./run-dolphin-ctc.sh
154+
rm -rf sherpa-onnx-*
155+
echo "---"
156+
152157
./run-zipformer-transducer.sh
153158
rm -rf sherpa-onnx-*
154159
echo "---"
@@ -253,7 +258,13 @@ jobs:
253258
254259
cd ./pascal-api-examples
255260
261+
256262
pushd vad-with-non-streaming-asr
263+
264+
time ./run-vad-with-dolphin-ctc.sh
265+
rm -rf sherpa-onnx-*
266+
echo "---"
267+
257268
time ./run-vad-with-moonshine.sh
258269
rm -rf sherpa-onnx-*
259270
echo "---"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ This repository supports running the following functions **locally**
6060

6161
on the following platforms and operating systems:
6262

63-
- x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64)
63+
- x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64), **RK NPU**
6464
- Linux, macOS, Windows, openKylin
6565
- Android, WearOS
6666
- iOS

pascal-api-examples/non-streaming-asr/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ APIs with non-streaming models for speech recognition.
55

66
|File|Description|
77
|----|-----------|
8+
|[run-dolphin-ctc.sh](./run-dolphin-ctc.sh)|Use a non-streaming [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model for speech recognition|
89
|[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition|
910
|[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition|
1011
|[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers|
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
{ Copyright (c) 2025 Xiaomi Corporation }

{
This file shows how to use a non-streaming Dolphin CTC model
to decode files.

You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program dolphin_ctc;

{$mode objfpc}

uses
  sherpa_onnx,
  DateUtils,
  SysUtils;

const
  {Directory created by extracting the model archive next to this program.}
  ModelDir = './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02';

var
  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;

  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  Start: TDateTime;
  Stop: TDateTime;

  Elapsed: Single;
  Duration: Single;
  RealTimeFactor: Single;
begin
  Initialize(Config);

  Config.ModelConfig.Dolphin.Model := ModelDir + '/model.int8.onnx';
  Config.ModelConfig.Tokens := ModelDir + '/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  WaveFilename := ModelDir + '/test_wavs/0.wav';

  Wave := SherpaOnnxReadWave(WaveFilename);

  {Without this guard an unreadable or empty file makes Duration zero and
   the real-time-factor computation below divides by zero.}
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
  begin
    WriteLn(Format('Failed to read any samples from %s', [WaveFilename]));
    Halt(1);
  end;

  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  try
    Stream := Recognizer.CreateStream();
    try
      {Only feeding the samples, decoding and fetching the result are timed.}
      Start := Now;

      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);

      RecognitionResult := Recognizer.GetResult(Stream);

      Stop := Now;

      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;

      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      {Free resources to avoid memory leak.

       Note: You don't need to invoke them for this simple script.
       However, you have to invoke them in your own large/complex project.
       The try/finally blocks make sure they run even if decoding raises.
      }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
end.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared C API library if it is not installed yet,
# downloads the Dolphin CTC model, compiles dolphin_ctc.pas with fpc and
# runs the resulting program.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model paths inside dolphin_ctc.pas and the fpc input below are relative
# to this directory, so run from here regardless of the caller's cwd.
cd -- "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"
MODEL=sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02

# Build and install the C API only when no shared library (macOS, Linux or
# Windows naming) is present in the install directory.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  ls -lh lib
  popd
fi

if [ ! -f "./$MODEL/model.int8.onnx" ]; then
  # -f makes curl fail on an HTTP error instead of saving the error page
  # as the archive.
  curl -fSL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$MODEL.tar.bz2"
  tar xvf "$MODEL.tar.bz2"
  rm "$MODEL.tar.bz2"
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./dolphin_ctc.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./dolphin_ctc

pascal-api-examples/vad-with-non-streaming-asr/README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ with non-streaming speech recognition models.
66

77
|File|Description|
88
|---------|------------|
9-
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
10-
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
9+
|[run-vad-with-dolphin-ctc.sh](./run-vad-with-dolphin-ctc.sh)|It shows how to use the VAD + [Dolphin](https://github.com/DataoceanAI/Dolphin) for speech recognition.|
10+
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + [Whisper](https://github.com/openai/whisper) for speech recognition.|
11+
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) for speech recognition.|
12+
|[run-vad-with-moonshine.sh](./run-vad-with-moonshine.sh)|It shows how to use the VAD + [Moonshine](https://github.com/usefulsensors/moonshine) for speech recognition.|
13+
1114

1215
Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared C API library if it is not installed yet,
# downloads the silero VAD model, a test wave file and the Dolphin CTC model,
# compiles vad_with_dolphin.pas with fpc and runs the resulting program.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model/wave paths inside vad_with_dolphin.pas and the fpc input below are
# relative to this directory, so run from here regardless of the caller's cwd.
cd -- "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"
ASSET_URL=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models
MODEL=sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02

# Build and install the C API only when no shared library (macOS, Linux or
# Windows naming) is present in the install directory.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# -f makes curl fail on an HTTP error instead of saving the error page
# under the asset's file name.
if [[ ! -f ./silero_vad.onnx ]]; then
  curl -fSL -O "$ASSET_URL/silero_vad.onnx"
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -fSL -O "$ASSET_URL/lei-jun-test.wav"
fi

if [ ! -f "./$MODEL/model.int8.onnx" ]; then
  curl -fSL -O "$ASSET_URL/$MODEL.tar.bz2"
  tar xvf "$MODEL.tar.bz2"
  rm "$MODEL.tar.bz2"
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_dolphin.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_dolphin
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
{ Copyright (c) 2025 Xiaomi Corporation }

{
This file shows how to use a non-streaming Dolphin model
with silero VAD to decode files.

You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program vad_with_dolphin;

{$mode objfpc}

uses
  sherpa_onnx,
  SysUtils;

{Creates a silero VAD configured for 16 kHz input and 512-sample windows.
 The caller owns the returned object and must free it.}
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
var
  Config: TSherpaOnnxVadModelConfig;

  SampleRate: Integer;
  WindowSize: Integer;
begin
  Initialize(Config);

  SampleRate := 16000; {Please don't change it unless you know the details}
  WindowSize := 512; {Please don't change it unless you know the details}

  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.5;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads:= 1;
  Config.Debug:= True;
  Config.Provider:= 'cpu';
  Config.SampleRate := SampleRate;

  Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
end;

{Creates an offline recognizer that uses the Dolphin CTC model on the CPU.
 The caller owns the returned object and must free it.}
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  Config: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(Config);

  Config.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';
  Config.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  Result := TSherpaOnnxOfflineRecognizer.Create(Config);
end;

{Pops every speech segment currently queued in Vad, decodes it with
 Recognizer and prints "start -- end text" with times in seconds.
 SampleRate is the sample rate of the audio that was fed to Vad.}
procedure DecodePendingSegments(Vad: TSherpaOnnxVoiceActivityDetector;
  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    {One stream per segment; it is released even if decoding raises.}
    Stream := Recognizer.CreateStream();
    try
      Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
      Recognizer.Decode(Stream);
      RecognitionResult := Recognizer.GetResult(Stream);

      Start := SpeechSegment.Start / SampleRate;
      Duration := Length(SpeechSegment.Samples) / SampleRate;
      WriteLn(Format('%.3f -- %.3f %s',
        [Start, Start + Duration, RecognitionResult.Text]));
    finally
      FreeAndNil(Stream);
    end;
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();
  try
    Wave := SherpaOnnxReadWave('./lei-jun-test.wav');

    if Wave.SampleRate <> Vad.Config.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]));

      {Report the failure to the caller; the finally block below still
       releases Vad and Recognizer.}
      ExitCode := 1;
    end
    else
    begin
      WindowSize := Vad.Config.SileroVad.WindowSize;
      Offset := 0;

      {Feed the VAD one full window at a time and decode segments as soon
       as they become available.}
      while Offset + WindowSize <= Length(Wave.Samples) do
      begin
        Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
        Offset += WindowSize;

        DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
      end;

      {Flush, then decode whatever segments are queued after it.}
      Vad.Flush;
      DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
    end;
  finally
    FreeAndNil(Recognizer);
    FreeAndNil(Vad);
  end;
end.

pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
99
}
1010

11-
program vad_with_whisper;
11+
program vad_with_sense_voice;
1212

1313
{$mode objfpc}
1414

sherpa-onnx/c-api/c-api.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1969,7 +1969,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
19691969
return p->impl->GetOutputSamplingRate();
19701970
}
19711971

1972-
void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
1972+
void SherpaOnnxLinearResamplerReset(const SherpaOnnxLinearResampler *p) {
19731973
p->impl->Reset();
19741974
}
19751975

0 commit comments

Comments
 (0)