Skip to content

Commit 46f2e32

Browse files
authored
Add Pascal API for Kokoro TTS models (#1724)
1 parent 4335e2a commit 46f2e32

File tree

10 files changed

+444
-7
lines changed

10 files changed

+444
-7
lines changed

.github/workflows/pascal.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,12 @@ jobs:
154154
ls -lh
155155
echo "---"
156156
157+
./run-kokoro-en.sh
158+
rm -rf kokoro-en-*
159+
rm kokoro-en
160+
ls -lh
161+
echo "---"
162+
157163
./run-matcha-zh.sh
158164
rm -rf matcha-icefall-*
159165
rm matcha-zh

pascal-api-examples/tts/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@ matcha-zh
66
matcha-en
77
matcha-zh-playback
88
matcha-en-playback
9+
kokoro-en
10+
kokoro-en-playback
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
{ Copyright (c) 2025 Xiaomi Corporation }
2+
program kokoro_en_playback;
3+
{
4+
This file shows how to use the text to speech API of sherpa-onnx
5+
with Kokoro models.
6+
7+
It generates speech from text and saves it to a wave file.
8+
9+
Note that it plays the audio back while it is still being generated.
10+
}
11+
12+
{$mode objfpc}
13+
14+
uses
15+
{$ifdef unix}
16+
cthreads,
17+
{$endif}
18+
SysUtils,
19+
dos,
20+
ctypes,
21+
portaudio,
22+
sherpa_onnx;
23+
24+
var
25+
CriticalSection: TRTLCriticalSection;
26+
27+
Tts: TSherpaOnnxOfflineTts;
28+
Audio: TSherpaOnnxGeneratedAudio;
29+
Resampler: TSherpaOnnxLinearResampler;
30+
31+
Text: AnsiString;
32+
Speed: Single = 1.0; {Use a larger value to speak faster}
33+
SpeakerId: Integer = 7;
34+
Buffer: TSherpaOnnxCircularBuffer;
35+
FinishedGeneration: Boolean = False;
36+
FinishedPlaying: Boolean = False;
37+
38+
Version: String;
39+
EnvStr: String;
40+
Status: Integer;
41+
NumDevices: Integer;
42+
DeviceIndex: Integer;
43+
DeviceInfo: PPaDeviceInfo;
44+
45+
{ If you get EDivByZero: Division by zero error, please change the sample rate
46+
to the one supported by your output device (speaker).
47+
}
48+
DeviceSampleRate: Integer = 48000;
49+
I: Integer;
50+
Param: TPaStreamParameters;
51+
Stream: PPaStream;
52+
Wave: TSherpaOnnxWave;
53+
54+
{ Callback handed to Tts.Generate; receives each newly generated chunk.

  Samples/N describe the chunk (N float samples starting at Samples).
  Arg is the user pointer given to Generate and is not used here.

  The chunk is appended to the global Buffer while CriticalSection is held,
  because PlayCallback takes the same lock before it touches Buffer. When a
  Resampler has been created, the chunk is passed through it first.

  Always returns 1. }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;

  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
71+
72+
{ PortAudio stream callback: writes frameCount float samples to output.

  Samples are taken from the global Buffer under CriticalSection. If Buffer
  holds fewer than frameCount samples, everything available is taken and the
  array is then grown to frameCount with SetLength (the padding is whatever
  SetLength puts in new elements - presumably zeros, i.e. silence).

  Returns paContinue while Buffer still has data or generation is still
  running; otherwise sets FinishedPlaying and returns paComplete.

  input, timeInfo, statusFlags and userData are not used. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer ): cint; cdecl;
var
  Chunk: TSherpaOnnxSamplesArray;
  Dst: pcfloat;
  HaveFullFrame: Boolean;
  Taken: Integer;
  K: Integer;
begin
  EnterCriticalSection(CriticalSection);
  try
    { Decide how many samples can be taken out of the buffer right now. }
    HaveFullFrame := Buffer.Size >= frameCount;
    if HaveFullFrame then
      Taken := frameCount
    else
      Taken := Buffer.Size;

    if HaveFullFrame or (Taken > 0) then
    begin
      Chunk := Buffer.Get(Buffer.Head, Taken);
      Buffer.Pop(Taken);
    end;

    { Not enough data: extend the chunk so that frameCount entries exist. }
    if not HaveFullFrame then
      SetLength(Chunk, frameCount);

    Dst := pcfloat(output);
    for K := 0 to frameCount - 1 do
      Dst[K] := Chunk[K];

    if (Buffer.Size <= 0) and FinishedGeneration then
    begin
      { Nothing left to play and nothing more will arrive. }
      Result := paComplete;
      FinishedPlaying := True;
    end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
112+
113+
{ Builds the TTS engine used by this example.

  Fills a TSherpaOnnxOfflineTtsConfig with the Kokoro model files located in
  ./kokoro-en-v0_19, 2 threads, debug output off and MaxNumSentences = 1,
  then returns a new TSherpaOnnxOfflineTts created from it.
  The caller owns the returned object (the main program frees it). }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.NumThreads := 2;
  TtsConfig.Model.Debug := False;

  { Kokoro model files }
  TtsConfig.Model.Kokoro.Model := './kokoro-en-v0_19/model.onnx';
  TtsConfig.Model.Kokoro.Voices := './kokoro-en-v0_19/voices.bin';
  TtsConfig.Model.Kokoro.Tokens := './kokoro-en-v0_19/tokens.txt';
  TtsConfig.Model.Kokoro.DataDir := './kokoro-en-v0_19/espeak-ng-data';

  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
end;
127+
128+
{ Frees the objects created by the main program.
  Safe to call when some (or all) of them are still nil.
  Must only be called when no audio stream is running, because
  PlayCallback reads Buffer. }
procedure ReleaseObjects;
begin
  FreeAndNil(Buffer);
  FreeAndNil(Resampler);
  FreeAndNil(Tts);
end;

{ Main program.

  1. Create the TTS engine (and a resampler if the model sample rate differs
     from DeviceSampleRate).
  2. Initialize PortAudio and select the output device: the default one, or
     the index given in the environment variable SHERPA_ONNX_MIC_DEVICE.
  3. Open and start an output stream driven by PlayCallback.
  4. Generate speech; GenerateCallback feeds Buffer while audio is playing.
  5. Save the complete audio to a wave file, wait until playback finishes,
     then release everything. }
begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    ReleaseObjects;
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;

  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    ReleaseObjects;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    { Only accept a value that parses as an integer AND names an existing
      device. Otherwise Pa_GetDeviceInfo below would return nil and the
      program would crash when dereferencing it. }
    I := StrToIntDef(EnvStr, -1);
    if (I >= 0) and (I < NumDevices) then
    begin
      DeviceIndex := I;
      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
    end
    else
      WriteLn('Ignore invalid value of environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { List all devices; the selected one is marked with '*'. }
  for I := 0 to (NumDevices - 1) do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn('  Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn('  Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  { Room for 30 seconds of audio at the device sample rate. }
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);

  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    Pa_Terminate;
    ReleaseObjects;
    Exit;
  end;

  { Must be ready before the stream starts, since PlayCallback uses it. }
  InitCriticalSection(CriticalSection);

  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    { The stream was opened and the lock was created: release both. }
    Pa_CloseStream(Stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    ReleaseObjects;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';

  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;
  SherpaOnnxWriteWave('./kokoro-en-playback-7.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./kokoro-en-playback-7.wav');

  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
  {TODO(fangjun): Use an event to indicate the play is finished}

  { Close the stream BEFORE destroying the critical section and the buffer.
    FinishedPlaying is set inside PlayCallback while it still holds the lock,
    so the callback thread may not have left the critical section yet when
    the loop above ends. }
  Status := Pa_CloseStream(Stream);
  if Status <> paNoError then
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));

  DoneCriticalSection(CriticalSection);
  ReleaseObjects;

  { Always terminate PortAudio, even if closing the stream failed. }
  Status := Pa_Terminate;
  if Status <> paNoError then
  begin
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;
end.
239+
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{ Copyright (c) 2025 Xiaomi Corporation }
2+
program kokoro_en;
3+
{
4+
This file shows how to use the text to speech API of sherpa-onnx
5+
with Kokoro TTS models.
6+
7+
It generates speech from text and saves it to a wave file.
8+
9+
If you want to play it while it is generating, please see
10+
./kokoro-en-playback.pas
11+
}
12+
13+
{$mode objfpc}
14+
15+
uses
16+
SysUtils,
17+
sherpa_onnx;
18+
19+
{ Creates the Kokoro TTS engine for this example.

  The configuration points at the model files in ./kokoro-en-v0_19, uses
  2 threads, disables debug output and sets MaxNumSentences to 1.
  The returned object is owned by the caller (the main program frees it). }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.NumThreads := 2;
  TtsConfig.Model.Debug := False;

  { Kokoro model files }
  TtsConfig.Model.Kokoro.Model := './kokoro-en-v0_19/model.onnx';
  TtsConfig.Model.Kokoro.Voices := './kokoro-en-v0_19/voices.bin';
  TtsConfig.Model.Kokoro.Tokens := './kokoro-en-v0_19/tokens.txt';
  TtsConfig.Model.Kokoro.DataDir := './kokoro-en-v0_19/espeak-ng-data';

  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
end;
33+
34+
var
35+
Tts: TSherpaOnnxOfflineTts;
36+
Audio: TSherpaOnnxGeneratedAudio;
37+
38+
Text: AnsiString;
39+
Speed: Single = 1.0; {Use a larger value to speak faster}
40+
SpeakerId: Integer = 8;
41+
42+
var
  NumSpeakers: Integer;

{ Main program: create the engine, report how many speakers the model has,
  synthesize one fixed English text with speaker SpeakerId at speed Speed,
  write the result to ./kokoro-en-8.wav and free the engine. }
begin
  Tts := GetOfflineTts;

  NumSpeakers := Tts.GetNumSpeakers;
  WriteLn('There are ', NumSpeakers, ' speakers');

  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
  Audio := Tts.Generate(Text, SpeakerId, Speed);

  SherpaOnnxWriteWave('./kokoro-en-8.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./kokoro-en-8.wav');

  FreeAndNil(Tts);
end.
55+

pascal-api-examples/tts/matcha-en-playback.pas

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
program matcha_en_playback;
33
{
44
This file shows how to use the text to speech API of sherpa-onnx
5-
with Piper models.
5+
with MatchaTTS models.
66
77
It generates speech from text and saves it to a wave file.
88
@@ -210,8 +210,8 @@ function GetOfflineTts: TSherpaOnnxOfflineTts;
210210
Audio := Tts.Generate(Text, SpeakerId, Speed,
211211
PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
212212
FinishedGeneration := True;
213-
SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
214-
WriteLn('Saved to ./matcha-zh-playback.wav');
213+
SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate);
214+
WriteLn('Saved to ./matcha-en-playback.wav');
215215

216216
while not FinishedPlaying do
217217
Pa_Sleep(100); {sleep for 0.1 second }

pascal-api-examples/tts/matcha-en.pas

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
It generates speech from text and saves it to a wave file.
88
99
If you want to play it while it is generating, please see
10-
./matcha-zh-playback.pas
10+
./matcha-en-playback.pas
1111
}
1212

1313
{$mode objfpc}

pascal-api-examples/tts/matcha-zh-playback.pas

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
program matcha_zh_playback;
33
{
44
This file shows how to use the text to speech API of sherpa-onnx
5-
with Piper models.
5+
with MatchaTTS models.
66
77
It generates speech from text and saves it to a wave file.
88

0 commit comments

Comments
 (0)