Skip to content

Commit c82b134

Browse files
author
litongjava
committed
add /paddlespeech/asr/streaming
1 parent afd06fa commit c82b134

12 files changed

+426
-243
lines changed

CMakeLists.txt

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
cmake_minimum_required(VERSION 3.23)
22
project(whisper_cpp_server)
33

4-
set(CMAKE_CXX_STANDARD 17)
4+
set(CMAKE_CXX_STANDARD 20)
55
# 查找 uWebSockets 的头文件路径
66
find_path(UWEBSOCKETS_INCLUDE_DIRS "uwebsockets/App.h")
77
# 查找 zlib 库
@@ -44,18 +44,16 @@ target_link_libraries(sdl_version ${SDL2_LIBRARIES})
4444
add_executable(simplest simplest.cpp common.cpp)
4545
target_link_libraries(simplest whisper)
4646

47-
add_executable(stream_local common.cpp common-sdl.cpp stream_local.cpp
47+
add_executable(stream_local stream_local.cpp common.cpp common-sdl.cpp
4848
stream_components_service.cpp stream_components_audio.cpp
4949
stream_components_output.cpp
50-
whisper_server_base_on_uwebsockets.cpp
5150
)
5251
target_link_libraries(stream_local whisper ${SDL2_LIBRARIES})
5352

54-
add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp common.cpp httplib.h json.hpp inference_handler.cpp whisper_params.cpp)
53+
add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp common.cpp httplib.h nlohmann/json.hpp inference_handler.cpp whisper_params.cpp)
5554
target_link_libraries(whisper_http_server_base_httplib whisper)
5655

57-
# 链接 uWebSockets、zlib、libuv 和 uSockets 库
58-
add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp)
56+
add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp stream_components_service.cpp)
5957
#添加头文件
6058
target_include_directories(whisper_server_base_on_uwebsockets PRIVATE ${UWEBSOCKETS_INCLUDE_DIRS})
6159
# 链接 uWebSockets、zlib、libuv 和 uSockets 库

inference_handler.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#include "inference_handler.h"
33
#include "common.h"
44
#include "whisper_params.h"
5-
#include "json.hpp"
5+
#include "nlohmann/json.hpp"
66

77
using json = nlohmann::json;
88

File renamed without changes.

simplest.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ void replace_all(std::string &s, const std::string &search, const std::string &r
5252
}
5353

5454
// command-line parameters
55-
struct whisper_local_stream_params {
55+
struct whisper_local_params {
5656
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
5757
int32_t n_processors = 1;
5858
int32_t offset_t_ms = 0;
@@ -97,9 +97,9 @@ struct whisper_local_stream_params {
9797
std::vector<std::string> fname_out = {};
9898
};
9999

100-
void whisper_print_usage(int argc, char **argv, const whisper_local_stream_params &params);
100+
void whisper_print_usage(int argc, char **argv, const whisper_local_params &params);
101101

102-
bool whisper_params_parse(int argc, char **argv, whisper_local_stream_params &params) {
102+
bool whisper_params_parse(int argc, char **argv, whisper_local_params &params) {
103103
for (int i = 1; i < argc; i++) {
104104
std::string arg = argv[i];
105105

@@ -128,7 +128,7 @@ bool whisper_params_parse(int argc, char **argv, whisper_local_stream_params &pa
128128
return true;
129129
}
130130

131-
void whisper_print_usage(int /*argc*/, char **argv, const whisper_local_stream_params &params) {
131+
void whisper_print_usage(int /*argc*/, char **argv, const whisper_local_params &params) {
132132
fprintf(stderr, "\n");
133133
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
134134
fprintf(stderr, "\n");
@@ -141,7 +141,7 @@ void whisper_print_usage(int /*argc*/, char **argv, const whisper_local_stream_p
141141
}
142142

143143
struct whisper_print_user_data {
144-
const whisper_local_stream_params *params;
144+
const whisper_local_params *params;
145145

146146
const std::vector<std::vector<float>> *pcmf32s;
147147
int progress_prev;
@@ -298,7 +298,7 @@ char *escape_double_quotes_and_backslashes(const char *str) {
298298

299299
int main(int argc, char **argv) {
300300
printf("start\n");
301-
whisper_local_stream_params params;
301+
whisper_local_params params;
302302

303303
if (whisper_params_parse(argc, argv, params) == false) {
304304
whisper_print_usage(argc, argv, params);

stream_components.h

Lines changed: 77 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -1,145 +1,86 @@
1-
#include <iostream>
2-
#include "stream_components_audio.h"
3-
#include "stream_components_params.h"
4-
#include "stream_components_output.h"
5-
#include "stream_components_service.h"
1+
#pragma once
62

7-
using namespace stream_components;
3+
namespace stream_components {
4+
struct whisper_local_stream_params {
5+
audio_params audio;
6+
service_params service;
87

9-
struct whisper_params {
10-
audio_params audio;
11-
service_params server;
12-
13-
void initialize() {
14-
audio.initialize();
15-
server.initialize();
16-
}
17-
};
18-
19-
20-
void whisper_print_usage(int argc, char **argv, const whisper_params &params);
21-
22-
bool whisper_params_parse(int argc, char **argv, whisper_params &params) {
23-
for (int i = 1; i < argc; i++) {
24-
std::string arg = argv[i];
25-
26-
if (arg == "-h" || arg == "--help") {
27-
whisper_print_usage(argc, argv, params);
28-
exit(0);
29-
} else if (arg == "-t" || arg == "--threads") { params.server.n_threads = std::stoi(argv[++i]); }
30-
else if (arg == "--step") { params.audio.step_ms = std::stoi(argv[++i]); }
31-
else if (arg == "--length") { params.audio.length_ms = std::stoi(argv[++i]); }
32-
else if (arg == "--keep") { params.audio.keep_ms = std::stoi(argv[++i]); }
33-
else if (arg == "-c" || arg == "--capture") { params.audio.capture_id = std::stoi(argv[++i]); }
34-
//else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
35-
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio.audio_ctx = std::stoi(argv[++i]); }
36-
else if (arg == "-vth" || arg == "--vad-thold") { params.audio.vad_thold = std::stof(argv[++i]); }
37-
else if (arg == "-fth" || arg == "--freq-thold") { params.audio.freq_thold = std::stof(argv[++i]); }
38-
else if (arg == "-su" || arg == "--speed-up") { params.server.speed_up = true; }
39-
else if (arg == "-tr" || arg == "--translate") { params.server.translate = true; }
40-
else if (arg == "-nf" || arg == "--no-fallback") { params.server.no_fallback = true; }
41-
//else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
42-
else if (arg == "-kc" || arg == "--keep-context") { params.server.no_context = false; }
43-
else if (arg == "-l" || arg == "--language") { params.server.language = argv[++i]; }
44-
else if (arg == "-m" || arg == "--model") { params.server.model = argv[++i]; }
45-
//else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
46-
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.server.tinydiarize = true; }
47-
//else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
48-
49-
else {
50-
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
51-
whisper_print_usage(argc, argv, params);
52-
exit(0);
8+
void initialize() {
9+
audio.initialize();
10+
service.initialize();
5311
}
12+
};
13+
14+
void whisper_print_usage(int /*argc*/, char **argv, const whisper_local_stream_params &params) {
15+
fprintf(stderr, "\n");
16+
fprintf(stderr, "usage: %s [options]\n", argv[0]);
17+
fprintf(stderr, "\n");
18+
fprintf(stderr, "options:\n");
19+
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
20+
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n",
21+
params.service.n_threads);
22+
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.audio.step_ms);
23+
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.audio.length_ms);
24+
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n",
25+
params.audio.keep_ms);
26+
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.audio.capture_id);
27+
//fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
28+
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio.audio_ctx);
29+
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n",
30+
params.audio.vad_thold);
31+
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.audio.freq_thold);
32+
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n",
33+
params.service.speed_up ? "true" : "false");
34+
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n",
35+
params.service.translate ? "true" : "false");
36+
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n",
37+
params.service.no_fallback ? "true" : "false");
38+
//fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
39+
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n",
40+
params.service.no_context ? "false" : "true");
41+
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.service.language.c_str());
42+
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.service.model.c_str());
43+
//fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
44+
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n",
45+
params.service.tinydiarize ? "true" : "false");
46+
//fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
47+
fprintf(stderr, "\n");
5448
}
5549

56-
return true;
57-
}
58-
59-
void whisper_print_usage(int /*argc*/, char **argv, const whisper_params &params) {
60-
fprintf(stderr, "\n");
61-
fprintf(stderr, "usage: %s [options]\n", argv[0]);
62-
fprintf(stderr, "\n");
63-
fprintf(stderr, "options:\n");
64-
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
65-
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n",
66-
params.server.n_threads);
67-
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.audio.step_ms);
68-
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.audio.length_ms);
69-
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.audio.keep_ms);
70-
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.audio.capture_id);
71-
//fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
72-
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio.audio_ctx);
73-
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.audio.vad_thold);
74-
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.audio.freq_thold);
75-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n",
76-
params.server.speed_up ? "true" : "false");
77-
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n",
78-
params.server.translate ? "true" : "false");
79-
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n",
80-
params.server.no_fallback ? "true" : "false");
81-
//fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
82-
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n",
83-
params.server.no_context ? "false" : "true");
84-
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.server.language.c_str());
85-
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.server.model.c_str());
86-
//fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
87-
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n",
88-
params.server.tinydiarize ? "true" : "false");
89-
//fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
90-
fprintf(stderr, "\n");
91-
}
92-
93-
int main(int argc, char **argv) {
94-
95-
// Read parameters...
96-
whisper_params params;
97-
98-
if (whisper_params_parse(argc, argv, params) == false) {
99-
return 1;
100-
}
101-
102-
// Compute derived parameters
103-
params.initialize();
104-
105-
// Check parameters
106-
if (params.server.language != "auto" && whisper_lang_id(params.server.language.c_str()) == -1) {
107-
fprintf(stderr, "error: unknown language '%s'\n", params.server.language.c_str());
108-
whisper_print_usage(argc, argv, params);
109-
exit(0);
110-
}
111-
112-
// Instantiate the audio input
113-
stream_components::LocalSDLMicrophone audio(params.audio);
114-
115-
// Instantiate the server
116-
stream_components::WhisperServer server(params.server, params.audio);
117-
118-
// Print the 'header'...
119-
WhisperOutput::server_to_json(std::cout, params.server, server.ctx);
120-
121-
// Run until Ctrl + C
122-
bool is_running = true;
123-
while (is_running) {
124-
125-
// handle Ctrl + C
126-
is_running = sdl_poll_events();
127-
if (!is_running) {
128-
break;
50+
bool whisper_params_parse(int argc, char **argv, whisper_local_stream_params &params) {
51+
for (int i = 1; i < argc; i++) {
52+
std::string arg = argv[i];
53+
54+
if (arg == "-h" || arg == "--help") {
55+
whisper_print_usage(argc, argv, params);
56+
exit(0);
57+
} else if (arg == "-t" || arg == "--threads") { params.service.n_threads = std::stoi(argv[++i]); }
58+
else if (arg == "--step") { params.audio.step_ms = std::stoi(argv[++i]); }
59+
else if (arg == "--length") { params.audio.length_ms = std::stoi(argv[++i]); }
60+
else if (arg == "--keep") { params.audio.keep_ms = std::stoi(argv[++i]); }
61+
else if (arg == "-c" || arg == "--capture") { params.audio.capture_id = std::stoi(argv[++i]); }
62+
//else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
63+
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio.audio_ctx = std::stoi(argv[++i]); }
64+
else if (arg == "-vth" || arg == "--vad-thold") { params.audio.vad_thold = std::stof(argv[++i]); }
65+
else if (arg == "-fth" || arg == "--freq-thold") { params.audio.freq_thold = std::stof(argv[++i]); }
66+
else if (arg == "-su" || arg == "--speed-up") { params.service.speed_up = true; }
67+
else if (arg == "-tr" || arg == "--translate") { params.service.translate = true; }
68+
else if (arg == "-nf" || arg == "--no-fallback") { params.service.no_fallback = true; }
69+
//else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
70+
else if (arg == "-kc" || arg == "--keep-context") { params.service.no_context = false; }
71+
else if (arg == "-l" || arg == "--language") { params.service.language = argv[++i]; }
72+
else if (arg == "-m" || arg == "--model") { params.service.model = argv[++i]; }
73+
//else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
74+
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.service.tinydiarize = true; }
75+
//else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
76+
77+
else {
78+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
79+
whisper_print_usage(argc, argv, params);
80+
exit(0);
81+
}
12982
}
13083

131-
// get next audio section
132-
auto pcmf32 = audio.get_next();
133-
134-
// get the whisper output
135-
auto result = server.process(pcmf32.data(), pcmf32.size());
136-
137-
// write the output as json to stdout (for this example)
138-
if (result) {
139-
result->transcription_to_json(std::cout);
140-
}
84+
return true;
14185
}
142-
143-
std::cout << "EXITED MAIN LOOP" << std::endl;
144-
return 0;
14586
}

stream_components_params.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include <string>
55
#include <thread>
6-
6+
#include "whisper.h"
77
namespace stream_components {
88

99
struct audio_params {
@@ -53,7 +53,7 @@ namespace stream_components {
5353

5454
void initialize() {}
5555
};
56-
5756
} // namespace stream_components
5857

58+
5959
#endif // WHISPER_STREAM_COMPONENTS_PARAMS_H

0 commit comments

Comments
 (0)