@@ -88,14 +88,29 @@ class WhisperServer {
88
88
}
89
89
}
90
90
else if (msg->type == ix::WebSocketMessageType::Message) {
91
- std::lock_guard<std::mutex> lock (clients_mtx);
91
+ // std::lock_guard<std::mutex> lock(clients_mtx);
92
+
92
93
if (auto it = clients.find (client_id); it != clients.end ()) {
93
94
auto & session = *it->second ;
94
95
std::lock_guard<std::mutex> session_lock (session.mtx );
95
96
97
+ if (!msg->binary ) {
98
+ webSocket.sendText (" Error: Expected binary data" );
99
+ fprintf (stderr, " Client %s sent text data\n " , client_id.c_str ());
100
+ return ;
101
+ }
102
+
103
+ const auto &data = msg->str ;
104
+ size_t data_size = data.size ();
105
+
106
+ if (data_size % sizeof (int16_t ) != 0 ) {
107
+ webSocket.sendText (" Error: Invalid data size" );
108
+ fprintf (stderr, " Invalid data size from %s: %zu\n " , client_id.c_str (), data_size);
109
+ return ;
110
+ }
96
111
// PCM16 -> FLOAT32
97
- const int16_t * pcm16 = reinterpret_cast <const int16_t *>(msg-> str .data ());
98
- const size_t num_samples = msg-> str . size () / sizeof (int16_t );
112
+ const int16_t * pcm16 = reinterpret_cast <const int16_t *>(data .data ());
113
+ const size_t num_samples = data_size / sizeof (int16_t );
99
114
100
115
session.pcm_buffer .reserve (session.pcm_buffer .size () + num_samples);
101
116
for (size_t i = 0 ; i < num_samples; ++i) {
@@ -108,12 +123,15 @@ class WhisperServer {
108
123
}
109
124
110
125
void processClientAudio (std::string client_id) {
111
- constexpr int step_ms = 3000 ;
126
+ constexpr int step_ms = 300 ;
112
127
constexpr int n_samples_step = (1e-3 * step_ms) * WHISPER_SAMPLE_RATE;
113
128
129
+ fprintf (stderr, " Started thread for: %s\n " , client_id);
114
130
while (true ) {
115
131
std::vector<float > audio_chunk;
116
132
{
133
+
134
+ // fprintf(stderr, "Started read chunk from: %s\n", client_id);
117
135
std::unique_lock<std::mutex> lock (clients_mtx);
118
136
if (!clients.count (client_id)) break ;
119
137
auto & session = *clients[client_id];
@@ -129,14 +147,36 @@ class WhisperServer {
129
147
audio_chunk.assign (session.pcm_buffer .begin (), session.pcm_buffer .begin () + n_samples_step);
130
148
session.pcm_buffer .erase (session.pcm_buffer .begin (), session.pcm_buffer .begin () + n_samples_step);
131
149
}
150
+
151
+ size_t available = session.pcm_buffer .size ();
152
+ if (available >= n_samples_step) {
153
+ size_t take = std::min (available, (size_t )n_samples_step);
154
+ audio_chunk.assign (
155
+ session.pcm_buffer .begin (),
156
+ session.pcm_buffer .begin () + take
157
+ );
158
+ session.pcm_buffer .erase (
159
+ session.pcm_buffer .begin (),
160
+ session.pcm_buffer .begin () + take
161
+ );
162
+ }
163
+ // fprintf(stderr, "End of read chunk: %s\n", client_id);
132
164
}
133
165
134
166
if (!audio_chunk.empty ()) {
135
- whisper_full_params wparams = whisper_full_default_params (WHISPER_SAMPLING_GREEDY);
167
+
168
+ // fprintf(stderr, "Good, chunk not empty for: %s\n", client_id);
169
+ whisper_full_params wparams = whisper_full_default_params (WHISPER_SAMPLING_BEAM_SEARCH);
136
170
wparams.n_threads = params.n_threads ;
137
- wparams.print_progress = false ;
171
+ wparams.language = " en" ;
172
+ wparams.print_realtime = false ;
173
+ wparams.print_progress = false ;
174
+ wparams.single_segment = true ;
175
+ wparams.max_tokens = 32 ;
138
176
139
177
if (whisper_full (ctx, wparams, audio_chunk.data (), audio_chunk.size ()) == 0 ) {
178
+
179
+ fprintf (stderr, " whisper_full == 0: %s\n " , client_id);
140
180
const int n_segments = whisper_full_n_segments (ctx);
141
181
for (int i = 0 ; i < n_segments; ++i) {
142
182
const char * text = whisper_full_get_segment_text (ctx, i);
@@ -151,7 +191,7 @@ class WhisperServer {
151
191
int main (int argc, char ** argv) {
152
192
server_params params;
153
193
params.port = 9002 ;
154
- params.model = " ../models/for-tests-ggml-base .bin" ;
194
+ params.model = " ggml-large-v3-turbo .bin" ;
155
195
156
196
WhisperServer server (params);
157
197
server.run ();
0 commit comments