second try

lexasub · lexasub · commit 98c916b3ac39 · 2025-04-16T09:05:16.000+04:00
diff --git a/examples/websocket-stream/stream.cpp b/examples/websocket-stream/stream.cpp
@@ -88,14 +88,29 @@ class WhisperServer {
             }
         }
         else if(msg->type == ix::WebSocketMessageType::Message) {
-            std::lock_guard<std::mutex> lock(clients_mtx);
+            //std::lock_guard<std::mutex> lock(clients_mtx);
+
             if(auto it = clients.find(client_id); it != clients.end()) {
                 auto& session = *it->second;
                 std::lock_guard<std::mutex> session_lock(session.mtx);
                 
+	       	if (!msg->binary) {
+                    webSocket.sendText("Error: Expected binary data");
+                    fprintf(stderr, "Client %s sent text data\n", client_id.c_str());
+                    return;
+                }
+
+                const auto &data = msg->str;
+                size_t data_size = data.size();
+                
+                if (data_size % sizeof(int16_t) != 0) {
+                    webSocket.sendText("Error: Invalid data size");
+                    fprintf(stderr, "Invalid data size from %s: %zu\n", client_id.c_str(), data_size);
+                    return;
+                }	
                 //PCM16 -> FLOAT32
-                const int16_t* pcm16 = reinterpret_cast<const int16_t*>(msg->str.data());
-                const size_t num_samples = msg->str.size() / sizeof(int16_t);
+                const int16_t* pcm16 = reinterpret_cast<const int16_t*>(data.data());
+                const size_t num_samples =  data_size / sizeof(int16_t);
                 
                 session.pcm_buffer.reserve(session.pcm_buffer.size() + num_samples);
                 for(size_t i = 0; i < num_samples; ++i) {
@@ -108,12 +123,15 @@ class WhisperServer {
     }
 
     void processClientAudio(std::string client_id) {
-        constexpr int step_ms = 3000;
+        constexpr int step_ms = 300;
         constexpr int n_samples_step = (1e-3 * step_ms) * WHISPER_SAMPLE_RATE;
         
+        fprintf(stderr, "Started thread for: %s\n", client_id);
         while(true) {
             std::vector<float> audio_chunk;
             {
+
+        	//fprintf(stderr, "Started read chunk from: %s\n", client_id);
                 std::unique_lock<std::mutex> lock(clients_mtx);
                 if(!clients.count(client_id)) break;
                 auto& session = *clients[client_id];
@@ -129,14 +147,36 @@ class WhisperServer {
                     audio_chunk.assign(session.pcm_buffer.begin(), session.pcm_buffer.begin() + n_samples_step);
                     session.pcm_buffer.erase(session.pcm_buffer.begin(), session.pcm_buffer.begin() + n_samples_step);
                 }
+
+		size_t available = session.pcm_buffer.size();
+                if(available >= n_samples_step) {
+                    size_t take = std::min(available, (size_t)n_samples_step);
+                    audio_chunk.assign(
+                        session.pcm_buffer.begin(),
+                        session.pcm_buffer.begin() + take
+                    );
+                    session.pcm_buffer.erase(
+                        session.pcm_buffer.begin(),
+                        session.pcm_buffer.begin() + take
+                    );
+                }
+        	//fprintf(stderr, "End of read chunk: %s\n", client_id);
             }
             
             if(!audio_chunk.empty()) {
-                whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+        	//fprintf(stderr, "Good, chunk not empty for: %s\n", client_id);
+                whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
                 wparams.n_threads = params.n_threads;
-                wparams.print_progress = false;
+		wparams.language = "en";                    
+		wparams.print_realtime   = false;
+		wparams.print_progress   = false;
+		wparams.single_segment   = true;            
+		wparams.max_tokens       = 32;              
                 
                 if(whisper_full(ctx, wparams, audio_chunk.data(), audio_chunk.size()) == 0) {
+
+        	    fprintf(stderr, "whisper_full == 0: %s\n", client_id);
                     const int n_segments = whisper_full_n_segments(ctx);
                     for(int i = 0; i < n_segments; ++i) {
                         const char* text = whisper_full_get_segment_text(ctx, i);
@@ -151,7 +191,7 @@ class WhisperServer {
 int main(int argc, char** argv) {
     server_params params;
     params.port = 9002;
-    params.model = "../models/for-tests-ggml-base.bin";
+    params.model = "ggml-large-v3-turbo.bin";
     
     WhisperServer server(params);
     server.run();