
Commit 22f6a81

Author: lexasub

dirty experiments with websocket-stream

1 parent f8a3509 commit 22f6a81

File tree: 5 files changed (+297, -0 lines)


examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ else()
     add_subdirectory(quantize)
     if (WHISPER_SDL2)
         add_subdirectory(stream)
+        add_subdirectory(websocket-stream)
         add_subdirectory(command)
         add_subdirectory(talk-llama)
         add_subdirectory(lsp)
examples/websocket-stream/CMakeLists.txt

Lines changed: 10 additions & 0 deletions (new file)

if (WHISPER_SDL2)
    set(TARGET whisper-stream-websocket)
    add_executable(${TARGET} stream.cpp)
    find_package(ixwebsocket)
    include(DefaultTargetOptions)

    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ixwebsocket z ${CMAKE_THREAD_LIBS_INIT})

    install(TARGETS ${TARGET} RUNTIME)
endif ()
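
Note that `find_package(ixwebsocket)` assumes the IXWebSocket library (and zlib, linked here as `z`) is already installed where CMake can locate it; the commit does not vendor either dependency. With those in place, the target is built by the same `cmake -B build -DWHISPER_SDL2=ON` / `cmake --build build --config Release` invocation shown in the README below.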

examples/websocket-stream/README.md

Lines changed: 51 additions & 0 deletions (new file)

# whisper.cpp/examples/websocket-stream

This is a naive example of performing real-time inference on audio from your microphone.
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

```bash
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```

https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4

## Sliding window mode with VAD

Setting the `--step` argument to `0` enables the sliding window mode:

```bash
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
```

In this mode, the tool transcribes only after some speech activity is detected. A very
basic VAD detector is used, but in theory a more sophisticated approach could be added. The
`-vth` argument determines the VAD threshold: higher values make it detect silence more often.
It's best to tune it for the specific use case, but a value around `0.6` should work well in general.
When silence is detected, the tool transcribes the last `--length` milliseconds of audio and outputs
a transcription block that is suitable for parsing.

## Building

The `whisper-stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
# Install SDL2
# On Debian-based Linux distributions:
sudo apt-get install libsdl2-dev

# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel

# Install SDL2 on macOS
brew install sdl2

cmake -B build -DWHISPER_SDL2=ON
cmake --build build --config Release

./build/bin/whisper-stream
```

## Web version

This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

examples/websocket-stream/stream.cpp

Lines changed: 159 additions & 0 deletions (new file)

#include "common.h"
#include "common-whisper.h"
#include "whisper.h"
#include "ixwebsocket/IXWebSocketServer.h"
#include "ixwebsocket/IXNetSystem.h"

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

struct ClientSession {
    std::vector<float> pcm_buffer;
    std::mutex mtx;
    std::condition_variable cv;
    std::atomic<bool> active{false};
    std::atomic<bool> terminate{false};
};

struct server_params {
    int32_t port      = 9002;
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    std::string model = "models/ggml-base.en.bin";
    bool use_gpu      = true;
};

class WhisperServer {
private:
    server_params params;
    ix::WebSocketServer server;
    std::unordered_map<std::string, std::unique_ptr<ClientSession>> clients;
    std::mutex clients_mtx;
    whisper_context * ctx = nullptr;

public:
    WhisperServer(const server_params & params) : params(params), server(params.port, "0.0.0.0") {
        ix::initNetSystem();

        whisper_context_params cparams = whisper_context_default_params();
        cparams.use_gpu = params.use_gpu;
        ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

        server.setTLSOptions({});
        server.setOnClientMessageCallback([this](std::shared_ptr<ix::ConnectionState> connectionState, ix::WebSocket & webSocket, const ix::WebSocketMessagePtr & msg) {
            this->handleMessage(connectionState, webSocket, msg);
        });
    }

    ~WhisperServer() {
        server.stop();
        ix::uninitNetSystem();
        if (ctx) whisper_free(ctx);
    }

    void run() {
        server.listenAndStart();
        fprintf(stderr, "Server started on port %d\n", params.port);

        while (true) {
            std::this_thread::sleep_for(std::chrono::seconds(1));
        }
    }

private:
    void handleMessage(std::shared_ptr<ix::ConnectionState> connectionState, ix::WebSocket & webSocket, const ix::WebSocketMessagePtr & msg) {
        const auto client_id = connectionState->getId();

        if (msg->type == ix::WebSocketMessageType::Open) {
            fprintf(stderr, "New client connected: %s\n", client_id.c_str());
            std::lock_guard<std::mutex> lock(clients_mtx);
            clients[client_id] = std::make_unique<ClientSession>();
            clients[client_id]->active = true;
            // one detached worker per client runs the transcription loop
            std::thread(&WhisperServer::processClientAudio, this, client_id).detach();
        }
        else if (msg->type == ix::WebSocketMessageType::Close) {
            fprintf(stderr, "Client disconnected: %s\n", client_id.c_str());
            std::lock_guard<std::mutex> lock(clients_mtx);
            if (clients.count(client_id)) {
                clients[client_id]->terminate = true;
                clients[client_id]->cv.notify_one();
                clients.erase(client_id);
            }
        }
        else if (msg->type == ix::WebSocketMessageType::Message) {
            std::lock_guard<std::mutex> lock(clients_mtx);
            if (auto it = clients.find(client_id); it != clients.end()) {
                auto & session = *it->second;
                std::lock_guard<std::mutex> session_lock(session.mtx);

                // PCM16 -> FLOAT32
                const int16_t * pcm16 = reinterpret_cast<const int16_t *>(msg->str.data());
                const size_t num_samples = msg->str.size() / sizeof(int16_t);

                session.pcm_buffer.reserve(session.pcm_buffer.size() + num_samples);
                for (size_t i = 0; i < num_samples; ++i) {
                    session.pcm_buffer.push_back(pcm16[i] / 32768.0f);
                }

                session.cv.notify_one();
            }
        }
    }

    void processClientAudio(std::string client_id) {
        constexpr int step_ms = 3000;
        constexpr int n_samples_step = (1e-3 * step_ms) * WHISPER_SAMPLE_RATE;

        while (true) {
            std::vector<float> audio_chunk;
            {
                std::unique_lock<std::mutex> lock(clients_mtx);
                if (!clients.count(client_id)) break;
                auto & session = *clients[client_id];

                std::unique_lock<std::mutex> session_lock(session.mtx);
                session.cv.wait_for(session_lock, std::chrono::milliseconds(100), [&session] {
                    return session.pcm_buffer.size() >= n_samples_step || session.terminate;
                });

                if (session.terminate) break;

                // take one step-sized chunk off the front of the buffer
                if (session.pcm_buffer.size() >= n_samples_step) {
                    audio_chunk.assign(session.pcm_buffer.begin(), session.pcm_buffer.begin() + n_samples_step);
                    session.pcm_buffer.erase(session.pcm_buffer.begin(), session.pcm_buffer.begin() + n_samples_step);
                }
            }

            if (!audio_chunk.empty()) {
                whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
                wparams.n_threads      = params.n_threads;
                wparams.print_progress = false;

                if (whisper_full(ctx, wparams, audio_chunk.data(), audio_chunk.size()) == 0) {
                    const int n_segments = whisper_full_n_segments(ctx);
                    for (int i = 0; i < n_segments; ++i) {
                        const char * text = whisper_full_get_segment_text(ctx, i);
                        fprintf(stdout, "[Client %s] %s\n", client_id.c_str(), text);
                    }
                }
            }
        }
    }
};

int main(int argc, char ** argv) {
    server_params params;
    params.port  = 9002;
    params.model = "../models/for-tests-ggml-base.bin";

    WhisperServer server(params);
    server.run();
    return 0;
}
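
The server accepts binary WebSocket frames of raw little-endian 16-bit PCM, mono, at `WHISPER_SAMPLE_RATE` (16 kHz), buffers them per client, and runs inference on 3-second chunks; transcriptions are printed to the server's stdout and nothing is sent back to the client. A minimal sketch of a microphone client, assuming the `websockets` and `sounddevice` Python packages are installed and the server is running locally on port 9002:

```python
# Hypothetical client sketch; assumes `pip install websockets sounddevice`
# and the server above listening on ws://localhost:9002.
import sounddevice as sd
from websockets.sync.client import connect

SAMPLE_RATE = 16000              # must match WHISPER_SAMPLE_RATE on the server
CHUNK_FRAMES = SAMPLE_RATE // 2  # ~500 ms of audio per WebSocket frame

with connect("ws://localhost:9002") as ws:
    # raw int16 mono stream -- the exact format stream.cpp converts to float32
    with sd.RawInputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16") as mic:
        while True:
            data, overflowed = mic.read(CHUNK_FRAMES)
            ws.send(bytes(data))  # binary frame, appended to the client's pcm_buffer
```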
examples/websocket-stream/test_whisper_server.py

Lines changed: 76 additions & 0 deletions (new file)

# test_whisper_server.py
import asyncio
import subprocess
import threading
import time
from queue import Queue

import numpy as np
import websockets

# Test configuration
SERVER_URL = "ws://localhost:9002"
NUM_CLIENTS = 3
TEST_DURATION = 15  # seconds
SAMPLE_RATE = 16000

# Queue for collecting results
results = Queue()

# Test audio generator (a sine wave with a different frequency per client)
def generate_audio(client_id, duration_sec):
    t = np.linspace(0, duration_sec, int(SAMPLE_RATE * duration_sec), False)
    freq = 440 + client_id * 100  # unique frequency per client
    audio = np.sin(2 * np.pi * freq * t) * 0.5
    return (audio * 32767).astype(np.int16).tobytes()

async def client_worker(client_id):
    try:
        async with websockets.connect(SERVER_URL) as ws:
            print(f"Client {client_id} connected")

            start_time = time.time()
            while time.time() - start_time < TEST_DURATION:
                audio_data = generate_audio(client_id, 0.5)  # 500 ms chunks
                await ws.send(audio_data)
                await asyncio.sleep(0.1)

            await ws.close()
        results.put((client_id, "OK"))
    except Exception as e:
        results.put((client_id, f"Error: {str(e)}"))

def run_server():
    # Replace with the path to your binary
    subprocess.run(["./bin/whisper-stream-websocket"], check=True)

def test_multi_clients():
    # Start the server in a separate thread
    #server_thread = threading.Thread(target=run_server, daemon=True)
    #server_thread.start()
    #time.sleep(2)  # give the server time to start

    # Start the clients
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    tasks = []
    for i in range(NUM_CLIENTS):
        tasks.append(client_worker(i))

    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()

    # Check the results
    all_ok = True
    while not results.empty():
        client_id, status = results.get()
        print(f"Client {client_id}: {status}")
        if status != "OK":
            all_ok = False

    assert all_ok, "Some clients failed"
    print("All clients finished successfully")

if __name__ == "__main__":
    test_multi_clients()
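
To exercise the server, start the binary first (the `run_server` helper that would launch it in a background thread is left commented out), then run `python test_whisper_server.py`. The sine-wave input only checks connection handling and chunking; the transcriptions the server prints for such audio are not expected to be meaningful.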
