Skip to content

Commit 8644a21

Browse files
router : add real-time model swap notifications via SSE
Implement a notification sink to stream lifecycle events during model swaps. Notifications are sent via delta.reasoning_content (OpenAI-compatible). Progress events are emitted during ensure_running(): unloading the previous model(s), loading the new model, and backend-ready confirmation. proxy_request() is refactored to invoke ensure_running() with an optional sink attached for streaming feedback. Enabled via config: { "router": { "notify_model_swap": true } }
1 parent c5f5589 commit 8644a21

File tree

7 files changed

+173
-102
lines changed

7 files changed

+173
-102
lines changed

tools/router/router-app.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ bool RouterApp::ensure_running(const std::string & model_name, std::string & err
5858
target_group.c_str());
5959

6060
terminate_process(it_proc->second);
61+
notify_progress("[llama-router] Unloading " + it_proc->first + " (" + running_group + ")\n");
6162
wait_for_process_exit(it_proc->second, ROUTER_PROCESS_SHUTDOWN_TIMEOUT_MS);
6263
model_ports.erase(it_proc->first);
6364
it_proc = processes.erase(it_proc);
@@ -102,6 +103,8 @@ bool RouterApp::ensure_running(const std::string & model_name, std::string & err
102103
last_spawned_model = model_name;
103104
LOG_INF("Spawned %s (group '%s') with %zu args\n", model_name.c_str(), target_group.c_str(), command.size());
104105

106+
notify_progress("[llama-router] Loading " + model_name + " (" + target_group + ")\n");
107+
105108
const std::string health_endpoint = spawn_cfg.health_endpoint.empty() ? "/health" : spawn_cfg.health_endpoint;
106109
if (!wait_for_backend_ready(port, health_endpoint, ROUTER_BACKEND_READY_TIMEOUT_MS, &proc_it->second)) {
107110
error = "backend not ready";
@@ -116,6 +119,7 @@ bool RouterApp::ensure_running(const std::string & model_name, std::string & err
116119
}
117120

118121
LOG_INF("Backend ready on port %d\n", port);
122+
notify_progress("[llama-router] Backend ready, generating response...\n");
119123
return true;
120124
}
121125

@@ -162,3 +166,20 @@ void RouterApp::stop_all() {
162166
}
163167
processes.clear();
164168
}
169+
170+
void RouterApp::set_notification_sink(NotificationSink sink) {
171+
std::lock_guard<std::mutex> lock(notification_mutex);
172+
notification_sink = std::move(sink);
173+
}
174+
175+
void RouterApp::clear_notification_sink() {
176+
std::lock_guard<std::mutex> lock(notification_mutex);
177+
notification_sink.reset();
178+
}
179+
180+
void RouterApp::notify_progress(const std::string & message) {
181+
std::lock_guard<std::mutex> lock(notification_mutex);
182+
if (notification_sink) {
183+
(*notification_sink)(ProgressNotification{message});
184+
}
185+
}

tools/router/router-app.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55

66
#include <atomic>
77
#include <mutex>
8+
#include <optional>
89
#include <string>
910
#include <unordered_map>
11+
#include <vector>
1012

1113
class RouterApp {
1214
public:
@@ -20,16 +22,22 @@ class RouterApp {
2022
void stop_all();
2123
void update_config(RouterConfig cfg);
2224

25+
void set_notification_sink(NotificationSink sink);
26+
void clear_notification_sink();
27+
2328
const RouterConfig & get_config() const { return config; }
2429

2530
private:
2631
RouterConfig config;
2732
std::atomic<int> next_port;
2833
std::mutex mutex;
34+
std::optional<NotificationSink> notification_sink;
35+
std::mutex notification_mutex;
2936
std::unordered_map<std::string, ModelConfig> model_lookup;
3037
std::unordered_map<std::string, ProcessHandle> processes;
3138
std::unordered_map<std::string, int> model_ports;
3239
std::string last_spawned_model;
3340

3441
SpawnConfig resolve_spawn_config(const ModelConfig & cfg) const;
42+
void notify_progress(const std::string & message);
3543
};

tools/router/router-config.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ const RouterOptions & get_default_router_options() {
135135
/*connection_timeout_s =*/ 5,
136136
/*read_timeout_s =*/ 600,
137137
/*admin_token =*/ "",
138+
/*notify_model_swap =*/ false,
138139
};
139140

140141
return opts;
@@ -192,6 +193,10 @@ void write_config_file(const RouterConfig & cfg, const std::string & path) {
192193
out["router"]["admin_token"] = cfg.router.admin_token;
193194
}
194195

196+
if (cfg.router.notify_model_swap) {
197+
out["router"]["notify_model_swap"] = true;
198+
}
199+
195200
out["models"] = json::array();
196201
for (const auto & m : cfg.models) {
197202
json obj;
@@ -316,6 +321,7 @@ RouterConfig load_config(const std::string & path) {
316321
if (r.contains("connection_timeout_s")) cfg.router.connection_timeout_s = r["connection_timeout_s"].get<int>();
317322
if (r.contains("read_timeout_s")) cfg.router.read_timeout_s = r["read_timeout_s"].get<int>();
318323
if (r.contains("admin_token")) cfg.router.admin_token = r["admin_token"].get<std::string>();
324+
if (r.contains("notify_model_swap")) cfg.router.notify_model_swap = r["notify_model_swap"].get<bool>();
319325
}
320326
if (data.contains("models")) {
321327
for (const auto & m : data["models"]) {

tools/router/router-config.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
#pragma once
22

3+
#include <functional>
4+
#include <optional>
35
#include <string>
46
#include <vector>
57

8+
struct ProgressNotification {
9+
std::string message;
10+
};
11+
12+
using NotificationSink = std::function<void(const ProgressNotification &)>;
13+
614
struct SpawnConfig {
715
std::vector<std::string> command;
816
std::vector<std::string> proxy_endpoints;
@@ -28,6 +36,7 @@ struct RouterOptions {
2836
int connection_timeout_s = 5;
2937
int read_timeout_s = 600;
3038
std::string admin_token;
39+
bool notify_model_swap = false;
3140
};
3241

3342
struct RouterConfig {

tools/router/router-endpoints.cpp

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -45,17 +45,9 @@ void register_routes(httplib::Server & server, RouterApp & app) {
4545
res.set_content("no models running", "text/plain");
4646
return;
4747
}
48-
49-
std::string error;
50-
if (!app.ensure_running(model, error)) {
51-
LOG_WRN("Failed to ensure last spawned model %s: %s\n", model.c_str(), error.c_str());
52-
res.status = 503;
53-
res.set_content("no models running", "text/plain");
54-
return;
55-
}
5648
LOG_INF("Proxying %s to last spawned model %s\n", req.path.c_str(), model.c_str());
5749
const auto spawn_cfg = app.get_spawn_config(model);
58-
proxy_request(req, res, app.upstream_for(model), app.get_config().router, spawn_cfg.proxy_endpoints);
50+
proxy_request(req, res, app, model, spawn_cfg.proxy_endpoints);
5951
};
6052

6153
server.Get("/props", proxy_last_spawned);
@@ -68,17 +60,10 @@ void register_routes(httplib::Server & server, RouterApp & app) {
6860
std::string model_name = model_it != req.matches.end() ? model_it->str() : std::string();
6961
++model_it;
7062
const std::string endpoint_suffix = model_it != req.matches.end() ? model_it->str() : std::string();
71-
std::string error;
72-
if (!app.ensure_running(model_name, error)) {
73-
LOG_WRN("Model %s unavailable: %s\n", model_name.c_str(), error.c_str());
74-
res.status = 404;
75-
res.set_content("{\"error\":\"model unavailable\"}", "application/json");
76-
return;
77-
}
7863
LOG_INF("Proxying %s for model %s\n", req.path.c_str(), model_name.c_str());
7964
const auto spawn_cfg = app.get_spawn_config(model_name);
8065
const std::string corrected_path = "/" + endpoint_suffix;
81-
proxy_request(req, res, app.upstream_for(model_name), app.get_config().router, spawn_cfg.proxy_endpoints, corrected_path);
66+
proxy_request(req, res, app, model_name, spawn_cfg.proxy_endpoints, corrected_path);
8267
});
8368

8469
server.Post("/v1/chat/completions", [&app](const httplib::Request & req, httplib::Response & res) {
@@ -90,17 +75,9 @@ void register_routes(httplib::Server & server, RouterApp & app) {
9075
return;
9176
}
9277

93-
std::string error;
94-
if (!app.ensure_running(model, error)) {
95-
LOG_WRN("Model %s not available: %s\n", model.c_str(), error.c_str());
96-
res.status = 404;
97-
res.set_content("{\"error\":\"" + error + "\"}", "application/json");
98-
return;
99-
}
100-
10178
LOG_INF("Proxying chat completion for model %s\n", model.c_str());
10279
const auto spawn_cfg = app.get_spawn_config(model);
103-
proxy_request(req, res, app.upstream_for(model), app.get_config().router, spawn_cfg.proxy_endpoints);
80+
proxy_request(req, res, app, model, spawn_cfg.proxy_endpoints);
10481
});
10582

10683
server.set_error_handler([](const httplib::Request &, httplib::Response & res) {

0 commit comments

Comments (0)