Skip to content

Commit 7a7de2a

Browse files
committed
fully working model management
1 parent 016f8b4 commit 7a7de2a

File tree

4 files changed

+439
-1
lines changed

4 files changed

+439
-1
lines changed

tools/server/server-http.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,3 +383,88 @@ void server_http_context::post(const std::string & path, server_http_context::ha
383383
});
384384
}
385385

386+
387+
//
388+
// server_http_client
389+
//
390+
391+
// Proxies one HTTP request to a downstream server (host:port) and exposes the
// response incrementally through the server_http_res interface.
//
// A detached writer thread performs the request and pushes messages into a
// shared pipe; the constructor blocks until the first message (status +
// headers) arrives, and the consumer then streams body chunks via next().
//
// @param method       HTTP verb ("GET", "POST", ...)
// @param host, port   destination server
// @param path         request path forwarded verbatim
// @param headers      request headers forwarded verbatim
// @param body         request body forwarded verbatim
// @param should_stop  cancellation predicate polled while waiting on the pipe
server_http_client::server_http_client(
        const std::string & method,
        const std::string & host,
        int port,
        const std::string & path,
        const std::map<std::string, std::string> & headers,
        const std::string & body,
        const std::function<bool()> should_stop) {
    // shared between reader and writer threads
    auto cli  = std::make_shared<httplib::Client>(host, port);
    auto pipe = std::make_shared<pipe_t<msg_t>>();

    // setup Client
    cli->set_connection_timeout(0, 200000); // 200 milliseconds
    this->status = 500; // fallback, kept when no response header is ever received
    this->cleanup = [pipe]() {
        // shut both ends so the (detached) writer thread unblocks and exits
        pipe->close_read();
        pipe->close_write();
    };

    // wire up the receive end of the pipe
    this->next = [pipe, should_stop](std::string & out) -> bool {
        msg_t msg;
        bool has_next = pipe->read(msg, should_stop);
        if (!msg.data.empty()) {
            out = std::move(msg.data);
        }
        return has_next;
    };

    // wire up the HTTP client
    // note: do NOT capture `this` pointer, as it may be destroyed before the thread ends
    httplib::ResponseHandler response_handler = [pipe, cli](const httplib::Response & response) {
        msg_t msg;
        msg.status = response.status;
        for (const auto & [key, value] : response.headers) {
            msg.headers[key] = value;
        }
        pipe->write(std::move(msg)); // send headers first
        return true;
    };
    httplib::ContentReceiverWithProgress content_receiver = [pipe](const char * data, size_t data_length, size_t, size_t) {
        return pipe->write({{}, 0, std::string(data, data_length)}); // send data chunks
    };

    // prepare the request to destination server
    httplib::Request req;
    {
        req.method = method;
        req.path   = path;
        for (const auto & [key, value] : headers) {
            req.set_header(key, value);
        }
        req.body             = body;
        req.response_handler = response_handler;
        req.content_receiver = content_receiver;
    }

    // start the proxy thread
    SRV_DBG("start proxy thread %s %s\n", req.method.c_str(), req.path.c_str());
    // `mutable` so the by-value capture `req` can actually be moved into send();
    // without it the capture is const and std::move degenerates into a copy
    this->thread = std::thread([cli, pipe, req]() mutable {
        auto result = cli->send(std::move(req));
        if (result.error() != httplib::Error::Success) {
            auto err_str = httplib::to_string(result.error());
            SRV_ERR("http client error: %s\n", err_str.c_str());
            pipe->write({{}, 500, ""}); // header
            pipe->write({{}, 0, "proxy error: " + err_str}); // body
        }
        pipe->close_write(); // signal EOF to reader
        SRV_DBG("%s", "client request thread ended\n");
    });
    this->thread.detach();

    // wait for the first chunk (headers); only overwrite the 500 fallback when a
    // header message was actually received — read() can return false when
    // should_stop() fires, in which case `header` is default-initialized (status 0)
    msg_t header;
    if (pipe->read(header, should_stop)) {
        SRV_DBG("%s", "received response headers\n");
        this->status  = header.status;
        this->headers = header.headers;
    }
}

tools/server/server-http.h

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,79 @@ struct server_http_context {
7575
// for debugging
7676
std::string listening_address;
7777
};
78+
79+
80+
81+
#include <queue>
82+
#include <mutex>
83+
#include <mutex>
84+
#include <condition_variable>
85+
86+
// HTTP client used by the router server to proxy a request to a spawned model
// instance. Implements the server_http_res interface: status/headers are filled
// by the constructor, the body is streamed through the inherited next().
struct server_http_client : server_http_res {
    // invoked by the destructor; closes both ends of the internal pipe so the
    // detached writer thread unblocks and terminates
    std::function<void()> cleanup = nullptr;
public:
    server_http_client(const std::string & method,
                       const std::string & host,
                       int port,
                       const std::string & path,
                       const std::map<std::string, std::string> & headers,
                       const std::string & body,
                       const std::function<bool()> should_stop);
    ~server_http_client() {
        if (cleanup) {
            cleanup();
        }
    }
private:
    // writer thread; detached by the constructor, so it is never joined here
    std::thread thread;
    // one message on the pipe: either the response header (status + headers)
    // or a chunk of body data
    struct msg_t {
        std::map<std::string, std::string> headers;
        int status = 0;
        std::string data;
    };
    // simple single-producer / single-consumer pipe
    template<typename T>
    struct pipe_t {
        std::mutex mutex;
        std::condition_variable cv;
        std::queue<T> queue;
        std::atomic<bool> writer_closed{false};
        std::atomic<bool> reader_closed{false};
        void close_write() {
            {
                // set the flag under the lock: a reader that already checked
                // writer_closed but has not yet entered wait_for() would
                // otherwise miss this notification (lost wakeup) and stall
                // for a full poll interval
                std::lock_guard<std::mutex> lk(mutex);
                writer_closed.store(true);
            }
            cv.notify_all();
        }
        void close_read() {
            {
                // same lost-wakeup reasoning as close_write()
                std::lock_guard<std::mutex> lk(mutex);
                reader_closed.store(true);
            }
            cv.notify_all();
        }
        // blocking read; returns true with a value in `output`, false on clean
        // EOF (writer closed) or when should_stop() reports cancellation
        bool read(T & output, const std::function<bool()> & should_stop) {
            std::unique_lock<std::mutex> lk(mutex);
            // should_stop() cannot signal the cv, so poll it periodically
            constexpr auto poll_interval = std::chrono::milliseconds(500);
            while (true) {
                if (!queue.empty()) {
                    output = std::move(queue.front());
                    queue.pop();
                    return true;
                }
                if (writer_closed.load()) {
                    return false; // clean EOF
                }
                if (should_stop()) {
                    // signal broken pipe to writer; do NOT call close_read()
                    // here — the mutex is already held and is not recursive
                    reader_closed.store(true);
                    cv.notify_all();
                    return false; // cancelled / reader no longer alive
                }
                cv.wait_for(lk, poll_interval);
            }
        }
        // returns false (broken pipe) when the reader is gone
        bool write(T && data) {
            std::lock_guard<std::mutex> lk(mutex);
            if (reader_closed.load()) {
                return false; // broken pipe
            }
            queue.push(std::move(data));
            cv.notify_one();
            return true;
        }
    };
};

tools/server/server.cpp

Lines changed: 148 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5108,6 +5108,106 @@ struct server_routes {
51085108
return res;
51095109
};
51105110

5111+
//
// router server
//
// environment of the router process, forwarded when spawning child server
// instances (see server_router_create_instance usage below)
char ** envp;
// maps a model name to its spawned instance; handle_proxy reads `.port` from
// the mapped value. NOTE(review): accessed from HTTP handler threads without
// locking — confirm thread safety (the TODO in get_router_models agrees)
std::map<std::string, server_spawn_instance> map_model_to_port;
5116+
void maybe_load_it_why_not(std::string & custom_model) {
5117+
// HACKYYYY, but for demo purpose; we load the model if it's in the cached list
5118+
if (map_model_to_port.find(custom_model) != map_model_to_port.end()) {
5119+
return; // already loaded, do nothing
5120+
}
5121+
auto models = common_list_cached_models();
5122+
for (const auto & model : models) {
5123+
auto m = model.to_string();
5124+
if (m == custom_model) {
5125+
server_router_create_instance(envp, map_model_to_port, m);
5126+
std::this_thread::sleep_for(std::chrono::seconds(5)); // hacky wait for the process to be ready
5127+
return; // nice
5128+
}
5129+
}
5130+
}
5131+
std::string get_one_if_has_only_one(std::string & custom_model) {
5132+
// HACKYYYY, but for demo purpose; we get the only model if there's only one
5133+
if (map_model_to_port.size() == 1) {
5134+
return map_model_to_port.begin()->first;
5135+
}
5136+
return custom_model;
5137+
}
5138+
// Proxy handler for GET endpoints: the target model comes from the `model`
// query parameter; lazily loads it, then forwards the request.
server_http_context::handler_t proxy_get = [this](const server_http_req & req) {
    std::string method = "GET";
    std::string model  = req.get_param("model");
    maybe_load_it_why_not(model);
    // single running instance wins over (possibly empty) param
    model = get_one_if_has_only_one(model);
    return handle_proxy(req, method, model);
};
5145+
// Proxy handler for POST endpoints: the target model comes from the JSON
// body's "model" field; lazily loads it, then forwards the request.
server_http_context::handler_t proxy_post = [this](const server_http_req & req) {
    std::string method = "POST";
    std::string model;
    // tolerate empty / non-JSON bodies instead of throwing out of the handler:
    // fall back to "no model hint" and let the single-instance shortcut or the
    // invalid-model error in handle_proxy deal with it
    json body = json::parse(req.body, nullptr, /* allow_exceptions */ false);
    if (!body.is_discarded()) {
        model = json_value(body, "model", std::string());
    }
    maybe_load_it_why_not(model);
    model = get_one_if_has_only_one(model);
    return handle_proxy(req, method, model);
};
5153+
// Forwards the request to the instance serving `model`; returns a 400-style
// error response when no such instance is running.
server_http_res_ptr handle_proxy(const server_http_req & req, std::string & method, std::string model) {
    // single lookup instead of find() + operator[]
    auto it = map_model_to_port.find(model);
    if (it == map_model_to_port.end()) {
        auto res = std::make_unique<server_res_generator>(ctx_server);
        res->error(format_error_response("model parameter is invalid", ERROR_TYPE_INVALID_REQUEST));
        return server_http_res_ptr(std::move(res));
    }
    // server_http_client blocks until the downstream response headers arrive
    return server_http_res_ptr(std::make_unique<server_http_client>(
        method, params.hostname, it->second.port,
        req.path, req.headers, req.body, req.should_stop
    ));
}
5165+
// POST /models/load — spawns an instance for the model named in the JSON body.
server_http_context::handler_t post_router_models_load = [this](const server_http_req & req) {
    auto res = std::make_unique<server_res_generator>(ctx_server);
    // non-throwing parse: reject bad/missing input as a client error rather
    // than letting the exception escape the handler
    json body = json::parse(req.body, nullptr, /* allow_exceptions */ false);
    std::string model = body.is_discarded() ? std::string() : json_value(body, "model", std::string());
    if (model.empty()) {
        res->error(format_error_response("model parameter is invalid", ERROR_TYPE_INVALID_REQUEST));
        return res;
    }
    int status = server_router_create_instance(envp, map_model_to_port, model);
    if (status != 0) {
        res->error(format_error_response("fail to start the process", ERROR_TYPE_SERVER));
        return res;
    }
    res->ok({{"success", true}});
    return res;
};
5177+
// GET /models (router mode) — lists cached models and whether each one
// currently has a running instance.
server_http_context::handler_t get_router_models = [this](const server_http_req &) {
    auto res = std::make_unique<server_res_generator>(ctx_server);
    json models_json = json::array();
    auto models = common_list_cached_models();
    for (const auto & model : models) {
        auto model_name = model.to_string();
        // reuse model_name instead of calling to_string() a second time
        bool loaded = map_model_to_port.find(model_name) != map_model_to_port.end(); // TODO: thread safety
        models_json.push_back(json {
            {"model", model_name},
            {"name", model_name},
            {"id", model_name},
            // TODO: other fields...
            {"status", {
                {"value", loaded ? "loaded" : "unloaded"}
            }},
        });
    }
    res->ok({{"data", models_json}});
    return res;
};
5197+
// POST /models/unload — kills the instance serving the model named in the
// JSON body (or the only running instance when there is exactly one).
server_http_context::handler_t post_router_models_unload = [this](const server_http_req & req) {
    auto res = std::make_unique<server_res_generator>(ctx_server);
    // non-throwing parse, consistent with the other router handlers
    json body = json::parse(req.body, nullptr, /* allow_exceptions */ false);
    std::string model = body.is_discarded() ? std::string() : json_value(body, "model", std::string());
    model = get_one_if_has_only_one(model);
    // containment check only; kill_single does its own lookup by name
    if (map_model_to_port.count(model) == 0) {
        res->error(format_error_response("model parameter is invalid", ERROR_TYPE_INVALID_REQUEST));
        return res;
    }
    server_router_kill_single(map_model_to_port, model);
    res->ok({{"success", true}});
    return res;
};
5210+
51115211
private:
51125212
std::unique_ptr<server_res_generator> handle_completions_impl(
51135213
server_task_type type,
@@ -5501,7 +5601,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
55015601
};
55025602
}
55035603

5504-
int main(int argc, char ** argv) {
5604+
int main(int argc, char ** argv, char ** envp) {
55055605
// own arguments required by this example
55065606
common_params params;
55075607

@@ -5549,6 +5649,34 @@ int main(int argc, char ** argv) {
55495649
// register API routes
55505650
server_routes routes(params, ctx_server, ctx_http);
55515651

5652+
// hacky, replace handlers with proxy handlers if this is a router server
5653+
bool is_router_server = params.model.path == DEFAULT_MODEL_PATH;
5654+
if (is_router_server) {
5655+
routes.envp = envp;
5656+
routes.get_props = routes.proxy_get;
5657+
routes.post_props = routes.proxy_post;
5658+
// routes.get_models = routes.proxy_get;
5659+
routes.post_completions = routes.proxy_post;
5660+
routes.post_completions_oai = routes.proxy_post;
5661+
routes.post_chat_completions = routes.proxy_post;
5662+
routes.post_infill = routes.proxy_post;
5663+
routes.post_embeddings = routes.proxy_post;
5664+
routes.post_embeddings_oai = routes.proxy_post;
5665+
routes.post_rerank = routes.proxy_post;
5666+
routes.post_tokenize = routes.proxy_post;
5667+
routes.post_detokenize = routes.proxy_post;
5668+
routes.post_apply_template = routes.proxy_post;
5669+
routes.get_lora_adapters = routes.proxy_get;
5670+
routes.post_lora_adapters = routes.proxy_post;
5671+
routes.get_slots = routes.proxy_get;
5672+
routes.post_slots = routes.proxy_post;
5673+
5674+
// custom routes for router
5675+
routes.get_models = routes.get_router_models;
5676+
ctx_http.post("/models/load", ex_wrapper(routes.post_router_models_load));
5677+
ctx_http.post("/models/unload", ex_wrapper(routes.post_router_models_unload));
5678+
}
5679+
55525680
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
55535681
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
55545682
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
@@ -5594,6 +5722,8 @@ int main(int argc, char ** argv) {
55945722
llama_backend_free();
55955723
};
55965724

5725+
if (!is_router_server) { // HACKY
5726+
55975727
// start the HTTP server before loading the model to be able to serve /health requests
55985728
if (!ctx_http.start()) {
55995729
clean_up();
@@ -5631,6 +5761,8 @@ int main(int argc, char ** argv) {
56315761
ctx_server.queue_tasks.terminate();
56325762
};
56335763

5764+
} // end of !is_router_server
5765+
56345766
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
56355767
struct sigaction sigint_action;
56365768
sigint_action.sa_handler = signal_handler;
@@ -5645,6 +5777,8 @@ int main(int argc, char ** argv) {
56455777
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
56465778
#endif
56475779

5780+
if (!is_router_server) { // HACKY
5781+
56485782
LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
56495783
LOG_INF("%s: starting the main loop...\n", __func__);
56505784
// this call blocks the main thread until queue_tasks.terminate() is called
@@ -5655,6 +5789,19 @@ int main(int argc, char ** argv) {
56555789
ctx_http.thread.join();
56565790
}
56575791
llama_memory_breakdown_print(ctx_server.ctx);
5792+
} else {
5793+
shutdown_handler = [&](int) {
5794+
ctx_http.stop();
5795+
};
5796+
if (!ctx_http.start()) {
5797+
LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
5798+
return 1;
5799+
}
5800+
ctx_http.is_ready.store(true);
5801+
ctx_http.thread.join(); // keep the main thread alive
5802+
// kill_all_instances(routes.map_model_to_port); // why this also kill the main instance?
5803+
LOG_INF("%s: server stopped\n", __func__);
5804+
} // end of !is_router_server
56585805

56595806
return 0;
56605807
}

0 commit comments

Comments
 (0)