@@ -5118,36 +5118,39 @@ struct server_routes {
        // Fast path: an instance for this model is already registered.
        if (map_model_to_port.find(custom_model) != map_model_to_port.end()) {
            return; // already loaded, do nothing
        }
        // TODO: maybe unload least recently used model if too many models are loaded?
        // Block until the spawned instance reports status "loaded" back to the router
        // (via POST /models/status), or until its registry entry disappears, which is
        // treated as a failed/removed load.
        // NOTE(review): polls shared map_model_to_port without synchronization and has
        // no timeout -- confirm the handlers run single-threaded or add a lock/deadline.
        auto wait_until_loaded = [this, custom_model]() {
            while (true) {
                bool load_failed = map_model_to_port.find(custom_model) == map_model_to_port.end(); // model is deleted
                bool is_loaded = !load_failed && map_model_to_port[custom_model].status == "loaded";
                if (is_loaded || load_failed) {
                    return;
                }
                std::this_thread::sleep_for(std::chrono::milliseconds(500));
            }
        };
        // The model must be among the locally cached models; if so, spawn a child
        // server instance for it on demand and wait for it to come up.
        auto models = common_list_cached_models();
        for (const auto & model : models) {
            auto m = model.to_string();
            if (m == custom_model) {
                server_router_create_instance(envp, map_model_to_port, m, params.port);
                wait_until_loaded();
                SRV_INF("model %s loaded on-demand\n", custom_model.c_str());
                return;
            }
        }
    }
5131- std::string get_one_if_has_only_one (std::string & custom_model) {
5132- // HACKYYYY, but for demo purpose; we get the only model if there's only one
5133- if (map_model_to_port.size () == 1 ) {
5134- return map_model_to_port.begin ()->first ;
5135- }
5136- return custom_model;
5137- }
51385143 server_http_context::handler_t proxy_get = [this ](const server_http_req & req) {
51395144 std::string method = " GET" ;
51405145 std::string model = req.get_param (" model" );
51415146 maybe_load_it_why_not (model);
5142- model = get_one_if_has_only_one (model);
51435147 return handle_proxy (req, method, model);
51445148 };
51455149 server_http_context::handler_t proxy_post = [this ](const server_http_req & req) {
51465150 std::string method = " POST" ;
51475151 json body = json::parse (req.body );
51485152 std::string model = json_value (body, " model" , std::string ());
51495153 maybe_load_it_why_not (model);
5150- model = get_one_if_has_only_one (model);
51515154 return handle_proxy (req, method, model);
51525155 };
51535156 server_http_res_ptr handle_proxy (const server_http_req & req, std::string & method, std::string model) {
@@ -5166,28 +5169,41 @@ struct server_routes {
        auto res = std::make_unique<server_res_generator>(ctx_server);
        json body = json::parse(req.body);
        std::string model = json_value(body, "model", std::string());
        // Spawn a dedicated server process for the requested model; the router's
        // own port is passed so the child can report its load status back.
        int status = server_router_create_instance(envp, map_model_to_port, model, params.port);
        if (status != 0) {
            res->error(format_error_response("fail to start the process", ERROR_TYPE_SERVER));
            return res;
        }
        res->ok({{"success", true}});
        return res;
    };
5180+ server_http_context::handler_t post_router_models_status = [this ](const server_http_req & req) {
5181+ auto res = std::make_unique<server_res_generator>(ctx_server);
5182+ json body = json::parse (req.body );
5183+ std::string model = json_value (body, " model" , std::string ());
5184+ std::string value = json_value (body, " value" , std::string ());
5185+ if (map_model_to_port.find (model) == map_model_to_port.end ()) {
5186+ res->error (format_error_response (" model parameter is invalid" , ERROR_TYPE_INVALID_REQUEST));
5187+ return res;
5188+ }
5189+ map_model_to_port[model].status = value;
5190+ res->ok ({{" success" , true }});
5191+ return res;
5192+ };
51775193 server_http_context::handler_t get_router_models = [this ](const server_http_req &) {
51785194 auto res = std::make_unique<server_res_generator>(ctx_server);
51795195 json models_json = json::array ();
51805196 auto models = common_list_cached_models ();
51815197 for (const auto & model : models) {
51825198 auto model_name = model.to_string ();
5183- bool loaded = map_model_to_port.find (model.to_string ()) != map_model_to_port.end (); // TODO: thread safety
5199+ bool found = map_model_to_port.find (model.to_string ()) != map_model_to_port.end (); // TODO: thread safety
51845200 models_json.push_back (json {
51855201 {" model" , model_name},
51865202 {" name" , model_name},
51875203 {" id" , model_name},
51885204 // TODO: other fields...
51895205 {" status" , {
5190- {" value" , loaded ? " loaded " : " unloaded" }
5206+ {" value" , found ? map_model_to_port[model_name]. status : " unloaded" }
51915207 }},
51925208 });
51935209 }
@@ -5198,7 +5214,6 @@ struct server_routes {
        auto res = std::make_unique<server_res_generator>(ctx_server);
        json body = json::parse(req.body);
        std::string model = json_value(body, "model", std::string());
        // Reject requests for models that have no running instance registered.
        if (map_model_to_port.find(model) == map_model_to_port.end()) {
            res->error(format_error_response("model parameter is invalid", ERROR_TYPE_INVALID_REQUEST));
            return res;
        }
@@ -5673,8 +5688,9 @@ int main(int argc, char ** argv, char ** envp) {

        // custom routes for router: model management endpoints, registered only
        // when this process acts as the router.
        routes.get_models = routes.get_router_models;
        ctx_http.post("/models/load",   ex_wrapper(routes.post_router_models_load));
        ctx_http.post("/models/unload", ex_wrapper(routes.post_router_models_unload));
        ctx_http.post("/models/status", ex_wrapper(routes.post_router_models_status));
    }

    ctx_http.get("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
@@ -5779,6 +5795,21 @@ if (!is_router_server) { // HACKY
57795795
if (!is_router_server) { // HACKY

    // notify to main router if needed
    // When spawned as a child of the router, LLAMA_SERVER_ROUTER_PORT carries the
    // router's port; report status "loaded" so the router's /models/status endpoint
    // can update its registry. Best-effort: the response body is read and ignored.
    char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
    if (router_port != nullptr) {
        SRV_INF("%s: notifying to main router on port %s\n", __func__, router_port);
        // NOTE(review): std::atoi returns 0 on malformed input with no error
        // report -- confirm the router always sets a valid numeric port.
        server_http_client notify_router(
            "POST", params.hostname, std::atoi(router_port),
            "/models/status",
            { {"Content-Type", "application/json"} },
            json {{ "model", params.model_alias }, { "value", "loaded" }}.dump(),
            []() { return false; }
        );
        std::string dummy;
        notify_router.next(dummy); // ignore the response
    }

    LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
    LOG_INF("%s: starting the main loop...\n", __func__);
    // this call blocks the main thread until queue_tasks.terminate() is called
0 commit comments