server : PoC implementation of "interim" server

ngxson · ngxson · commit a3eb12f53f29 · 2025-05-09T11:17:41.000+02:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -834,6 +834,26 @@ static std::string get_all_kv_cache_types() {
 // CLI argument parsing functions
 //
 
+// handle model and download
+void common_params_handle_models(enum llama_example cur_ex, common_params & params) {
+    auto res = common_params_handle_model(params.model, params.hf_token, "");
+    if (params.no_mmproj) {
+        params.mmproj = {};
+    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        // optionally, handle mmproj model when -hf is specified
+        params.mmproj = res.mmproj;
+    }
+    // only download mmproj if the current example is using it
+    for (auto & ex : mmproj_examples) {
+        if (cur_ex == ex) {
+            common_params_handle_model(params.mmproj,    params.hf_token, "");
+            break;
+        }
+    }
+    common_params_handle_model(params.speculative.model, params.hf_token, "");
+    common_params_handle_model(params.vocoder.model,     params.hf_token, "");
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     std::string arg;
     const std::string arg_prefix = "--";
@@ -933,24 +953,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     }
 
     // handle model and download
-    {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (auto & ex : mmproj_examples) {
-            if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj,    params.hf_token, "");
-                break;
-            }
-        }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model,     params.hf_token, "");
-    }
+    common_params_handle_models(ctx_arg.ex, params);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -2486,10 +2489,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
             ? std::string("model path from which to load base model")
-            : string_format(
-                "model path (default: `models/$filename` with filename from `--hf-file` "
-                "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
-            ),
+            : "model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set)",
         [](common_params & params, const std::string & value) {
             params.model.path = value;
         }
diff --git a/common/arg.h b/common/arg.h
@@ -80,6 +80,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 bool common_has_curl();
 
+// handle model and download
+void common_params_handle_models(enum llama_example cur_ex, common_params & params);
+
 struct common_remote_params {
     std::vector<std::string> headers;
     long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
diff --git a/common/common.h b/common/common.h
@@ -23,8 +23,6 @@
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)
 
-#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
-
 struct common_adapter_lora_info {
     std::string path;
     float scale;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp