Commit bb5e924

Author: Piotr Stankiewicz
Allow passing GGUF splits via repeated --model args

If a segmented (split) GGUF model does not follow the expected split naming convention, it currently cannot be loaded. So, modify the argument parser to accept repeated --model arguments on the CLI; when more than one is given, treat the values as GGUF splits in the order they were specified.

Signed-off-by: Piotr Stankiewicz <[email protected]>
1 parent 5ba36f6 commit bb5e924

17 files changed (+187, -59 lines)

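The commit message above describes the new CLI behavior: when --model is given more than once, the values are collected in order and loaded as GGUF splits. As a minimal, illustrative sketch (not part of this commit), the repeated arguments ultimately map onto the existing llama_model_load_from_splits API roughly as follows; the split file names here are hypothetical placeholders.

// Illustrative only: load a model from explicitly ordered split files,
// mirroring what the repeated --model arguments drive in common.cpp.
#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    // splits as they would be given on the CLI, in order:
    //   --model my-model-part-a.gguf --model my-model-part-b.gguf
    std::vector<const char *> splits = {
        "my-model-part-a.gguf",
        "my-model-part-b.gguf",
    };

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_splits(splits.data(), splits.size(), mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model from splits\n");
        llama_backend_free();
        return 1;
    }

    // ... use the model ...

    llama_model_free(model);
    llama_backend_free();
    return 0;
}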

common/arg.cpp

Lines changed: 27 additions & 19 deletions
@@ -496,8 +496,12 @@ static bool common_download_model(
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
+    if (model.paths.size() != 1) {
+        LOG_ERR("%s: model url can only be specified with one path\n", __func__);
+        return false;
+    }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
+    if (!common_download_file_single(model.url, model.paths[0], bearer_token, offline)) {
         return false;
     }
 
@@ -508,9 +512,9 @@ static bool common_download_model(
             /*.no_alloc = */ true,
             /*.ctx      = */ NULL,
         };
-        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        auto * ctx_gguf = gguf_init_from_file(model.paths[0].c_str(), gguf_params);
         if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.paths[0].c_str());
             return false;
         }
 
@@ -529,8 +533,8 @@ static bool common_download_model(
        // Verify the first split file format
        // and extract split URL and PATH prefixes
        {
-           if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
-               LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+           if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.paths[0].c_str(), 0, n_split)) {
+               LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.paths[0].c_str(), n_split);
                return false;
           }
 
@@ -548,7 +552,7 @@ static bool common_download_model(
            char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
            llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
 
-           if (std::string(split_path) == model.path) {
+           if (std::string(split_path) == model.paths[0]) {
                continue; // skip the already downloaded file
           }
 
@@ -798,7 +802,7 @@ static handle_model_result common_params_handle_model(
        if (!model.hf_repo.empty()) {
            // short-hand to avoid specifying --hf-file -> default it to --model
            if (model.hf_file.empty()) {
-               if (model.path.empty()) {
+               if (model.paths.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                        exit(1); // built without CURL, error message already printed
@@ -811,30 +815,30 @@ static handle_model_result common_params_handle_model(
                    result.mmproj.hf_file = auto_detected.mmprojFile;
                }
            } else {
-               model.hf_file = model.path;
+               model.hf_file = model.paths[0];
           }
        }
 
            std::string model_endpoint = get_model_endpoint();
            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
            // make sure model path is present (for caching purposes)
-           if (model.path.empty()) {
+           if (model.paths.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
                std::string filename = model.hf_repo + "_" + model.hf_file;
                // to make sure we don't have any slashes in the filename
                string_replace_all(filename, "/", "_");
-               model.path = fs_get_cache_file(filename);
+               model.paths.push_back(fs_get_cache_file(filename));
           }
 
        } else if (!model.url.empty()) {
-           if (model.path.empty()) {
+           if (model.paths.empty()) {
                auto f = string_split<std::string>(model.url, '#').front();
                f = string_split<std::string>(f, '?').front();
-               model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+               model.paths.push_back(fs_get_cache_file(string_split<std::string>(f, '/').back()));
           }
 
-       } else if (model.path.empty()) {
-           model.path = model_path_default;
+       } else if (model.paths.empty() && !model_path_default.empty()) {
+           model.paths.push_back(model_path_default);
       }
    }
 
@@ -986,7 +990,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
    if (params.no_mmproj) {
        params.mmproj = {};
-   } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+   } else if (res.found_mmproj && params.mmproj.paths.empty() && params.mmproj.url.empty()) {
        // optionally, handle mmproj model when -hf is specified
        params.mmproj = res.mmproj;
   }
@@ -2285,7 +2289,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "path to a multimodal projector file. see tools/mtmd/README.md\n"
        "note: if -hf is used, this argument can be omitted",
        [](common_params & params, const std::string & value) {
-           params.mmproj.path = value;
+           if (params.mmproj.paths.empty()) {
+               params.mmproj.paths.push_back(value);
+           } else {
+               params.mmproj.paths[0] = value;
+           }
       }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
    add_opt(common_arg(
@@ -2597,7 +2605,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
        ),
        [](common_params & params, const std::string & value) {
-           params.model.path = value;
+           params.model.paths.push_back(value);
       }
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
@@ -3330,7 +3338,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-md", "--model-draft"}, "FNAME",
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
-           params.speculative.model.path = value;
+           params.speculative.model.paths.push_back(value);
       }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
    add_opt(common_arg(
@@ -3371,7 +3379,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-mv", "--model-vocoder"}, "FNAME",
        "vocoder model for audio generation (default: unused)",
        [](common_params & params, const std::string & value) {
-           params.vocoder.model.path = value;
+           params.vocoder.model.paths.push_back(value);
       }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(

common/common.cpp

Lines changed: 17 additions & 3 deletions
@@ -912,10 +912,24 @@ std::string fs_get_cache_file(const std::string & filename) {
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    llama_model * model = NULL;
+
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return iparams;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), mparams);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), mparams);
+    }
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.paths[0].c_str());
         return iparams;
     }
@@ -925,7 +939,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.paths[0].c_str());
         llama_model_free(model);
         return iparams;
     }

common/common.h

Lines changed: 4 additions & 4 deletions
@@ -190,10 +190,10 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path    = ""; // model local path // NOLINT
-    std::string url     = ""; // model url to download // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
+    std::vector<std::string> paths = {}; // model local path // NOLINT
+    std::string url                = ""; // model url to download // NOLINT
+    std::string hf_repo            = ""; // HF repo // NOLINT
+    std::string hf_file            = ""; // HF file // NOLINT
 };
 
 struct common_params_speculative {

examples/batched/batched.cpp

Lines changed: 14 additions & 1 deletion
@@ -41,7 +41,20 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);

examples/diffusion/diffusion-cli.cpp

Lines changed: 16 additions & 2 deletions
@@ -548,9 +548,23 @@ int main(int argc, char ** argv) {
     model_params.use_mlock     = params.use_mlock;
     model_params.check_tensors = params.check_tensors;
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        LOG_ERR("error: failed to load model 'model path not specified'\n");
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
+
     if (!model) {
-        LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str());
+        LOG_ERR("error: failed to load model '%s'\n", params.model.paths[0].c_str());
         return 1;
     }

examples/gritlm/gritlm.cpp

Lines changed: 14 additions & 1 deletion
@@ -168,7 +168,20 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        fprintf(stderr, "failed to load model 'model path not specified'\n");
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), mparams);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), mparams);
+    }
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);

examples/parallel/parallel.cpp

Lines changed: 1 addition & 1 deletion
@@ -495,7 +495,7 @@ int main(int argc, char ** argv) {
         params.prompt_file = "used built-in defaults";
     }
     LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
+    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.paths[0].c_str());
 
     LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt) / (t_main_end - t_main_start) * 1e6);
     LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen) / (t_main_end - t_main_start) * 1e6);

examples/passkey/passkey.cpp

Lines changed: 14 additions & 1 deletion
@@ -64,7 +64,20 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model;
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);

examples/speculative-simple/speculative-simple.cpp

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.path.empty()) {
+    if (params.speculative.model.paths.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     ctx_dft = llama_init_dft.context.get();
 
     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
-        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
+        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.paths[0].c_str(), params.model.paths[0].c_str());
     }
 
     // Tokenize the prompt

examples/speculative/speculative.cpp

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.path.empty()) {
+    if (params.speculative.model.paths.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
