diff --git a/common/arg.cpp b/common/arg.cpp
index 98baac4c14da2..bc7004d080d8b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -496,8 +496,12 @@ static bool common_download_model(
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
+    if (model.paths.size() != 1) {
+        LOG_ERR("%s: model url can only be specified with one path\n", __func__);
+        return false;
+    }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
+    if (!common_download_file_single(model.url, model.paths[0], bearer_token, offline)) {
         return false;
     }
 
@@ -508,9 +512,9 @@ static bool common_download_model(
             /*.no_alloc = */ true,
             /*.ctx      = */ NULL,
         };
-        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        auto * ctx_gguf = gguf_init_from_file(model.paths[0].c_str(), gguf_params);
         if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.paths[0].c_str());
             return false;
         }
 
@@ -529,8 +533,8 @@ static bool common_download_model(
     // Verify the first split file format
     // and extract split URL and PATH prefixes
     {
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
-            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.paths[0].c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.paths[0].c_str(), n_split);
             return false;
         }
 
@@ -548,7 +552,7 @@ static bool common_download_model(
             char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
             llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
 
-            if (std::string(split_path) == model.path) {
+            if (std::string(split_path) == model.paths[0]) {
                 continue; // skip the already downloaded file
             }
 
@@ -798,7 +802,7 @@ static handle_model_result common_params_handle_model(
     if (!model.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (model.hf_file.empty()) {
-            if (model.path.empty()) {
+            if (model.paths.empty()) {
                 auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
@@ -811,30 +815,30 @@ static handle_model_result common_params_handle_model(
                     result.mmproj.hf_file = auto_detected.mmprojFile;
                 }
             } else {
-                model.hf_file = model.path;
+                model.hf_file = model.paths[0];
             }
         }
 
         std::string model_endpoint = get_model_endpoint();
         model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
         // make sure model path is present (for caching purposes)
-        if (model.path.empty()) {
+        if (model.paths.empty()) {
            // this is to avoid different repo having same file name, or same file name in different subdirs
            std::string filename = model.hf_repo + "_" + model.hf_file;
            // to make sure we don't have any slashes in the filename
            string_replace_all(filename, "/", "_");
-            model.path = fs_get_cache_file(filename);
+            model.paths.push_back(fs_get_cache_file(filename));
        }
 
    } else if (!model.url.empty()) {
-        if (model.path.empty()) {
+        if (model.paths.empty()) {
            auto f = string_split<std::string>(model.url, '#').front();
            f = string_split<std::string>(f, '?').front();
-            model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            model.paths.push_back(fs_get_cache_file(string_split<std::string>(f, '/').back()));
        }
 
-    } else if (model.path.empty()) {
-        model.path = model_path_default;
+    } else if (model.paths.empty() && !model_path_default.empty()) {
+        model.paths.push_back(model_path_default);
    }
 }
 
@@ -986,7 +990,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
     if (params.no_mmproj) {
         params.mmproj = {};
-    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+    } else if (res.found_mmproj && params.mmproj.paths.empty() && params.mmproj.url.empty()) {
         // optionally, handle mmproj model when -hf is specified
         params.mmproj = res.mmproj;
     }
@@ -2285,7 +2289,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "path to a multimodal projector file. see tools/mtmd/README.md\n"
         "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
-            params.mmproj.path = value;
+            if (params.mmproj.paths.empty()) {
+                params.mmproj.paths.push_back(value);
+            } else {
+                params.mmproj.paths[0] = value;
+            }
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
@@ -2597,7 +2605,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
         ),
         [](common_params & params, const std::string & value) {
-            params.model.path = value;
+            params.model.paths.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
     add_opt(common_arg(
@@ -3330,7 +3338,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.model.path = value;
+            params.speculative.model.paths.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
@@ -3371,7 +3379,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-mv", "--model-vocoder"}, "FNAME",
         "vocoder model for audio generation (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.vocoder.model.path = value;
+            params.vocoder.model.paths.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 67dd5404fff90..03759ca66d579 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -912,10 +912,24 @@ std::string fs_get_cache_file(const std::string & filename) {
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    llama_model * model = NULL;
+
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return iparams;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), mparams);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), mparams);
+    }
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.paths[0].c_str());
         return iparams;
     }
 
@@ -925,7 +939,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.paths[0].c_str());
         llama_model_free(model);
         return iparams;
     }
diff --git a/common/common.h b/common/common.h
index 75596e6b32979..be40d07d1dde7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -190,10 +190,10 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path = ""; // model local path // NOLINT
-    std::string url = ""; // model url to download // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
+    std::vector<std::string> paths = {}; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
 };
 
 struct common_params_speculative {
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 1a5de5928a526..485f4853f3d47 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -41,7 +41,20 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index 8431dcea8fe2a..de1cc9b2be7f4 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -548,9 +548,23 @@ int main(int argc, char ** argv) {
     model_params.use_mlock = params.use_mlock;
     model_params.check_tensors = params.check_tensors;
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        LOG_ERR("error: failed to load model 'model path not specified'\n");
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
+
     if (!model) {
-        LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str());
+        LOG_ERR("error: failed to load model '%s'\n", params.model.paths[0].c_str());
         return 1;
     }
 
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index bdab052c3390f..cde0a727e670d 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -168,7 +168,20 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        fprintf(stderr, "failed to load model 'model path not specified'\n");
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), mparams);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), mparams);
+    }
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index e48f48fc32216..2f4f5f871d3e6 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -495,7 +495,7 @@ int main(int argc, char ** argv) {
         params.prompt_file = "used built-in defaults";
     }
     LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
+    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.paths[0].c_str());
 
     LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
     LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 8a4faa383bf32..f100eb3414351 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -64,7 +64,20 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model;
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index a8e53f28eb597..65aa7742102df 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.path.empty()) {
+    if (params.speculative.model.paths.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     ctx_dft = llama_init_dft.context.get();
 
     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
-        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
+        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.paths[0].c_str(), params.model.paths[0].c_str());
     }
 
     // Tokenize the prompt
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 8449406a6d27a..73e1b559667e2 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.path.empty()) {
+    if (params.speculative.model.paths.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
diff --git a/src/llama.cpp b/src/llama.cpp
index 34906cdb62844..8e24f7a21f579 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -258,6 +258,7 @@ struct llama_model * llama_model_load_from_splits(
         return nullptr;
     }
     for (size_t i = 0; i < n_paths; ++i) {
+        LLAMA_LOG_INFO("%s: splits[%zu] = '%s'\n", __func__, i, paths[i]);
         splits.push_back(paths[i]);
     }
     return llama_model_load_from_file_impl(splits.front(), splits, params);
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index e2836ca4814b4..fd331756a6bf2 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -77,7 +77,7 @@ int main(void) {
 
     argv = {"binary_name", "-m", "model_file.gguf"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model.path == "model_file.gguf");
+    assert(params.model.paths[0] == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
@@ -89,7 +89,7 @@ int main(void) {
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model.path == "abc.gguf");
+    assert(params.model.paths[0] == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
@@ -112,7 +112,7 @@ int main(void) {
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model.path == "blah.gguf");
+    assert(params.model.paths[0] == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 
 
@@ -122,7 +122,7 @@ int main(void) {
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model.path == "overwritten.gguf");
+    assert(params.model.paths[0] == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
 
diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp
index 853495b00d9d2..cbe52505be078 100644
--- a/tests/test-thread-safety.cpp
+++ b/tests/test-thread-safety.cpp
@@ -66,9 +66,23 @@ int main(int argc, char ** argv) {
         mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
     }
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        LOG_ERR("%s: failed to load model 'model path not specified'\n", __func__);
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), mparams);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), mparams);
+    }
+
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.paths[0].c_str());
         return 1;
     }
 
diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp
index 03628f74b2880..d1d164b392c70 100644
--- a/tools/batched-bench/batched-bench.cpp
+++ b/tools/batched-bench/batched-bench.cpp
@@ -38,7 +38,20 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    llama_model * model = NULL;
+    if (params.model.paths.empty()) {
+        fprintf(stderr, "%s: failed to load model 'model path not specified'\n", __func__);
+        return 1;
+    } else if (params.model.paths.size() == 1) {
+        model = llama_model_load_from_file(params.model.paths[0].c_str(), model_params);
+    } else {
+        std::vector<const char *> paths;
+        paths.reserve(params.model.paths.size());
+        for (const auto & path : params.model.paths) {
+            paths.push_back(path.c_str());
+        }
+        model = llama_model_load_from_splits(paths.data(), paths.size(), model_params);
+    }
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp
index f038019b007b4..7e2d290dc584b 100644
--- a/tools/export-lora/export-lora.cpp
+++ b/tools/export-lora/export-lora.cpp
@@ -419,9 +419,14 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.model.paths.size() != 1) {
+        fprintf(stderr, "exactly one model path needs to be specified, got %zu\n", params.model.paths.size());
+        exit(EXIT_FAILURE);
+    }
+
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model.paths[0], params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 599e682e0f894..bd04602c76449 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -126,7 +126,7 @@ struct mtmd_cli_context {
     }
 
     void init_vision_context(common_params & params) {
-        const char * clip_path = params.mmproj.path.c_str();
+        const char * clip_path = params.mmproj.paths[0].c_str();
         mtmd_context_params mparams = mtmd_context_params_default();
         mparams.use_gpu = params.mmproj_use_gpu;
         mparams.print_timings = true;
@@ -257,14 +257,20 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.path.empty()) {
+    if (params.mmproj.paths.empty()) {
         show_additional_info(argc, argv);
         LOG_ERR("ERR: Missing --mmproj argument\n");
         return 1;
     }
 
+    if (params.mmproj.paths.size() > 1) {
+        show_additional_info(argc, argv);
+        LOG_ERR("ERR: Only one --mmproj argument is supported\n");
+        return 1;
+    }
+
     mtmd_cli_context ctx(params);
-    LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
+    LOG("%s: loading model: %s\n", __func__, params.model.paths[0].c_str());
 
     bool is_single_turn = !params.prompt.empty() && !params.image.empty();
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 8578d49e0394b..0c699ec59644a 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1991,7 +1991,12 @@ struct server_context {
     }
 
     bool load_model(const common_params & params) {
-        SRV_INF("loading model '%s'\n", params.model.path.c_str());
+        if (params.model.paths.empty()) {
+            SRV_ERR("%s: no model path(s) specified\n", __func__);
+            return false;
+        }
+
+        SRV_INF("loading model '%s'\n", params.model.paths[0].c_str());
 
         params_base = params;
 
@@ -2001,7 +2006,7 @@ struct server_context {
 
         ctx = llama_init.context.get();
 
         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.paths[0].c_str());
             return false;
         }
@@ -2011,8 +2016,13 @@ struct server_context {
 
         add_bos_token = llama_vocab_get_add_bos(vocab);
 
-        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
-            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
+        if (!params_base.speculative.model.paths.empty() || !params_base.speculative.model.hf_repo.empty()) {
+            if (params_base.speculative.model.paths.empty()) {
+                SRV_ERR("%s: no speculative model path(s) specified\n", __func__);
+                return false;
+            }
+
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.paths[0].c_str());
 
             auto params_dft = params_base;
 
@@ -2033,13 +2043,13 @@ struct server_context {
             model_dft = llama_init_dft.model.get();
 
            if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.paths[0].c_str());
                 return false;
             }
 
             vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get());
             if (!vocab_dft_compatible) {
-                SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
+                SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.paths[0].c_str(), params_base.model.paths[0].c_str());
             }
 
             const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
@@ -2060,8 +2070,12 @@ struct server_context {
             chat_templates = common_chat_templates_init(model, "chatml");
         }
 
-        std::string & mmproj_path = params_base.mmproj.path;
-        if (!mmproj_path.empty()) {
+        if (!params_base.mmproj.paths.empty()) {
+            if (params_base.mmproj.paths.size() != 1) {
+                SRV_ERR("%s: only one mmproj path can be specified\n", __func__);
+                return false;
+            }
+            std::string & mmproj_path = params_base.mmproj.paths[0];
             mtmd_context_params mparams = mtmd_context_params_default();
             mparams.use_gpu = params_base.mmproj_use_gpu;
             mparams.print_timings = false;
@@ -2084,7 +2098,7 @@ struct server_context {
                 SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
             }
 
-            if (!params_base.speculative.model.path.empty()) {
+            if (!params_base.speculative.model.paths.empty()) {
                 SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
                 return false;
             }
@@ -4246,7 +4260,7 @@ int main(int argc, char ** argv) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_path", ctx_server.params_base.model.path },
+            { "model_path", ctx_server.params_base.model.paths[0] },
             { "modalities", json{
                 {"vision", ctx_server.oai_parser_opt.allow_image},
                 {"audio", ctx_server.oai_parser_opt.allow_audio},
@@ -4608,8 +4622,8 @@ int main(int argc, char ** argv) {
         json models = {
             {"models", {
                 {
-                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
-                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"name", params.model_alias.empty() ? params.model.paths[0] : params.model_alias},
+                    {"model", params.model_alias.empty() ? params.model.paths[0] : params.model_alias},
                     {"modified_at", ""},
                     {"size", ""},
                     {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -4631,7 +4645,7 @@ int main(int argc, char ** argv) {
             {"object", "list"},
            {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model.paths[0] : params.model_alias},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},