 #include <cinttypes>
 #include <memory>
 #include <unordered_set>
+#include <filesystem>

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -518,6 +519,8 @@ struct server_context_impl {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;

+    std::string model_name; // name of the loaded model, to be used by API
+
     common_chat_templates_ptr chat_templates;
     oaicompat_parser_options oai_parser_opt;

@@ -758,6 +761,18 @@ struct server_context_impl {
     }
     SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");

+    if (!params_base.model_alias.empty()) {
+        // user explicitly specified model name
+        model_name = params_base.model_alias;
+    } else if (!params_base.model.name.empty()) {
+        // use model name in registry format (for models in cache)
+        model_name = params_base.model.name;
+    } else {
+        // fallback: derive model name from file name
+        auto model_path = std::filesystem::path(params_base.model.path);
+        model_name = model_path.filename().string();
+    }
+
     // thinking is enabled if:
     // 1. It's not explicitly disabled (reasoning_budget == 0)
     // 2. The chat template supports it
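
(For illustration: a minimal standalone sketch of the resolution order the hunk above adds — explicit alias first, then the registry-format name for cached models, then the file name component of the model path. resolve_model_name and its parameters are illustrative helpers, not part of the patch.)

    #include <filesystem>
    #include <string>

    // sketch only: mirrors the alias > registry name > file name fallback
    static std::string resolve_model_name(const std::string & alias,
                                          const std::string & registry_name,
                                          const std::string & path) {
        if (!alias.empty()) {
            return alias;          // user-specified alias wins
        }
        if (!registry_name.empty()) {
            return registry_name;  // cached model, registry format
        }
        // last resort: the file name of the loaded model
        return std::filesystem::path(path).filename().string();
    }
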
@@ -2611,7 +2626,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
         // OAI-compat
         task.params.res_type = res_type;
         task.params.oaicompat_cmpl_id = completion_id;
-        // oaicompat_model is already populated by params_from_json_cmpl
+        task.params.oaicompat_model = ctx_server.model_name;

         tasks.push_back(std::move(task));
     }
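
(Effect of this hunk: the model name reported in OAI-compatible completion responses now always comes from the server-resolved model_name, rather than from whatever params_from_json_cmpl copied out of the request. A hypothetical response fragment, values illustrative only:

    { "object": "chat.completion", "model": "<resolved model_name>", ... }
)
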
@@ -2939,7 +2954,7 @@ void server_routes::init_routes() {
         json data = {
             { "default_generation_settings", default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_alias", ctx_server.params_base.model_alias },
+            { "model_alias", ctx_server.model_name },
             { "model_path", ctx_server.params_base.model.path },
             { "modalities", json {
                 {"vision", ctx_server.oai_parser_opt.allow_image},
@@ -3181,8 +3196,8 @@ void server_routes::init_routes() {
         json models = {
             {"models", {
                 {
-                    {"name",  params.model_alias.empty() ? params.model.path : params.model_alias},
-                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"name",  ctx_server.model_name},
+                    {"model", ctx_server.model_name},
                     {"modified_at", ""},
                     {"size", ""},
                     {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -3204,7 +3219,7 @@ void server_routes::init_routes() {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"id", ctx_server.model_name},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
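
(For illustration: one entry of the OAI-style model list after this change, assuming no alias was set and the loaded file is qwen2.5-7b-instruct-q4_k_m.gguf — the file name and timestamp here are made up:

    { "id": "qwen2.5-7b-instruct-q4_k_m.gguf", "object": "model", "created": 1736000000, "owned_by": "llamacpp" }
)
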
@@ -3351,6 +3366,7 @@ void server_routes::init_routes() {
         // write JSON response
         json root = format_response_rerank(
             body,
+            ctx_server.model_name,
             responses,
             is_tei_format,
             documents,
@@ -3613,7 +3629,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons

         // write JSON response
         json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
-            ? format_embeddings_response_oaicompat(body, responses, use_base64)
+            ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
             : json(responses);
         res->ok(root);
         return res;