Commit 7b9aa7b

Merge branch 'master' into add-fh1-rebased

2 parents: 710630a + 699f439

14 files changed: +297 -72 lines

common/arg.cpp

Lines changed: 7 additions & 0 deletions

@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
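For context: the new --api-prefix option stores a user-supplied prefix in common_params (see common/common.h below) and can also be set through the LLAMA_ARG_API_PREFIX environment variable. Below is a minimal sketch of how such a prefix might be applied when building endpoint paths; apply_api_prefix is a hypothetical helper for illustration, not code from this commit.

    #include <string>

    // Hypothetical helper (not part of this commit): prepend the configured
    // prefix, which per the help text carries no trailing slash, e.g. "/llama".
    static std::string apply_api_prefix(const std::string & api_prefix, const std::string & route) {
        return api_prefix + route; // "/llama" + "/v1/models" -> "/llama/v1/models"
    }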

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -370,6 +370,7 @@ struct common_params {
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions

@@ -6813,6 +6813,9 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
 ###### CONVERSION LOGIC ######
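Note that SmolLM3Model subclasses LlamaModel, so the converter reuses the existing Llama tensor-mapping and vocabulary logic; the registration only changes the architecture written to the GGUF metadata to gguf.MODEL_ARCH.SMOLLM3.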

docs/development/HOWTO-add-model.md

Lines changed: 11 additions & 9 deletions

@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
 
 ### 2. Define the model architecture in `llama.cpp`
 
-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
 
 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
 
 ### 3. Build the GGML graph implementation
 
-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
 
 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
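To make step 3 of the updated guide concrete, here is a minimal skeleton of the pattern it describes: a builder struct deriving from llm_graph_context, instantiated from llama_model::build_graph. The struct name and constructor signature are illustrative placeholders, not code from this commit.

    // Illustrative skeleton; see llm_build_llama in src/llama-model.cpp for a
    // complete, real implementation.
    struct llm_build_mymodel : public llm_graph_context {
        llm_build_mymodel(const llama_model & model, const llm_graph_params & params)
            : llm_graph_context(params) {
            // build the ggml compute graph here: token embeddings, then each
            // transformer block (attention + FFN), then the output norm and head
        }
    };

    // ...and in llama_model::build_graph, dispatch on the new architecture:
    // case LLM_ARCH_MYMODEL:
    //     llm = std::make_unique<llm_build_mymodel>(*this, params);
    //     break;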

ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp

Lines changed: 7 additions & 9 deletions

@@ -14,21 +14,19 @@ void main() {
 
     const uint row_dst = gl_GlobalInvocationID.x;
 
-    if (i0 >= p.n_dims) {
-        const uint i = row_dst*ne0 + i0;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;
 
     const uint idst = row_dst*ne0 + i0/2;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
 
+    if (i0 >= p.n_dims) {
+        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
+        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
+
+        return;
+    }
+
     const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
     const int sec_w = p.sections[1] + p.sections[0];
     const uint sector = (i0 / 2) % sect_dims;
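The same relocation is applied to rope_neox.comp and rope_norm.comp below. The early-out for i0 >= p.n_dims (the unrotated tail of a row) previously copied through data_a using the destination's contiguous indexing, which is only correct when the source happens to share that layout; it now reads through the strided source index ix. A host-side C++ sketch of the index math, with names mirroring rope_multi/rope_neox (an illustration under the assumption of a contiguous destination and a source with row/channel strides s1 and s2; rope_norm is analogous without the halved base offset):

    #include <cstdint>

    struct rope_idx { uint32_t dst, src; };

    // Index math for the pass-through case (i0 >= n_dims) after this change.
    rope_idx pass_through_indices(uint32_t row_dst, uint32_t ne0, uint32_t ne1,
                                  uint32_t s1, uint32_t s2, uint32_t i0) {
        const uint32_t row_x     = row_dst % ne1;
        const uint32_t channel_x = row_dst / ne1;

        const uint32_t idst = row_dst*ne0 + i0/2;             // dst rows are packed
        const uint32_t ix   = channel_x*s2 + row_x*s1 + i0/2; // src honors strides

        // the shader writes data_d[dst] from data_a[src]; the old code used
        // row_dst*ne0 + i0 for both, i.e. it assumed a contiguous source
        return { idst + i0/2, ix + i0/2 };
    }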

ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp

Lines changed: 7 additions & 9 deletions

@@ -13,21 +13,19 @@ void main() {
 
     const uint row_dst = gl_GlobalInvocationID.x;
 
-    if (i0 >= p.n_dims) {
-        const uint i = row_dst*ne0 + i0;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;
 
     const uint idst = row_dst*ne0 + i0/2;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
 
+    if (i0 >= p.n_dims) {
+        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
+        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
+
+        return;
+    }
+
     const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
 
     const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;

ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp

Lines changed: 7 additions & 9 deletions

@@ -13,21 +13,19 @@ void main() {
 
     const uint row_dst = gl_GlobalInvocationID.x;
 
-    if (i0 >= p.n_dims) {
-        const uint i = row_dst*ne0 + i0;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
     const uint row_x = row_dst % ne1;
     const uint channel_x = row_dst / ne1;
 
     const uint idst = row_dst*ne0 + i0;
     const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;
 
+    if (i0 >= p.n_dims) {
+        data_d[idst + 0] = data_a[ix + 0];
+        data_d[idst + 1] = data_a[ix + 1];
+
+        return;
+    }
+
     const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);
 
     const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;

gguf-py/gguf/constants.py

Lines changed: 18 additions & 0 deletions

@@ -360,6 +360,7 @@ class MODEL_ARCH(IntEnum):
     ARCEE = auto()
     ERNIE4_5 = auto()
     HUNYUAN_MOE = auto()
+    SMOLLM3 = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -665,6 +666,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ERNIE4_5: "ernie4_5",
     MODEL_ARCH.FALCON_H1: "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
+    MODEL_ARCH.SMOLLM3: "smollm3",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -2271,6 +2273,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.SMOLLM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

src/llama-arch.cpp

Lines changed: 18 additions & 0 deletions

@@ -80,6 +80,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCEE,       "arcee"       },
     { LLM_ARCH_ERNIE4_5,    "ernie4_5"    },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_SMOLLM3,     "smollm3"     },
     { LLM_ARCH_UNKNOWN,     "(unknown)"   },
 };
 
@@ -1749,6 +1750,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
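The %d in the per-layer templates above is expanded with the block index, and tensor names then take a .weight or .bias suffix per the GGUF convention quoted in the HOWTO diff. A self-contained sketch of that expansion; tensor_name is a hypothetical helper, not this commit's code:

    #include <cstdio>
    #include <string>

    // Hypothetical helper: expand a per-layer template such as "blk.%d.attn_q"
    // with the block index and append a suffix.
    static std::string tensor_name(const char * templ, int layer, const char * suffix) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), templ, layer);
        return std::string(buf) + "." + suffix;
    }

    // tensor_name("blk.%d.attn_q", 0, "weight") -> "blk.0.attn_q.weight"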

src/llama-arch.h

Lines changed: 1 addition & 0 deletions

@@ -84,6 +84,7 @@ enum llm_arch {
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_SMOLLM3,
     LLM_ARCH_UNKNOWN,
 };
