
Commit 8302a8a

Merge branch 'master' into imatrix

2 parents 8ecd5fa + 30e5b01

24 files changed: +524 -59 lines

common/chat-parser.cpp

Lines changed: 5 additions & 0 deletions
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 
     // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
     result_.tool_calls.emplace_back(tool_call);
+
     return true;
 }
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {

@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
         /* .is_partial = */ found_healing_marker,
     };
 }
+
+void common_chat_msg_parser::clear_tools() {
+    result_.tool_calls.clear();
+}

common/chat-parser.h

Lines changed: 2 additions & 0 deletions
@@ -115,4 +115,6 @@ class common_chat_msg_parser {
         const std::vector<std::vector<std::string>> & args_paths = {},
         const std::vector<std::vector<std::string>> & content_paths = {}
     );
+
+    void clear_tools();
 };

common/chat.cpp

Lines changed: 3 additions & 1 deletion
@@ -1921,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
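Note on the change above: when a complete (non-partial) response fails structured tool-call parsing, the parser no longer throws; it discards any partially built tool calls and re-parses the whole input as plain content. A minimal, self-contained sketch of that fallback pattern follows; the ToyParser/ToyMessage names are hypothetical illustrations, not the llama.cpp API.

// Hypothetical sketch of the fallback introduced above; ToyParser/ToyMessage
// are illustrative only and do not exist in llama.cpp.
#include <string>
#include <vector>

struct ToyMessage {
    std::string              content;
    std::vector<std::string> tool_calls;
};

struct ToyParser {
    std::string input;
    ToyMessage  result;

    bool try_parse_tool_calls() { return false; }               // pretend structured parsing failed
    void clear_tools()          { result.tool_calls.clear(); }  // analogue of clear_tools() above
    void parse_content_only()   { result.content = input; }     // analogue of common_chat_parse_content_only()
};

ToyMessage toy_parse(const std::string & input, bool is_partial) {
    ToyParser p{input, {}};
    if (!p.try_parse_tool_calls() && !is_partial) {
        p.clear_tools();         // drop half-built tool calls instead of throwing
        p.parse_content_only();  // treat the entire input as plain content
    }
    return p.result;
}

The net effect, as in the real change, is that malformed tool-call output from a finished generation degrades to a content-only message rather than an error.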

convert_hf_to_gguf.py

Lines changed: 28 additions & 0 deletions
@@ -5262,6 +5262,34 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM

docs/function-calling.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder
 - Mistral Nemo
 - Firefunction v2
 - Command R7B

ggml/src/ggml-cuda/common.cuh

Lines changed: 3 additions & 3 deletions
@@ -262,11 +262,11 @@ static bool cp_async_available(const int cc) {
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-    return __AMDGCN_WAVEFRONT_SIZE;
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+    return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 }

 [[noreturn]]
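Because the helper above is constexpr, the warp size it returns can be used wherever a compile-time constant is required (shared-memory array sizes, unroll bounds), which the runtime built-in warpSize cannot. A hedged illustration follows; this kernel is an assumed example, not part of this commit, and it assumes it is launched with blockDim.x == warp_size.

// Assumed example kernel (not from this commit) showing why a constexpr warp size is useful.
static __global__ void warp_sum_example(const float * x, float * out) {
    // 32 on CUDA, 64 on GFX8/GFX9 HIP targets, per the change above
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    __shared__ float buf[warp_size];  // compile-time size; the runtime warpSize could not be used here
    const int lane = threadIdx.x % warp_size;
    buf[lane] = x[blockIdx.x * warp_size + lane];
    __syncthreads();
    if (lane == 0) {
        float sum = 0.0f;
        for (int i = 0; i < warp_size; ++i) {
            sum += buf[i];
        }
        out[blockIdx.x] = sum;
    }
}

The ssm-scan.cu change below uses the same constant (warp_size) in place of warpSize for its shared-memory indexing.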

ggml/src/ggml-cuda/ssm-scan.cu

Lines changed: 6 additions & 4 deletions
@@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2)
         float * __restrict__ dst, const int64_t L) {
     GGML_UNUSED(src1_nb0);
     GGML_UNUSED(src2_nb0);
+
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     const int bidx = blockIdx.x;  // split along B
     const int bidy = blockIdx.y;  // split along D
     const int tid  = threadIdx.x;

@@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2)
     if (N == 16) {
 #pragma unroll
         for (size_t i = 0; i < splitD / 4; i += 2) {
-            float value = A_block[(wid * warpSize + i) * stride_A + wtid];
+            float value = A_block[(wid * warp_size + i) * stride_A + wtid];
             // todo: bank conflict
             // I am always confused with how to use the swizzling method to solve
             // bank conflit. Hoping somebody can tell me.
-            smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+            smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
         }
 #pragma unroll
         for (size_t i = 0; i < splitD / 4; i += 2) {
-            float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
-            smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+            float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
+            smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
         }
     }

gguf-py/gguf/constants.py

Lines changed: 26 additions & 0 deletions
@@ -343,6 +343,7 @@ class MODEL_ARCH(IntEnum):
     WAVTOKENIZER_DEC = auto()
     PLM = auto()
     BAILINGMOE = auto()
+    DOTS1 = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -623,6 +624,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
+    MODEL_ARCH.DOTS1: "dots1"
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -2044,6 +2046,30 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.DOTS1: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     # TODO
 }

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 1 deletion
@@ -305,7 +305,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_EXP_PROBS_B: (
-            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
+            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
         ),

         # Feed-forward up

include/llama.h

Lines changed: 2 additions & 2 deletions
@@ -243,14 +243,14 @@ extern "C" {

     typedef bool (*llama_progress_callback)(float progress, void * user_data);

-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
     // A llama_batch object can contain input about one or many sequences
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     // - seq_id : the sequence to which the respective token belongs
     //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
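The comment block above describes the llama_batch fields; a rough sketch of filling one by hand for a single sequence follows. It is an assumed example (not from this commit) and presumes the llama_batch_init/llama_batch_free helpers and the n_seq_id field declared elsewhere in this header.

// Assumed example: build a batch for one sequence of prompt tokens,
// requesting logits only for the last token.
#include <vector>
#include "llama.h"

static llama_batch make_prompt_batch(const std::vector<llama_token> & tokens) {
    const int32_t n = (int32_t) tokens.size();
    llama_batch batch = llama_batch_init(n, /*embd=*/0, /*n_seq_max=*/1);
    for (int32_t i = 0; i < n; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;            // explicit positions (pos = NULL would let llama_encode/llama_decode track them)
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;            // all tokens belong to sequence 0
        batch.logits  [i]    = (i == n - 1); // output logits for the final token only
    }
    batch.n_tokens = n;
    return batch;
}

The result would be passed to llama_encode()/llama_decode() and released with llama_batch_free().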
