
Commit 8302a8a

Merge branch 'master' into imatrix

2 parents 8ecd5fa + 30e5b01

24 files changed: +524 -59 lines

common/chat-parser.cpp

Lines changed: 5 additions & 0 deletions
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 
     // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
     result_.tool_calls.emplace_back(tool_call);
+
     return true;
 }
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {

@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
         /* .is_partial = */ found_healing_marker,
     };
 }
+
+void common_chat_msg_parser::clear_tools() {
+    result_.tool_calls.clear();
+}

common/chat-parser.h

Lines changed: 2 additions & 0 deletions
@@ -115,4 +115,6 @@ class common_chat_msg_parser {
         const std::vector<std::vector<std::string>> & args_paths = {},
         const std::vector<std::vector<std::string>> & content_paths = {}
     );
+
+    void clear_tools();
 };

common/chat.cpp

Lines changed: 3 additions & 1 deletion
@@ -1921,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
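Note on the change above: when a complete (non-partial) response fails structured tool-call parsing, the parser no longer throws; it discards any partially built tool calls and re-parses the whole input as plain content. A minimal, self-contained sketch of that fallback pattern follows; the ToyParser/ToyMessage names are hypothetical illustrations, not the llama.cpp API.

// Hypothetical sketch of the fallback introduced above; ToyParser/ToyMessage
// are illustrative only and do not exist in llama.cpp.
#include <string>
#include <vector>

struct ToyMessage {
    std::string              content;
    std::vector<std::string> tool_calls;
};

struct ToyParser {
    std::string input;
    ToyMessage  result;

    bool try_parse_tool_calls() { return false; }               // pretend structured parsing failed
    void clear_tools()          { result.tool_calls.clear(); }  // analogue of clear_tools() above
    void parse_content_only()   { result.content = input; }     // analogue of common_chat_parse_content_only()
};

ToyMessage toy_parse(const std::string & input, bool is_partial) {
    ToyParser p{input, {}};
    if (!p.try_parse_tool_calls() && !is_partial) {
        p.clear_tools();         // drop half-built tool calls instead of throwing
        p.parse_content_only();  // treat the entire input as plain content
    }
    return p.result;
}

The net effect, as in the real change, is that malformed tool-call output from a finished generation degrades to a content-only message rather than an error.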

convert_hf_to_gguf.py

Lines changed: 28 additions & 0 deletions
@@ -5262,6 +5262,34 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM

docs/function-calling.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder
 - Mistral Nemo
 - Firefunction v2
 - Command R7B

ggml/src/ggml-cuda/common.cuh

Lines changed: 3 additions & 3 deletions
@@ -262,11 +262,11 @@ static bool cp_async_available(const int cc) {
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-    return __AMDGCN_WAVEFRONT_SIZE;
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+    return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 }

 [[noreturn]]
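Because the helper above is constexpr, the warp size it returns can be used wherever a compile-time constant is required (shared-memory array sizes, unroll bounds), which the runtime built-in warpSize cannot. A hedged illustration follows; this kernel is an assumed example, not part of this commit, and it assumes it is launched with blockDim.x == warp_size.

// Assumed example kernel (not from this commit) showing why a constexpr warp size is useful.
static __global__ void warp_sum_example(const float * x, float * out) {
    // 32 on CUDA, 64 on GFX8/GFX9 HIP targets, per the change above
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    __shared__ float buf[warp_size];  // compile-time size; the runtime warpSize could not be used here
    const int lane = threadIdx.x % warp_size;
    buf[lane] = x[blockIdx.x * warp_size + lane];
    __syncthreads();
    if (lane == 0) {
        float sum = 0.0f;
        for (int i = 0; i < warp_size; ++i) {
            sum += buf[i];
        }
        out[blockIdx.x] = sum;
    }
}

The ssm-scan.cu change below uses the same constant (warp_size) in place of warpSize for its shared-memory indexing.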

ggml/src/ggml-cuda/ssm-scan.cu

Lines changed: 6 additions & 4 deletions
@@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2)
         float * __restrict__ dst, const int64_t L) {
     GGML_UNUSED(src1_nb0);
     GGML_UNUSED(src2_nb0);
+
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     const int bidx = blockIdx.x;  // split along B
     const int bidy = blockIdx.y;  // split along D
     const int tid  = threadIdx.x;

@@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2)
     if (N == 16) {
 #pragma unroll
         for (size_t i = 0; i < splitD / 4; i += 2) {
-            float value = A_block[(wid * warpSize + i) * stride_A + wtid];
+            float value = A_block[(wid * warp_size + i) * stride_A + wtid];
             // todo: bank conflict
             // I am always confused with how to use the swizzling method to solve
             // bank conflit. Hoping somebody can tell me.
-            smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+            smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
         }
 #pragma unroll
         for (size_t i = 0; i < splitD / 4; i += 2) {
-            float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
-            smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+            float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
+            smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
         }
     }

gguf-py/gguf/constants.py

Lines changed: 26 additions & 0 deletions
@@ -343,6 +343,7 @@ class MODEL_ARCH(IntEnum):
     WAVTOKENIZER_DEC = auto()
     PLM = auto()
     BAILINGMOE = auto()
+    DOTS1 = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -623,6 +624,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
+    MODEL_ARCH.DOTS1: "dots1"
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -2044,6 +2046,30 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.DOTS1: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     # TODO
 }

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 1 deletion
@@ -305,7 +305,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_EXP_PROBS_B: (
-            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
+            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
         ),

         # Feed-forward up

include/llama.h

Lines changed: 2 additions & 2 deletions
@@ -243,14 +243,14 @@ extern "C" {

     typedef bool (*llama_progress_callback)(float progress, void * user_data);

-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
     // A llama_batch object can contain input about one or many sequences
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     // - seq_id : the sequence to which the respective token belongs
     //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
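The comment block above describes the llama_batch fields; a rough sketch of filling one by hand for a single sequence follows. It is an assumed example (not from this commit) and presumes the llama_batch_init/llama_batch_free helpers and the n_seq_id field declared elsewhere in this header.

// Assumed example: build a batch for one sequence of prompt tokens,
// requesting logits only for the last token.
#include <vector>
#include "llama.h"

static llama_batch make_prompt_batch(const std::vector<llama_token> & tokens) {
    const int32_t n = (int32_t) tokens.size();
    llama_batch batch = llama_batch_init(n, /*embd=*/0, /*n_seq_max=*/1);
    for (int32_t i = 0; i < n; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;            // explicit positions (pos = NULL would let llama_encode/llama_decode track them)
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;            // all tokens belong to sequence 0
        batch.logits  [i]    = (i == n - 1); // output logits for the final token only
    }
    batch.n_tokens = n;
    return batch;
}

The result would be passed to llama_encode()/llama_decode() and released with llama_batch_free().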
