
Commit 083bcba

Merge branch 'ggml-org:master' into master
2 parents 5f96917 + bbac6a2 commit 083bcba

File tree: 5 files changed (+24, -10 lines)

    convert_hf_to_gguf.py
    ggml/src/ggml-cuda/ggml-cuda.cu
    src/llama-graph.cpp
    src/llama-model.cpp
    tests/test-backend-ops.cpp

convert_hf_to_gguf.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -742,6 +742,12 @@ def set_gguf_parameters(self):
         if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
+        if (n_expert_groups := self.hparams.get("n_group")) is not None:
+            self.gguf_writer.add_expert_group_count(n_expert_groups)
+            logger.info(f"gguf: expert groups count = {n_expert_groups}")
+        if (n_group_used := self.hparams.get("topk_group")) is not None:
+            self.gguf_writer.add_expert_group_used_count(n_group_used)
+            logger.info(f"gguf: expert groups used count = {n_group_used}")
 
         if (head_dim := self.hparams.get("head_dim")) is not None:
             self.gguf_writer.add_key_length(head_dim)
@@ -8233,8 +8239,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
         self.gguf_writer.add_expert_count(hparams["num_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
-        self.gguf_writer.add_expert_group_count(hparams["n_group"])
-        self.gguf_writer.add_expert_group_used_count(hparams["topk_group"])
         self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
 
         if hparams["score_function"] == "sigmoid":
```
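
Note: this moves the expert-group metadata (config `n_group` to `expert_group_count`, `topk_group` to `expert_group_used_count`) into the generic `set_gguf_parameters`, so any model whose config exposes these fields gets them written; the per-model writes removed in the second hunk become redundant. A minimal way to confirm the keys land in a converted file, sketched with gguf-py's `GGUFReader` and assuming the usual `<arch>.expert_group_count` / `<arch>.expert_group_used_count` key naming (`model.gguf` is a hypothetical path):

```python
# Minimal sketch: list expert-group metadata in a converted GGUF file.
# Assumes the key names follow the "<arch>.expert_group_count" pattern;
# "model.gguf" is a hypothetical path.
from gguf import GGUFReader

reader = GGUFReader("model.gguf")
for name, field in reader.fields.items():
    if "expert_group" in name:
        # field.data holds indices into field.parts; the last part is the value
        print(name, field.parts[field.data[-1]])
```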

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 9 additions & 2 deletions
```diff
@@ -1957,8 +1957,15 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
 
     size_t src1_stride_size = sizeof(cuda_t);
 
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+    const int threads_x = 16;
+    const int threads_y = 16;
+    dim3 block_dims(threads_x, threads_y);
+
+    dim3 grid_dims(
+        (ne13 + threads_x - 1) / threads_x,
+        (ne12 + threads_y - 1) / threads_y
+    );
+    k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
         src0_ptr, src1_ptr, dst_t,
         ptrs_src.get(), ptrs_dst.get(),
         ne12, ne13,
```
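
Note: the previous launch packed all of ne12 × ne13 into a single thread block, which stops working once the batch count exceeds CUDA's per-block limit of 1024 threads. The new code fixes the block at 16×16 and ceil-divides the batch dimensions into a grid (presumably with a bounds check inside the kernel, since the grid now rounds up). A rough sketch of the launch-shape arithmetic, not llama.cpp code:

```python
# Rough sketch (not llama.cpp code) of the launch-shape arithmetic. The old
# launch used one block of ne13 x ne12 threads; CUDA caps a block at 1024
# threads, so large batch counts could not be launched. The new scheme keeps
# 16 x 16 blocks and ceil-divides the batch dims into a grid.
def old_launch_fits(ne12, ne13):
    return ne12 * ne13 <= 1024           # single block must stay under the limit

def new_grid(ne12, ne13, tx=16, ty=16):
    return ((ne13 + tx - 1) // tx,       # grid.x covers ne13
            (ne12 + ty - 1) // ty)       # grid.y covers ne12

print(old_launch_fits(3, 2))      # True  - the existing small-batch tests fit
print(old_launch_fits(1536, 1))   # False - the new large-batch test would not
print(new_grid(1536, 1))          # (1, 96): 96 blocks of 16x16 threads
```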

src/llama-graph.cpp

Lines changed: 3 additions & 4 deletions
```diff
@@ -1009,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
-        if (arch == LLM_ARCH_BAILINGMOE2) {
-            weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
-            cb(weights_sum, "ffn_moe_weights_sum_biased", il);
-        }
+        // Avoid division by zero, clamp to smallest number representable by F16
+        weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+        cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
```
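
Note: instead of adding a 1e-20 bias to the weight sum only for BAILINGMOE2, the sum is now clamped for every architecture. The constant 6.103515625e-5 is exactly 2^-14, the smallest positive normal float16 value, so the guard survives even if the tensor ends up in F16. A quick numerical check, assuming NumPy is available:

```python
# Quick check (assumes NumPy) that the clamp constant is the smallest positive
# normal float16 value, i.e. 2**-14, so it survives an F16 round-trip.
import numpy as np

assert 6.103515625e-5 == 2.0 ** -14
assert np.finfo(np.float16).tiny == np.float16(6.103515625e-5)
print(np.finfo(np.float16).tiny)   # 6.104e-05
```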

src/llama-model.cpp

Lines changed: 3 additions & 2 deletions
```diff
@@ -6369,6 +6369,8 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+    LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+    LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
     LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
@@ -6469,8 +6471,6 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
-        LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
-        LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
         LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -19339,6 +19339,7 @@ struct llm_build_smallthinker : public llm_graph_context{
 
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
         cb(cur, "result_norm", -1);
+        res->t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
```

tests/test-backend-ops.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -6697,6 +6697,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, {3, 2}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, {3, 2}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1}));
+
+            // test cases with large batch size
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {1536, 1}, {1, 1}));
         }
     }
     for (ggml_type type_a : other_types) {
```
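
Note: the added case puts 1536 batches in the batch dimensions, the kind of shape the old single-block `k_compute_batched_ptrs` launch above could not cover and the new 16×16 grid can. A hedged sketch of the shapes this case multiplies, assuming my reading of `test_mul_mat(m, n, k, bs, nr)` as per-batch operands that share a k-length inner dimension:

```python
# Hedged sketch (assumes the convention above: per batch, an m x k operand is
# contracted with an n x k operand over k) of the shape exercised by the new
# case: m=16, n=8, k=256, batch dims {1536, 1}.
import numpy as np

m, n, k, bs = 16, 8, 256, (1536, 1)
A = np.random.rand(bs[1], bs[0], m, k).astype(np.float32)   # 1 x 1536 batches of m x k
B = np.random.rand(bs[1], bs[0], n, k).astype(np.float32)   # same batch count, n x k
C = np.einsum('...mk,...nk->...nm', A, B)                   # contract over k per batch
print(C.shape)                                              # (1, 1536, 8, 16)
```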

0 commit comments
