Correct expert group selection and rename n_group_exp to n_group_used

CISC · web-flow · commit f885540aa9de · 2025-09-28T15:07:42.000+02:00
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -928,24 +928,24 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(selection_probs, "ffn_moe_probs_biased", il);
     }
 
-    // select top n_group_exp expert groups
+    // select top n_group_used expert groups
     if (arch == LLM_ARCH_BAILINGMOE2) {
         const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
 
         // organize experts into n_expert_groups
-        ggml_tensor * selection_groups = ggml_view_2d(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, selection_probs)), n_tokens * n_exp_per_group, hparams.n_expert_groups, n_tokens * n_exp_per_group * sizeof(float), 0); // [n_tokens, n_expert_groups]
+        ggml_tensor * selection_groups = ggml_view_2d(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, selection_probs)), n_tokens * n_exp_per_group, hparams.n_expert_groups, n_tokens * n_exp_per_group * sizeof(float), 0); // [n_tokens * n_exp_per_group, n_expert_groups]
         ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups]
+        group_scores = ggml_get_rows(ctx0, ggml_reshape_3d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1]), group_scores); // [1, 2, n_expert_groups]
 
-        // get top n_group_exp expert groups
-        group_scores = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cast(ctx0, group_scores, GGML_TYPE_F32))); // [n_expert_groups, 1]
-        ggml_tensor * expert_groups = ggml_top_k(ctx0, ggml_cont(ctx0, group_scores), hparams.n_group_exp); // [n_group_exp, 1]
+        // get top n_group_used expert groups
+        group_scores = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]))); // [n_expert_groups, 1]
+        ggml_tensor * expert_groups = ggml_top_k(ctx0, ggml_cont(ctx0, group_scores), hparams.n_group_used); // [n_group_used, 1]
         cb(expert_groups->src[0], "ffn_moe_group_argsort", il);
         cb(expert_groups, "ffn_moe_group_topk", il);
 
         // mask out the other groups
-        selection_probs = ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY);
-        group_scores = ggml_repeat_4d(ctx0, expert_groups, selection_probs->ne[1], 1, 1, 1); // [n_expert_groups, 1]
-        selection_probs = ggml_set_rows(ctx0, selection_probs, selection_groups, group_scores); // [n_tokens, n_expert_groups]
+        selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_tokens * n_exp_per_group, n_group_used]
+        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_tokens * n_exp_per_group, n_expert_groups]
         selection_probs = ggml_view_2d(ctx0, selection_probs, n_tokens, n_expert, n_tokens * sizeof(float), 0); // [n_tokens, n_expert]
         selection_probs = ggml_cont(ctx0, ggml_transpose(ctx0, selection_probs)); // [n_expert, n_tokens]
         cb(selection_probs, "ffn_moe_probs_masked", il);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
@@ -73,7 +73,7 @@ struct llama_hparams {
     uint32_t n_expert_shared    = 0;
     uint32_t n_norm_groups      = 0;
     uint32_t n_expert_groups    = 0;
-    uint32_t n_group_exp        = 0;
+    uint32_t n_group_used       = 0;
     uint32_t n_group_experts    = 0;
 
     float    expert_group_scale   = 0.05f;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -1877,7 +1877,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,                hparams.n_expert_groups);
-                ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT,           hparams.n_group_exp);
+                ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT,           hparams.n_group_used);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                 ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
@@ -6338,7 +6338,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp           = %d\n",     __func__, hparams.n_ff_shexp);
         LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: n_expert_groups      = %d\n",     __func__, hparams.n_expert_groups);
-        LLAMA_LOG_INFO("%s: n_group_exp          = %d\n",     __func__, hparams.n_group_exp);
+        LLAMA_LOG_INFO("%s: n_group_used         = %d\n",     __func__, hparams.n_group_used);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
         LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));