@@ -564,7 +564,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
-    LLM_TENSOR_FFN_EXPERT_WEIGHTS_B,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -1434,7 +1434,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_SHEXP,       "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP,       "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP,         "blk.%d.ffn_up_shexp" },
-            { LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "blk.%d.expert_weights_b" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,      "blk.%d.exp_probs_b" },
         },
     },
     {
@@ -2934,7 +2934,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b = nullptr; // b2
     struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act    = nullptr;
-    struct ggml_tensor * ffn_expert_weights_bias = nullptr;
+    struct ggml_tensor * ffn_exp_probs_b = nullptr;

     // mamba proj
     struct ggml_tensor * ssm_in = nullptr;
@@ -7480,7 +7480,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_DOWN_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
-    {LLM_TENSOR_FFN_EXPERT_WEIGHTS_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
@@ -9283,7 +9283,7 @@ static bool llm_load_tensors(
                     layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                 } else {
                     layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                    layer.ffn_expert_weights_bias = create_tensor(tn(LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     if (n_expert == 0) {
                         throw std::runtime_error("n_expert must be > 0");
@@ -10285,22 +10285,22 @@ llm_expert_gating_func_type gating_op,
         case LLM_EXPERT_GATING_FUNC_SOFTMAX:
             {
                 probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-                cb(probs, "ffn_moe_probs", il);
             } break;
         case LLM_EXPERT_GATING_FUNC_SIGMOID:
             {
                 probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
-                cb(probs, "ffn_moe_sigm", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }
+    cb(probs, "ffn_moe_probs", il);

     // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
     ggml_tensor * selection_probs = probs;
     if (expert_weights_b != nullptr) {
         selection_probs = ggml_add(ctx, probs, expert_weights_b);
-        cb(selection_probs, "ffn_moe_sigm_biased", il);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
     }

     // select experts
@@ -16241,7 +16241,7 @@ struct llm_build_context {
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
                     model.layers[il].ffn_down_exps,
-                    model.layers[il].ffn_expert_weights_bias,
+                    model.layers[il].ffn_exp_probs_b,
                     n_expert, n_expert_used,
                     LLM_FFN_SILU, hparams.expert_weights_norm,
                     true, hparams.expert_weights_scale,
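
Note on the gating hunk above: the cb() hoist makes the "ffn_moe_probs" callback fire for both gating functions, and the renamed bias tensor (ffn_exp_probs_b) is added only to the scores used for top-k expert selection, while the unbiased probs still become the expert weights. Below is a minimal standalone C++ sketch of that split. It is illustrative only, with toy values and my own variable names; it does not use the ggml API.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert      = 8;
    const int n_expert_used = 2;

    // toy router logits for one token, and a per-expert selection bias
    // (standing in for the ffn_exp_probs_b tensor in the diff above)
    std::vector<float> logits = { 0.1f, 0.9f, -0.3f, 0.4f, 0.0f, -1.2f, 0.7f, 0.2f };
    std::vector<float> bias   = { 0.0f, -0.5f, 0.8f, 0.0f, 0.0f, 0.9f, 0.0f, 0.0f };

    // sigmoid gating path: probs = sigmoid(logits)
    std::vector<float> probs(n_expert);
    for (int e = 0; e < n_expert; ++e) {
        probs[e] = 1.0f / (1.0f + std::exp(-logits[e]));
    }

    // selection_probs = probs + bias; used only to pick the top-k experts
    std::vector<float> selection_probs(n_expert);
    for (int e = 0; e < n_expert; ++e) {
        selection_probs[e] = probs[e] + bias[e];
    }

    // top-k selection runs on the biased scores...
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
        [&](int a, int b) { return selection_probs[a] > selection_probs[b]; });

    // ...but the weight applied to each selected expert's output
    // comes from the unbiased probs
    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("expert %d: biased score %.3f, weight %.3f\n",
                    idx[k], selection_probs[idx[k]], probs[idx[k]]);
    }
    return 0;
}

Selecting on biased scores while weighting with unbiased probabilities is the DeepSeek V3 behavior the in-diff comment refers to: the bias steers which experts get chosen without distorting how much each chosen expert contributes.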