@@ -938,6 +938,100 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }
 
+ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
+         ggml_tensor * cur,
+         ggml_tensor * probs,
+         ggml_tensor * up_exps,
+         ggml_tensor * gate_exps,
+         ggml_tensor * down_exps,
+         ggml_tensor * exp_probs_b,
+             int64_t   n_expert,
+             int64_t   n_expert_used,
+    llama_expert_gating_func_type gating_op,
+                 int   il) const {
+    const int64_t n_embd   = cur->ne[0];
+    const int64_t n_tokens = cur->ne[1];
+
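+    // cur:   [n_embd, n_tokens]   input hidden states
+    // probs: [n_expert, n_tokens] routing probabilities supplied by the caller
+    // (unlike build_moe_ffn, which computes them from a gate input internally)
+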
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
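+    // ggml_top_k is built on ggml_argsort, so src[0] of the top-k node is the
+    // underlying argsort tensor labeled here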
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx0,
+            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
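+    // turn the gathered scores into normalized per-expert weights: softmax
+    // directly, or sigmoid followed by division by the per-token sum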
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
+        weights = ggml_soft_max(ctx0, weights);
+    } else {
+        weights = ggml_sigmoid(ctx0, weights);
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+    }
+
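+    // reshape so the per-expert scalar weights broadcast over n_embd when
+    // multiplying the expert outputs below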
+    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+
+    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+
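+    // build_lora_mm_id wraps ggml_mul_mat_id: each token is multiplied only by
+    // the expert matrices picked in selected_experts (plus any LoRA adapters)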
+    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * experts = nullptr;
+    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(cur, "ffn_moe_gate", il);
+
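+    // ReGLU activation: relu(gate) * up on the split gate/up tensors;
+    // note this helper hard-codes ReGLU rather than taking an op type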
+    cur = ggml_reglu_split(ctx0, cur, up);
+    cb(cur, "ffn_moe_reglu", il);
+
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx0, experts, weights);
+    cb(experts, "ffn_moe_weighted", il);
+
+    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+    assert(n_expert_used > 0);
+
+    // order the views before the adds
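+    // (each view is one expert's [n_embd, n_tokens] slice of the output:
+    //  nb[2] is the stride between tokens, i*nb[1] offsets to expert i)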
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+        ggml_build_forward_expand(gf, cur_experts[i]);
+    }
+
+    // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    // to avoid potentially a large number of add nodes during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    ggml_tensor * moe_out = cur_experts[0];
+
+    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+    }
+
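+    // with a single expert, moe_out is still a 2D view into the 3D experts
+    // tensor, so force a copy to make it contiguous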
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx0, moe_out);
+    }
+
+    cb(moe_out, "ffn_moe_out", il);
+
+    return moe_out;
+}
+
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;