@@ -785,13 +785,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 bool   scale_w,
                float   w_scale,
 llama_expert_gating_func_type   gating_op,
-                 int   il) const {
+                 int   il,
+         ggml_tensor * probs_in) const {
     const int64_t n_embd   = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
-    ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
-    cb(logits, "ffn_moe_logits", il);
+    ggml_tensor * logits = nullptr;
+
+    if (probs_in == nullptr) {
+        logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+        cb(logits, "ffn_moe_logits", il);
+    } else {
+        logits = probs_in;
+    }
 
     ggml_tensor * probs = nullptr;
     switch (gating_op) {
@@ -884,6 +891,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
+        case LLM_FFN_RELU:
+            if (gate_exps) {
+                cur = ggml_reglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_reglu", il);
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cb(cur, "ffn_moe_relu", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -927,100 +942,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }
 
-ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
-         ggml_tensor * cur,
-         ggml_tensor * probs,
-         ggml_tensor * up_exps,
-         ggml_tensor * gate_exps,
-         ggml_tensor * down_exps,
-         ggml_tensor * exp_probs_b,
-             int64_t   n_expert,
-             int64_t   n_expert_used,
- llama_expert_gating_func_type   gating_op,
-                 int   il) const {
-    const int64_t n_embd   = cur->ne[0];
-    const int64_t n_tokens = cur->ne[1];
-
-    // add experts selection bias - introduced in DeepSeek V3
-    // leave probs unbiased as it's later used to get expert weights
-    ggml_tensor * selection_probs = probs;
-    if (exp_probs_b != nullptr) {
-        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
-        cb(selection_probs, "ffn_moe_probs_biased", il);
-    }
-
-    // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
-    cb(selected_experts, "ffn_moe_topk", il);
-
-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-    cb(weights, "ffn_moe_weights", il);
-
-    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
-        weights = ggml_soft_max(ctx0, weights);
-    } else {
-        weights = ggml_sigmoid(ctx0, weights);
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
-        cb(weights_sum, "ffn_moe_weights_sum", il);
-
-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights_norm", il);
-    }
-
-    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
-
-    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
-
-    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(up, "ffn_moe_up", il);
-
-    ggml_tensor * experts = nullptr;
-    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(cur, "ffn_moe_gate", il);
-
-    cur = ggml_reglu_split(ctx0, cur, up);
-    cb(cur, "ffn_moe_reglu", il);
-
-    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
-    cb(experts, "ffn_moe_down", il);
-
-    experts = ggml_mul(ctx0, experts, weights);
-    cb(cur, "ffn_moe_weighted", il);
-
-    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-    assert(n_expert_used > 0);
-
-    // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-        ggml_build_forward_expand(gf, cur_experts[i]);
-    }
-
-    // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    //       to avoid potentially a large number of add nodes during warmup
-    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = cur_experts[0];
-
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-    }
-
-    if (n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
-    }
-
-    cb(moe_out, "ffn_moe_out", il);
-
-    return moe_out;
-}
-
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;