@@ -128,8 +128,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 cb(Vcur, "Vcur", il);
 
                 // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
-                cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
-                                 model.layers[il].wv_b, kq_scale, il);
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
             } else {
                 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                 cb(kv, "kv", il);
@@ -159,8 +160,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 cb(Kcur, "Kcur", il);
 
                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
-                cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
-                                 kq_scale, il);
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
         };
         if (il == n_layer - 1 && inp_out_ids) {
@@ -174,23 +176,34 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
         cb(cur, "ffn_norm", il);
 
         if ((uint32_t) il < hparams.n_layer_dense_lead) {
-            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
             // MoE branch
-            ggml_tensor * moe_out = build_moe_ffn(
-                cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps,
-                model.layers[il].ffn_down_exps, model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU,
-                hparams.expert_weights_norm, true, hparams.expert_weights_scale,
-                (llama_expert_gating_func_type) hparams.expert_gating_func, il);
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
             cb(moe_out, "ffn_moe_out", il);
 
             // FFN shared expert
             {
                 ggml_tensor * ffn_shexp =
-                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL,
-                              NULL, model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(ffn_shexp, "ffn_shexp", il);
 
                 cur = ggml_add(ctx0, moe_out, ffn_shexp);