@@ -6118,15 +6118,8 @@ void llama_model::print_info() const {
61186118 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
61196119 }
61206120
6121- if (arch == LLM_ARCH_BAILINGMOE) {
6122- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6123- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6124- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6125- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6126- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6127- }
6128-
6129- if (arch == LLM_ARCH_BAILINGMOE_V2) {
6121+ if (arch == LLM_ARCH_BAILINGMOE ||
6122+ arch == LLM_ARCH_BAILINGMOE_V2) {
61306123 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
61316124 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
61326125 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
@@ -16693,12 +16686,18 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1669316686 Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
1669416687 Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
1669516688
16689+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
16690+ cb(Qcur, "Qcur_normed", il);
16691+
1669616692 Qcur = ggml_rope_ext(
1669716693 ctx0, Qcur, inp_pos, rope_factors,
1669816694 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
1669916695 ext_factor, attn_factor, beta_fast, beta_slow
1670016696 );
1670116697
16698+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
16699+ cb(Kcur, "Kcur_normed", il);
16700+
1670216701 Kcur = ggml_rope_ext(
1670316702 ctx0, Kcur, inp_pos, rope_factors,
1670416703 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16727,41 +16726,52 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1672716726 LLM_NORM_RMS, il);
1672816727 cb(cur, "ffn_norm", il);
1672916728
16730- ggml_tensor * moe_out =
16731- build_moe_ffn(cur,
16732- model.layers[il].ffn_gate_inp,
16733- model.layers[il].ffn_up_exps,
16734- model.layers[il].ffn_gate_exps,
16735- model.layers[il].ffn_down_exps,
16736- nullptr,
16737- n_expert, n_expert_used,
16738- LLM_FFN_SILU, hparams.expert_weights_norm,
16739- false, hparams.expert_weights_scale,
16740- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
16741- il);
16742- cb(moe_out, "ffn_moe_out", il);
16743-
16744- // FFN shared expert
16745- {
16746- ggml_tensor * ffn_shexp = build_ffn(cur,
16747- model.layers[il].ffn_up_shexp, NULL, NULL,
16748- model.layers[il].ffn_gate_shexp, NULL, NULL,
16749- model.layers[il].ffn_down_shexp, NULL, NULL,
16729+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
16730+ cur = build_ffn(cur,
16731+ model.layers[il].ffn_up, NULL, NULL,
16732+ model.layers[il].ffn_gate, NULL, NULL,
16733+ model.layers[il].ffn_down, NULL, NULL,
1675016734 NULL,
1675116735 LLM_FFN_SILU, LLM_FFN_PAR, il);
16752- cb(ffn_shexp, "ffn_shexp", il);
16753-
16754- cur = ggml_add(ctx0, moe_out, ffn_shexp);
1675516736 cb(cur, "ffn_out", il);
16756- }
16737+ } else {
16738+ // MoE branch
16739+ ggml_tensor * moe_out =
16740+ build_moe_ffn(cur,
16741+ model.layers[il].ffn_gate_inp,
16742+ model.layers[il].ffn_up_exps,
16743+ model.layers[il].ffn_gate_exps,
16744+ model.layers[il].ffn_down_exps,
16745+ model.layers[il].ffn_exp_probs_b,
16746+ n_expert, n_expert_used,
16747+ LLM_FFN_SILU, hparams.expert_weights_norm,
16748+ true, hparams.expert_weights_scale,
16749+ (llama_expert_gating_func_type) hparams.expert_gating_func,
16750+ il);
16751+ cb(moe_out, "ffn_moe_out", il);
1675716752
16758- cur = ggml_add(ctx0, cur, ffn_inp);
16753+ // FFN shared expert
16754+ {
16755+ ggml_tensor * ffn_shexp = build_ffn(cur,
16756+ model.layers[il].ffn_up_shexp, NULL, NULL,
16757+ model.layers[il].ffn_gate_shexp, NULL, NULL,
16758+ model.layers[il].ffn_down_shexp, NULL, NULL,
16759+ NULL,
16760+ LLM_FFN_SILU, LLM_FFN_PAR, il);
16761+ cb(ffn_shexp, "ffn_shexp", il);
1675916762
16760- cur = build_cvec(cur, il);
16761- cb(cur, "l_out", il);
16763+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
16764+ cb(cur, "ffn_out", il);
16765+ }
1676216766
16763- // input for next layer
16764- inpL = cur;
16767+ cur = ggml_add(ctx0, cur, ffn_inp);
16768+
16769+ cur = build_cvec(cur, il);
16770+ cb(cur, "l_out", il);
16771+
16772+ // input for next layer
16773+ inpL = cur;
16774+ }
1676516775 }
1676616776
1676716777 cur = inpL;
0 commit comments