Skip to content

Commit 3fe7676

Browse files
committed
update llm graph
1 parent 69177c7 commit 3fe7676

File tree

1 file changed

+48
-38
lines changed

1 file changed

+48
-38
lines changed

src/llama-model.cpp

Lines changed: 48 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6118,15 +6118,8 @@ void llama_model::print_info() const {
61186118
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
61196119
}
61206120

6121-
if (arch == LLM_ARCH_BAILINGMOE) {
6122-
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6123-
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6124-
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6125-
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6126-
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6127-
}
6128-
6129-
if (arch == LLM_ARCH_BAILINGMOE_V2) {
6121+
if (arch == LLM_ARCH_BAILINGMOE ||
6122+
arch == LLM_ARCH_BAILINGMOE_V2) {
61306123
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
61316124
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
61326125
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
@@ -16693,12 +16686,18 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1669316686
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
1669416687
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
1669516688

16689+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
16690+
cb(Qcur, "Qcur_normed", il);
16691+
1669616692
Qcur = ggml_rope_ext(
1669716693
ctx0, Qcur, inp_pos, rope_factors,
1669816694
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
1669916695
ext_factor, attn_factor, beta_fast, beta_slow
1670016696
);
1670116697

16698+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
16699+
cb(Kcur, "Kcur_normed", il);
16700+
1670216701
Kcur = ggml_rope_ext(
1670316702
ctx0, Kcur, inp_pos, rope_factors,
1670416703
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16727,41 +16726,52 @@ struct llm_build_bailingmoe_v2 : public llm_graph_context {
1672716726
LLM_NORM_RMS, il);
1672816727
cb(cur, "ffn_norm", il);
1672916728

16730-
ggml_tensor * moe_out =
16731-
build_moe_ffn(cur,
16732-
model.layers[il].ffn_gate_inp,
16733-
model.layers[il].ffn_up_exps,
16734-
model.layers[il].ffn_gate_exps,
16735-
model.layers[il].ffn_down_exps,
16736-
nullptr,
16737-
n_expert, n_expert_used,
16738-
LLM_FFN_SILU, hparams.expert_weights_norm,
16739-
false, hparams.expert_weights_scale,
16740-
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
16741-
il);
16742-
cb(moe_out, "ffn_moe_out", il);
16743-
16744-
// FFN shared expert
16745-
{
16746-
ggml_tensor * ffn_shexp = build_ffn(cur,
16747-
model.layers[il].ffn_up_shexp, NULL, NULL,
16748-
model.layers[il].ffn_gate_shexp, NULL, NULL,
16749-
model.layers[il].ffn_down_shexp, NULL, NULL,
16729+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
16730+
cur = build_ffn(cur,
16731+
model.layers[il].ffn_up, NULL, NULL,
16732+
model.layers[il].ffn_gate, NULL, NULL,
16733+
model.layers[il].ffn_down, NULL, NULL,
1675016734
NULL,
1675116735
LLM_FFN_SILU, LLM_FFN_PAR, il);
16752-
cb(ffn_shexp, "ffn_shexp", il);
16753-
16754-
cur = ggml_add(ctx0, moe_out, ffn_shexp);
1675516736
cb(cur, "ffn_out", il);
16756-
}
16737+
} else {
16738+
// MoE branch
16739+
ggml_tensor * moe_out =
16740+
build_moe_ffn(cur,
16741+
model.layers[il].ffn_gate_inp,
16742+
model.layers[il].ffn_up_exps,
16743+
model.layers[il].ffn_gate_exps,
16744+
model.layers[il].ffn_down_exps,
16745+
model.layers[il].ffn_exp_probs_b,
16746+
n_expert, n_expert_used,
16747+
LLM_FFN_SILU, hparams.expert_weights_norm,
16748+
true, hparams.expert_weights_scale,
16749+
(llama_expert_gating_func_type) hparams.expert_gating_func,
16750+
il);
16751+
cb(moe_out, "ffn_moe_out", il);
1675716752

16758-
cur = ggml_add(ctx0, cur, ffn_inp);
16753+
// FFN shared expert
16754+
{
16755+
ggml_tensor * ffn_shexp = build_ffn(cur,
16756+
model.layers[il].ffn_up_shexp, NULL, NULL,
16757+
model.layers[il].ffn_gate_shexp, NULL, NULL,
16758+
model.layers[il].ffn_down_shexp, NULL, NULL,
16759+
NULL,
16760+
LLM_FFN_SILU, LLM_FFN_PAR, il);
16761+
cb(ffn_shexp, "ffn_shexp", il);
1675916762

16760-
cur = build_cvec(cur, il);
16761-
cb(cur, "l_out", il);
16763+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
16764+
cb(cur, "ffn_out", il);
16765+
}
1676216766

16763-
// input for next layer
16764-
inpL = cur;
16767+
cur = ggml_add(ctx0, cur, ffn_inp);
16768+
16769+
cur = build_cvec(cur, il);
16770+
cb(cur, "l_out", il);
16771+
16772+
// input for next layer
16773+
inpL = cur;
16774+
}
1676516775
}
1676616776

1676716777
cur = inpL;

0 commit comments

Comments
 (0)