@@ -12742,71 +12742,8 @@ struct llm_build_granite : public llm_graph_context {
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }

-        // For Granite architectures - scale residual
-        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // feed-forward network (non-MoE)
-        if (model.layers[il].ffn_gate_inp == nullptr) {
-
-            cur = build_norm(ffn_inp,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-
-        } else {
-            // MoE branch
-            cur = build_norm(ffn_inp,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            ggml_tensor * moe_out = build_moe_ffn(cur,
-                    model.layers[il].ffn_gate_inp,
-                    model.layers[il].ffn_up_exps,
-                    model.layers[il].ffn_gate_exps,
-                    model.layers[il].ffn_down_exps,
-                    nullptr,
-                    n_expert, n_expert_used,
-                    LLM_FFN_SILU, true,
-                    false, 0.0,
-                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                    il);
-            cb(moe_out, "ffn_moe_out", il);
-
-            // For Granite MoE Shared
-            if (hparams.n_ff_shexp > 0) {
-                ggml_tensor * ffn_shexp = build_ffn(cur,
-                        model.layers[il].ffn_up_shexp, NULL, NULL,
-                        model.layers[il].ffn_gate_shexp, NULL, NULL,
-                        model.layers[il].ffn_down_shexp, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-                cb(ffn_shexp, "ffn_shexp", il);
-
-                cur = ggml_add(ctx0, moe_out, ffn_shexp);
-                cb(cur, "ffn_out", il);
-            } else {
-                cur = moe_out;
-            }
-        }
-
-        // For Granite architectures - scale residual
-        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-        cur = ggml_add(ctx0, cur, ffn_inp);
-        cb(cur, "ffn_out", il);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
+        // ffn
+        cur = build_layer_ffn(this, cur, inpSA, model, il);

         // input for next layer
         inpL = cur;
@@ -12899,6 +12836,90 @@ struct llm_build_granite : public llm_graph_context {
         self->cb(cur, "attn_out", il);
         return cur;
     }
+
+    // static ffn layer builder for reuse in hybrid architectures
+    static ggml_tensor * build_layer_ffn(
+        const llm_graph_context * self,
+              ggml_tensor       * cur,
+              ggml_tensor       * inpSA,
+        const llama_model       & model,
+        const int                 il) {
+
+        auto * ctx0 = self->ctx0;
+        const auto & hparams = self->hparams;
+
+        // For Granite architectures - scale residual
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        self->cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = self->build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            self->cb(cur, "ffn_norm", il);
+
+            cur = self->build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            self->cb(cur, "ffn_out", il);
+
+        } else {
+            // MoE branch
+            cur = self->build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            self->cb(cur, "ffn_norm", il);
+
+            ggml_tensor * moe_out = self->build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    self->n_expert, self->n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            self->cb(moe_out, "ffn_moe_out", il);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                ggml_tensor * ffn_shexp = self->build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                self->cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                self->cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
+            }
+        }
+
+        // For Granite architectures - scale residual
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        self->cb(cur, "ffn_out", il);
+
+        cur = self->build_cvec(cur, il);
+        self->cb(cur, "l_out", il);
+
+        return cur;
+    }
 };

 struct llm_build_hybrid_mamba : public llm_graph_context {
@@ -12964,75 +12985,8 @@ struct llm_build_hybrid_mamba : public llm_graph_context {
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }

-        // For Granite architectures - scale residual
-        if (hparams.f_residual_scale) {
-            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-        }
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // feed-forward network (non-MoE)
-        if (model.layers[il].ffn_gate_inp == nullptr) {
-
-            cur = build_norm(ffn_inp,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-
-        } else {
-            // MoE branch
-            cur = build_norm(ffn_inp,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "ffn_norm", il);
-
-            ggml_tensor * moe_out = build_moe_ffn(cur,
-                    model.layers[il].ffn_gate_inp,
-                    model.layers[il].ffn_up_exps,
-                    model.layers[il].ffn_gate_exps,
-                    model.layers[il].ffn_down_exps,
-                    nullptr,
-                    n_expert, n_expert_used,
-                    LLM_FFN_SILU, true,
-                    false, 0.0,
-                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                    il);
-            cb(moe_out, "ffn_moe_out", il);
-
-            // For Granite MoE Shared
-            if (hparams.n_ff_shexp > 0) {
-                ggml_tensor * ffn_shexp = build_ffn(cur,
-                        model.layers[il].ffn_up_shexp, NULL, NULL,
-                        model.layers[il].ffn_gate_shexp, NULL, NULL,
-                        model.layers[il].ffn_down_shexp, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-                cb(ffn_shexp, "ffn_shexp", il);
-
-                cur = ggml_add(ctx0, moe_out, ffn_shexp);
-                cb(cur, "ffn_out", il);
-            } else {
-                cur = moe_out;
-            }
-        }
-
-        // For Granite architectures - scale residual
-        if (hparams.f_residual_scale) {
-            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-        }
-        cur = ggml_add(ctx0, cur, ffn_inp);
-        cb(cur, "ffn_out", il);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
+        // ffn
+        cur = llm_build_granite::build_layer_ffn(this, cur, inpSA, model, il);

         // input for next layer
         inpL = cur;
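
Net effect of the change: the residual-scaled FFN/MoE block that was duplicated between `llm_build_granite` and `llm_build_hybrid_mamba` now lives in one static helper, `llm_build_granite::build_layer_ffn`, which receives the graph context through an explicit `self` pointer instead of relying on inheritance. Below is a minimal sketch of how a further architecture could reuse it; the `llm_build_my_hybrid` struct, its constructor signature, and the elided attention block are illustrative placeholders and not part of this diff.

```cpp
// Hypothetical builder for a new hybrid architecture (illustrative only).
// Because build_layer_ffn() is a *static* member taking the graph context
// as an explicit `self` pointer, any llm_graph_context subclass can call
// it without inheriting from llm_build_granite.
struct llm_build_my_hybrid : public llm_graph_context {
    llm_build_my_hybrid(const llama_model & model, const llm_graph_params & params)
            : llm_graph_context(params) {
        // token embeddings as the first layer input
        ggml_tensor * inpL = build_inp_embd(model.tok_embd);

        for (int il = 0; il < (int) hparams.n_layer; ++il) {
            ggml_tensor * inpSA = inpL;
            ggml_tensor * cur   = inpL;

            // ... attention or recurrent block producing `cur` ...

            // shared residual + FFN/MoE + cvec logic; the Granite residual
            // scaling inside is skipped when hparams.f_residual_scale == 0
            cur = llm_build_granite::build_layer_ffn(this, cur, inpSA, model, il);

            // input for next layer
            inpL = cur;
        }

        // ... final norm and output head ...
    }
};
```

This mirrors the pattern the second hunk already uses for the attention path (`self->cb(cur, "attn_out", il)`): helpers are called through `self`, so the guarded `f_residual_scale` branches make the same function correct for both the dense Granite and the hybrid Mamba builders.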