Skip to content

Commit f783ddd

Browse files
committed
refactor: Pull granite ffn portion into a static function and reuse in hybrid
Branch: GraniteFour Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 4f50543 commit f783ddd

File tree

1 file changed

+88
-134
lines changed

1 file changed

+88
-134
lines changed

src/llama-model.cpp

Lines changed: 88 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -12742,71 +12742,8 @@ struct llm_build_granite : public llm_graph_context {
1274212742
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
1274312743
}
1274412744

12745-
// For Granite architectures - scale residual
12746-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12747-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12748-
cb(ffn_inp, "ffn_inp", il);
12749-
12750-
// feed-forward network (non-MoE)
12751-
if (model.layers[il].ffn_gate_inp == nullptr) {
12752-
12753-
cur = build_norm(ffn_inp,
12754-
model.layers[il].ffn_norm, NULL,
12755-
LLM_NORM_RMS, il);
12756-
cb(cur, "ffn_norm", il);
12757-
12758-
cur = build_ffn(cur,
12759-
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12760-
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
12761-
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
12762-
NULL,
12763-
LLM_FFN_SILU, LLM_FFN_PAR, il);
12764-
cb(cur, "ffn_out", il);
12765-
12766-
} else {
12767-
// MoE branch
12768-
cur = build_norm(ffn_inp,
12769-
model.layers[il].ffn_norm, NULL,
12770-
LLM_NORM_RMS, il);
12771-
cb(cur, "ffn_norm", il);
12772-
12773-
ggml_tensor * moe_out = build_moe_ffn(cur,
12774-
model.layers[il].ffn_gate_inp,
12775-
model.layers[il].ffn_up_exps,
12776-
model.layers[il].ffn_gate_exps,
12777-
model.layers[il].ffn_down_exps,
12778-
nullptr,
12779-
n_expert, n_expert_used,
12780-
LLM_FFN_SILU, true,
12781-
false, 0.0,
12782-
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
12783-
il);
12784-
cb(moe_out, "ffn_moe_out", il);
12785-
12786-
// For Granite MoE Shared
12787-
if (hparams.n_ff_shexp > 0) {
12788-
ggml_tensor * ffn_shexp = build_ffn(cur,
12789-
model.layers[il].ffn_up_shexp, NULL, NULL,
12790-
model.layers[il].ffn_gate_shexp, NULL, NULL,
12791-
model.layers[il].ffn_down_shexp, NULL, NULL,
12792-
NULL,
12793-
LLM_FFN_SILU, LLM_FFN_PAR, il);
12794-
cb(ffn_shexp, "ffn_shexp", il);
12795-
12796-
cur = ggml_add(ctx0, moe_out, ffn_shexp);
12797-
cb(cur, "ffn_out", il);
12798-
} else {
12799-
cur = moe_out;
12800-
}
12801-
}
12802-
12803-
// For Granite architectures - scale residual
12804-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12805-
cur = ggml_add(ctx0, cur, ffn_inp);
12806-
cb(cur, "ffn_out", il);
12807-
12808-
cur = build_cvec(cur, il);
12809-
cb(cur, "l_out", il);
12745+
// ffn
12746+
cur = build_layer_ffn(this, cur, inpSA, model, il);
1281012747

1281112748
// input for next layer
1281212749
inpL = cur;
@@ -12899,6 +12836,90 @@ struct llm_build_granite : public llm_graph_context {
1289912836
self->cb(cur, "attn_out", il);
1290012837
return cur;
1290112838
}
12839+
12840+
// static ffn layer builder for reuse in hybrid architectures
//
// Builds the feed-forward half of one layer on behalf of `self`:
//   1. scales `cur` by hparams.f_residual_scale (only when that value is
//      non-zero) and adds `inpSA` to form `ffn_inp`
//   2. runs build_norm (LLM_NORM_RMS) on `ffn_inp`, then either the dense
//      FFN or the MoE FFN, adding the shared-expert FFN output when
//      hparams.n_ff_shexp > 0
//   3. scales the result again under the same non-zero check, adds
//      `ffn_inp` back, and passes the sum through build_cvec
//
// self  - graph context supplying ctx0, hparams, n_expert, n_expert_used,
//         cb and the build_* helpers used below
// cur   - tensor entering the block (presumably the attention / recurrent
//         output of the layer - confirm against the callers)
// inpSA - tensor added to the (scaled) `cur` as the first residual
// model - source of the per-layer weights, read via model.layers[il]
// il    - layer index, also forwarded to every cb / build_* call
//
// Returns the tensor that is reported to cb as "l_out".
12841+
static ggml_tensor * build_layer_ffn(
12842+
const llm_graph_context * self,
12843+
ggml_tensor * cur,
12844+
ggml_tensor * inpSA,
12845+
const llama_model & model,
12846+
const int il) {
12847+
12848+
auto * ctx0 = self->ctx0;
12849+
const auto& hparams = self->hparams;
12850+
12851+
// For Granite architectures - scale residual
// (the scale op is skipped entirely when f_residual_scale is zero)
12852+
if (hparams.f_residual_scale) {
12853+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12854+
}
12855+
// first residual connection: ffn_inp is reused below as the norm input
// and as the second residual
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12856+
self->cb(ffn_inp, "ffn_inp", il);
12857+
12858+
// feed-forward network (non-MoE)
// taken when the layer has no ffn_gate_inp tensor
12859+
if (model.layers[il].ffn_gate_inp == nullptr) {
12860+
12861+
cur = self->build_norm(ffn_inp,
12862+
model.layers[il].ffn_norm, NULL,
12863+
LLM_NORM_RMS, il);
12864+
self->cb(cur, "ffn_norm", il);
12865+
12866+
// dense FFN using the layer's up / gate / down weights and their biases
cur = self->build_ffn(cur,
12867+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12868+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
12869+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
12870+
NULL,
12871+
LLM_FFN_SILU, LLM_FFN_PAR, il);
12872+
// NOTE: the "ffn_out" label is passed to cb again for the residual sum
// at the end of this function
self->cb(cur, "ffn_out", il);
12873+
12874+
} else {
12875+
// MoE branch
// same normalization as the dense branch; the normalized `cur` feeds both
// the routed experts and the optional shared expert
12876+
cur = self->build_norm(ffn_inp,
12877+
model.layers[il].ffn_norm, NULL,
12878+
LLM_NORM_RMS, il);
12879+
self->cb(cur, "ffn_norm", il);
12880+
12881+
// NOTE(review): the positional `true`, `false, 0.0` arguments are opaque
// from here - check build_moe_ffn's signature for their meaning
ggml_tensor * moe_out = self->build_moe_ffn(cur,
12882+
model.layers[il].ffn_gate_inp,
12883+
model.layers[il].ffn_up_exps,
12884+
model.layers[il].ffn_gate_exps,
12885+
model.layers[il].ffn_down_exps,
12886+
nullptr,
12887+
self->n_expert, self->n_expert_used,
12888+
LLM_FFN_SILU, true,
12889+
false, 0.0,
12890+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
12891+
il);
12892+
self->cb(moe_out, "ffn_moe_out", il);
12893+
12894+
// For Granite MoE Shared
// when n_ff_shexp > 0 a shared-expert FFN (no biases passed) runs on the
// same normalized input and its output is added to the MoE output
12895+
if (hparams.n_ff_shexp > 0) {
12896+
ggml_tensor * ffn_shexp = self->build_ffn(cur,
12897+
model.layers[il].ffn_up_shexp, NULL, NULL,
12898+
model.layers[il].ffn_gate_shexp, NULL, NULL,
12899+
model.layers[il].ffn_down_shexp, NULL, NULL,
12900+
NULL,
12901+
LLM_FFN_SILU, LLM_FFN_PAR, il);
12902+
self->cb(ffn_shexp, "ffn_shexp", il);
12903+
12904+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
12905+
self->cb(cur, "ffn_out", il);
12906+
} else {
12907+
// no shared expert: the MoE output is used as-is (no "ffn_out" cb here)
cur = moe_out;
12908+
}
12909+
}
12910+
12911+
// For Granite architectures - scale residual
// (same non-zero guard as the scale applied before the first residual)
12912+
if (hparams.f_residual_scale) {
12913+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12914+
}
12915+
// second residual connection: add the pre-norm FFN input back
cur = ggml_add(ctx0, cur, ffn_inp);
12916+
self->cb(cur, "ffn_out", il);
12917+
12918+
// build_cvec is applied last; its result is what gets labelled "l_out"
cur = self->build_cvec(cur, il);
12919+
self->cb(cur, "l_out", il);
12920+
12921+
return cur;
12922+
}
1290212923
};
1290312924

1290412925
struct llm_build_hybrid_mamba : public llm_graph_context {
@@ -12964,75 +12985,8 @@ struct llm_build_hybrid_mamba : public llm_graph_context {
1296412985
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
1296512986
}
1296612987

12967-
// For Granite architectures - scale residual
12968-
if (hparams.f_residual_scale) {
12969-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12970-
}
12971-
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12972-
cb(ffn_inp, "ffn_inp", il);
12973-
12974-
// feed-forward network (non-MoE)
12975-
if (model.layers[il].ffn_gate_inp == nullptr) {
12976-
12977-
cur = build_norm(ffn_inp,
12978-
model.layers[il].ffn_norm, NULL,
12979-
LLM_NORM_RMS, il);
12980-
cb(cur, "ffn_norm", il);
12981-
12982-
cur = build_ffn(cur,
12983-
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12984-
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
12985-
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
12986-
NULL,
12987-
LLM_FFN_SILU, LLM_FFN_PAR, il);
12988-
cb(cur, "ffn_out", il);
12989-
12990-
} else {
12991-
// MoE branch
12992-
cur = build_norm(ffn_inp,
12993-
model.layers[il].ffn_norm, NULL,
12994-
LLM_NORM_RMS, il);
12995-
cb(cur, "ffn_norm", il);
12996-
12997-
ggml_tensor * moe_out = build_moe_ffn(cur,
12998-
model.layers[il].ffn_gate_inp,
12999-
model.layers[il].ffn_up_exps,
13000-
model.layers[il].ffn_gate_exps,
13001-
model.layers[il].ffn_down_exps,
13002-
nullptr,
13003-
n_expert, n_expert_used,
13004-
LLM_FFN_SILU, true,
13005-
false, 0.0,
13006-
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
13007-
il);
13008-
cb(moe_out, "ffn_moe_out", il);
13009-
13010-
// For Granite MoE Shared
13011-
if (hparams.n_ff_shexp > 0) {
13012-
ggml_tensor * ffn_shexp = build_ffn(cur,
13013-
model.layers[il].ffn_up_shexp, NULL, NULL,
13014-
model.layers[il].ffn_gate_shexp, NULL, NULL,
13015-
model.layers[il].ffn_down_shexp, NULL, NULL,
13016-
NULL,
13017-
LLM_FFN_SILU, LLM_FFN_PAR, il);
13018-
cb(ffn_shexp, "ffn_shexp", il);
13019-
13020-
cur = ggml_add(ctx0, moe_out, ffn_shexp);
13021-
cb(cur, "ffn_out", il);
13022-
} else {
13023-
cur = moe_out;
13024-
}
13025-
}
13026-
13027-
// For Granite architectures - scale residual
13028-
if (hparams.f_residual_scale) {
13029-
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13030-
}
13031-
cur = ggml_add(ctx0, cur, ffn_inp);
13032-
cb(cur, "ffn_out", il);
13033-
13034-
cur = build_cvec(cur, il);
13035-
cb(cur, "l_out", il);
12988+
// ffn
12989+
cur = llm_build_granite::build_layer_ffn(this, cur, inpSA, model, il);
1303612990

1303712991
// input for next layer
1303812992
inpL = cur;

0 commit comments

Comments
 (0)