@@ -4610,11 +4610,6 @@ struct llm_build_llama : public llm_graph_context {
46104610 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
46114611 }
46124612
4613- // For Granite architecture
4614- if (hparams.f_residual_scale) {
4615- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4616- }
4617-
46184613 ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
46194614 cb(ffn_inp, "ffn_inp", il);
46204615
@@ -4672,7 +4667,7 @@ struct llm_build_llama : public llm_graph_context {
46724667 LLM_NORM_RMS, il);
46734668 cb(cur, "ffn_norm", il);
46744669
4675- ggml_tensor * moe_out = build_moe_ffn(cur,
4670+ cur = build_moe_ffn(cur,
46764671 model.layers[il].ffn_gate_inp,
46774672 model.layers[il].ffn_up_exps,
46784673 model.layers[il].ffn_gate_exps,
@@ -4683,28 +4678,7 @@ struct llm_build_llama : public llm_graph_context {
46834678 false, 0.0,
46844679 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
46854680 il);
4686- cb(moe_out, "ffn_moe_out", il);
4687-
4688- // For Granite MoE Shared
4689- if (hparams.n_ff_shexp > 0) {
4690- ggml_tensor * ffn_shexp = build_ffn(cur,
4691- model.layers[il].ffn_up_shexp, NULL, NULL,
4692- model.layers[il].ffn_gate_shexp, NULL, NULL,
4693- model.layers[il].ffn_down_shexp, NULL, NULL,
4694- NULL,
4695- LLM_FFN_SILU, LLM_FFN_PAR, il);
4696- cb(ffn_shexp, "ffn_shexp", il);
4697-
4698- cur = ggml_add(ctx0, moe_out, ffn_shexp);
4699- cb(cur, "ffn_out", il);
4700- } else {
4701- cur = moe_out;
4702- }
4703- }
4704-
4705- // For Granite architecture
4706- if (hparams.f_residual_scale) {
4707- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4681+ cb(cur, "ffn_moe_out", il);
47084682 }
47094683
47104684 cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4729,11 +4703,6 @@ struct llm_build_llama : public llm_graph_context {
47294703 // lm_head
47304704 cur = build_lora_mm(model.output, cur);
47314705
4732- // For Granite architecture
4733- if (hparams.f_logit_scale) {
4734- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4735- }
4736-
47374706 cb(cur, "result_output", -1);
47384707 res->t_logits = cur;
47394708
@@ -4844,11 +4813,6 @@ struct llm_build_deci : public llm_graph_context {
48444813 continue;
48454814 }
48464815
4847- // For Granite architecture
4848- if (hparams.f_residual_scale) {
4849- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4850- }
4851-
48524816 // modified to support attention-free layer of Llama-3_1-Nemotron-51B
48534817 ggml_tensor * ffn_inp = cur;
48544818 if (n_head > 0) {
@@ -4872,11 +4836,6 @@ struct llm_build_deci : public llm_graph_context {
48724836 cb(cur, "ffn_out", il);
48734837 }
48744838
4875- // For Granite architecture
4876- if (hparams.f_residual_scale) {
4877- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4878- }
4879-
48804839 cur = ggml_add(ctx0, cur, ffn_inp);
48814840 cb(cur, "ffn_out", il);
48824841
@@ -4899,11 +4858,6 @@ struct llm_build_deci : public llm_graph_context {
48994858 // lm_head
49004859 cur = build_lora_mm(model.output, cur);
49014860
4902- // For Granite architecture
4903- if (hparams.f_logit_scale) {
4904- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4905- }
4906-
49074861 cb(cur, "result_output", -1);
49084862 res->t_logits = cur;
49094863
@@ -12242,6 +12196,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
1224212196 }
1224312197};
1224412198
12199+
12200+ struct llm_build_granite : public llm_graph_context {
12201+ llm_build_granite(
12202+ const llama_model & model,
12203+ const llm_graph_params & params,
12204+ ggml_cgraph * gf,
12205+ const bool use_rope = true)
12206+ : llm_graph_context(params) {
12207+
12208+ const int64_t n_embd_head = hparams.n_embd_head_v;
12209+
12210+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
12211+ GGML_ASSERT(n_embd_head == hparams.n_rot);
12212+
12213+ ggml_tensor * cur;
12214+ ggml_tensor * inpL;
12215+
12216+ inpL = build_inp_embd(model.tok_embd);
12217+
12218+ // inp_pos - used for rope if enabled
12219+ ggml_tensor * inp_pos;
12220+ if (use_rope) {
12221+ inp_pos = build_inp_pos();
12222+ }
12223+
12224+ auto * inp_attn = build_attn_inp_kv_unified();
12225+
12226+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
12227+ for (int il = 0; il < n_layer; ++il) {
12228+ ggml_tensor * inpSA = inpL;
12229+
12230+ // norm
12231+ cur = build_norm(inpL,
12232+ model.layers[il].attn_norm, NULL,
12233+ LLM_NORM_RMS, il);
12234+ cb(cur, "attn_norm", il);
12235+
12236+ // self-attention
12237+ {
12238+ // compute Q and K and (optionally) RoPE them
12239+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
12240+ cb(Qcur, "Qcur", il);
12241+ if (model.layers[il].bq) {
12242+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
12243+ cb(Qcur, "Qcur", il);
12244+ }
12245+
12246+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
12247+ cb(Kcur, "Kcur", il);
12248+ if (model.layers[il].bk) {
12249+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
12250+ cb(Kcur, "Kcur", il);
12251+ }
12252+
12253+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
12254+ cb(Vcur, "Vcur", il);
12255+ if (model.layers[il].bv) {
12256+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12257+ cb(Vcur, "Vcur", il);
12258+ }
12259+
12260+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12261+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12262+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12263+
12264+ if (use_rope) {
12265+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
12266+ Qcur = ggml_rope_ext(
12267+ ctx0, Qcur, inp_pos, rope_factors,
12268+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12269+ ext_factor, attn_factor, beta_fast, beta_slow
12270+ );
12271+
12272+ Kcur = ggml_rope_ext(
12273+ ctx0, Kcur, inp_pos, rope_factors,
12274+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12275+ ext_factor, attn_factor, beta_fast, beta_slow
12276+ );
12277+ }
12278+
12279+ cb(Qcur, "Qcur", il);
12280+ cb(Kcur, "Kcur", il);
12281+ cb(Vcur, "Vcur", il);
12282+
12283+ cur = build_attn(inp_attn, gf,
12284+ model.layers[il].wo, model.layers[il].bo,
12285+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12286+ cb(cur, "attn_out", il);
12287+ }
12288+
12289+ if (il == n_layer - 1) {
12290+ // skip computing output for unused tokens
12291+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12292+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12293+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12294+ }
12295+
12296+ // For Granite architectures - scale residual
12297+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12298+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12299+ cb(ffn_inp, "ffn_inp", il);
12300+
12301+ // feed-forward network (non-MoE)
12302+ if (model.layers[il].ffn_gate_inp == nullptr) {
12303+
12304+ cur = build_norm(ffn_inp,
12305+ model.layers[il].ffn_norm, NULL,
12306+ LLM_NORM_RMS, il);
12307+ cb(cur, "ffn_norm", il);
12308+
12309+ cur = build_ffn(cur,
12310+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12311+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
12312+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
12313+ NULL,
12314+ LLM_FFN_SILU, LLM_FFN_PAR, il);
12315+ cb(cur, "ffn_out", il);
12316+
12317+ } else {
12318+ // MoE branch
12319+ cur = build_norm(ffn_inp,
12320+ model.layers[il].ffn_norm, NULL,
12321+ LLM_NORM_RMS, il);
12322+ cb(cur, "ffn_norm", il);
12323+
12324+ ggml_tensor * moe_out = build_moe_ffn(cur,
12325+ model.layers[il].ffn_gate_inp,
12326+ model.layers[il].ffn_up_exps,
12327+ model.layers[il].ffn_gate_exps,
12328+ model.layers[il].ffn_down_exps,
12329+ nullptr,
12330+ n_expert, n_expert_used,
12331+ LLM_FFN_SILU, true,
12332+ false, 0.0,
12333+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
12334+ il);
12335+ cb(moe_out, "ffn_moe_out", il);
12336+
12337+ // For Granite MoE Shared
12338+ if (hparams.n_ff_shexp > 0) {
12339+ ggml_tensor * ffn_shexp = build_ffn(cur,
12340+ model.layers[il].ffn_up_shexp, NULL, NULL,
12341+ model.layers[il].ffn_gate_shexp, NULL, NULL,
12342+ model.layers[il].ffn_down_shexp, NULL, NULL,
12343+ NULL,
12344+ LLM_FFN_SILU, LLM_FFN_PAR, il);
12345+ cb(ffn_shexp, "ffn_shexp", il);
12346+
12347+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
12348+ cb(cur, "ffn_out", il);
12349+ } else {
12350+ cur = moe_out;
12351+ }
12352+ }
12353+
12354+ // For Granite architectures - scale residual
12355+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12356+ cur = ggml_add(ctx0, cur, ffn_inp);
12357+ cb(cur, "ffn_out", il);
12358+
12359+ cur = build_cvec(cur, il);
12360+ cb(cur, "l_out", il);
12361+
12362+ // input for next layer
12363+ inpL = cur;
12364+ }
12365+
12366+ cur = inpL;
12367+
12368+ cur = build_norm(cur,
12369+ model.output_norm, NULL,
12370+ LLM_NORM_RMS, -1);
12371+
12372+ cb(cur, "result_norm", -1);
12373+ res->t_embd = cur;
12374+
12375+ // lm_head
12376+ cur = build_lora_mm(model.output, cur);
12377+
12378+ // For Granite architectures - scale logits
12379+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
12380+ cb(cur, "result_output", -1);
12381+ res->t_logits = cur;
12382+
12383+ ggml_build_forward_expand(gf, cur);
12384+ }
12385+ };
12386+
1224512387// ref: https://github.com/facebookresearch/chameleon
1224612388// based on the original build_llama() function, changes:
1224712389// * qk-norm
@@ -12949,9 +13091,6 @@ llm_graph_result_ptr llama_model::build_graph(
1294913091 case LLM_ARCH_LLAMA:
1295013092 case LLM_ARCH_LLAMA4:
1295113093 case LLM_ARCH_MINICPM:
12952- case LLM_ARCH_GRANITE:
12953- case LLM_ARCH_GRANITE_MOE:
12954- case LLM_ARCH_GRANITE_MOE_SHARED:
1295513094 {
1295613095 llm = std::make_unique<llm_build_llama>(*this, params, gf);
1295713096 } break;
@@ -13182,6 +13321,12 @@ llm_graph_result_ptr llama_model::build_graph(
1318213321 {
1318313322 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
1318413323 } break;
13324+ case LLM_ARCH_GRANITE:
13325+ case LLM_ARCH_GRANITE_MOE:
13326+ case LLM_ARCH_GRANITE_MOE_SHARED:
13327+ {
13328+ llm = std::make_unique<llm_build_granite>(*this, params, gf);
13329+ } break;
1318513330 case LLM_ARCH_CHAMELEON:
1318613331 {
1318713332 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
0 commit comments