@@ -178,6 +178,8 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
+    LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };

@@ -225,6 +227,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -261,6 +265,8 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -275,6 +281,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -359,6 +366,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -373,6 +382,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -1303,6 +1313,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1931,6 +1976,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -1987,6 +2037,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;

         return false;
     }
@@ -4998,6 +5051,22 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -5665,6 +5734,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5837,6 +5912,8 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -7669,6 +7746,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
@@ -8646,6 +8728,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -8698,7 +8781,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }

             if (il == n_layer - 1) {
@@ -8709,6 +8792,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -8745,6 +8833,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }

+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -8764,6 +8857,12 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
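
Taken together, the hunks above thread four optional scale factors through the shared LLaMA graph builder. The standalone sketch below is not llama.cpp code: the struct and the sample values are hypothetical, and it only restates where each factor acts, mirroring llm_build_inp_embd() and build_llama() as shown above.

// Standalone sketch, not part of llama.cpp: summarizes where the Granite
// scale factors loaded in llm_load_hparams() act in the graph built above.
#include <cmath>
#include <cstdio>

struct granite_scales {
    float f_embedding_scale; // multiplies token embeddings in llm_build_inp_embd()
    float f_attention_scale; // used as the KQ scale instead of 1/sqrt(n_embd_head) when non-zero
    float f_residual_scale;  // multiplies the attention and FFN branches before each residual add
    float f_logit_scale;     // final logits are divided by this value when non-zero
};

// Mirrors the kq_scale selection in build_llama() above.
static float kq_scale(const granite_scales & s, int n_embd_head) {
    return s.f_attention_scale == 0.0f ? 1.0f / sqrtf((float) n_embd_head) : s.f_attention_scale;
}

int main() {
    const granite_scales s = { 12.0f, 0.015625f, 0.22f, 8.0f }; // hypothetical values, for illustration only
    printf("KQ scale with n_embd_head = 128: %f\n", kq_scale(s, 128));
    printf("logit multiplier:                %f\n", 1.0f / s.f_logit_scale);
    return 0;
}

Because all four factors default to 0.0f in llama_hparams, non-Granite models fall through every scaling branch unchanged.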
@@ -13942,6 +14041,8 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -17195,6 +17296,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -19191,6 +19294,19 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
+    } else if (tmpl == "granite" || tmpl == "granitemoe" || tmpl_contains("<|start_of_role|>") || tmpl_contains("Granite")) {
+        // IBM Granite template
+        for (const auto & message : chat) {
+            std::string role(message->role);
+            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+            if (role == "assistant_tool_call") {
+                ss << "<|tool_call|>";
+            }
+            ss << message->content << "<|end_of_text|>\n";
+        }
+        if (add_ass) {
+            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+        }
     } else {
         // template not supported
         return -1;
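
For reference, here is a minimal standalone sketch of what the Granite template branch above produces. It re-implements the loop outside llama.cpp instead of calling llama_chat_apply_template(), and the chat_message struct and sample messages are made up for illustration.

// Standalone sketch: mirrors the Granite chat-template branch above and
// prints the rendered prompt for a small, made-up conversation.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct chat_message { std::string role; std::string content; };

int main() {
    const std::vector<chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "What is IBM Granite?" },
    };
    const bool add_ass = true; // append the assistant generation prompt

    std::ostringstream ss;
    for (const auto & message : chat) {
        ss << "<|start_of_role|>" << message.role << "<|end_of_role|>";
        if (message.role == "assistant_tool_call") {
            ss << "<|tool_call|>";
        }
        ss << message.content << "<|end_of_text|>\n";
    }
    if (add_ass) {
        ss << "<|start_of_role|>assistant<|end_of_role|>\n";
    }
    std::cout << ss.str();
    return 0;
}

With add_ass set, the rendered prompt ends with the <|start_of_role|>assistant<|end_of_role|> header, which is what cues the model to produce the assistant turn.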