@@ -212,6 +212,8 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
+    LLM_ARCH_GRANITE = 46,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };

@@ -257,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS, "jais" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -293,6 +297,12 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_SWIN_NORM,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -307,6 +317,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -391,6 +402,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -405,6 +418,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
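The key strings above are printf-style patterns: the "%s" prefix is filled in with the architecture name registered in LLM_ARCH_NAMES, so a Granite GGUF carries metadata keys such as granite.residual_scale and granite.attention.scale. A minimal standalone sketch of that substitution (format_key is a hypothetical helper for illustration, not llama.cpp API):

    #include <cstdio>

    // Illustrative only: combines a "%s."-prefixed key pattern with the
    // architecture name registered above ("granite" / "granitemoe").
    static void format_key(const char * pattern, const char * arch) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), pattern, arch);
        std::printf("%s\n", buf);
    }

    int main() {
        format_key("%s.residual_scale",  "granite");  // -> granite.residual_scale
        format_key("%s.embedding_scale", "granite");  // -> granite.embedding_scale
        format_key("%s.attention.scale", "granite");  // -> granite.attention.scale
        return 0;
    }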
@@ -1298,6 +1312,42 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
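The per-layer tensor names above are likewise printf-style templates, with "%d" standing for the block index; the loader later expands them and appends a suffix such as ".weight" via its tn(...) helper (see the LLM_TENSOR_TOKEN_EMBD line in llm_load_tensors further down). A hedged standalone sketch of that expansion (blk_tensor_name is made up for illustration, not the actual tn() helper):

    #include <cstdio>
    #include <string>

    // Hypothetical helper mirroring how "blk.%d...." templates are expanded
    // for one layer index and suffixed with ".weight".
    static std::string blk_tensor_name(const char * templ, int il) {
        char buf[64];
        std::snprintf(buf, sizeof(buf), templ, il);
        return std::string(buf) + ".weight";
    }

    int main() {
        std::printf("%s\n", blk_tensor_name("blk.%d.attn_q", 0).c_str());         // blk.0.attn_q.weight
        std::printf("%s\n", blk_tensor_name("blk.%d.ffn_gate_exps", 11).c_str()); // blk.11.ffn_gate_exps.weight
        return 0;
    }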
@@ -2203,6 +2253,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -2259,6 +2314,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;

         return false;
     }
@@ -5283,6 +5341,22 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -5970,6 +6044,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
+
 }

 // Returns false if cancelled by progress_callback
@@ -6138,6 +6219,8 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -7927,6 +8010,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
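In llm_build_inp_embd the Granite path multiplies the looked-up token embeddings by f_embedding_scale; since the field defaults to 0.0f, models that do not set the key skip the ggml_scale call entirely. A minimal standalone sketch of the same operation on a plain vector (all numeric values are made up):

    #include <vector>

    // Scale an embedding row in place, mirroring the ggml_scale(ctx, inpL, ...)
    // call above. A zero scale means "not set", so the input is left untouched.
    static void apply_embedding_scale(std::vector<float> & embd, float f_embedding_scale) {
        if (f_embedding_scale == 0.0f) {
            return;
        }
        for (float & x : embd) {
            x *= f_embedding_scale;
        }
    }

    int main() {
        std::vector<float> embd = {0.25f, -0.5f, 1.0f}; // made-up embedding values
        apply_embedding_scale(embd, 12.0f);             // hypothetical embedding scale
        return 0;
    }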
@@ -8358,12 +8446,15 @@ static struct ggml_tensor * llm_build_kqv(
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
+        //ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

+        //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
@@ -8917,6 +9008,8 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -8933,6 +9026,9 @@ struct llm_build_context {

             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+            if (hparams.f_attention_scale != 0) {
+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+            }
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
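When f_attention_scale is set, the builder pre-scales Qcur by it and forces kq_scale to 1.0f, so the model's own attention multiplier replaces the usual 1/sqrt(n_embd_head) factor; scaling Q before the Q·K^T product yields the same scores as scaling the product afterwards. A small standalone check of that equivalence (all numeric values are made up):

    #include <cassert>
    #include <cmath>

    // Mirrors the kq_scale selection above: with no attention scale, use the
    // standard 1/sqrt(n_embd_head); otherwise Q is pre-scaled and the KQ
    // product gets a neutral 1.0f scale.
    static float select_kq_scale(float f_attention_scale, int n_embd_head) {
        return f_attention_scale == 0.0f ? 1.0f/std::sqrt(float(n_embd_head)) : 1.0f;
    }

    int main() {
        const float q = 0.7f, k = -1.3f;            // made-up single-element Q and K
        const float f_attention_scale = 0.0078125f; // hypothetical attention scale

        // Pre-scaling Q (what the graph does) equals scaling the raw score.
        const float pre_scaled = (q * f_attention_scale) * k * select_kq_scale(f_attention_scale, 128);
        const float post_scaled = (q * k) * f_attention_scale;
        assert(std::fabs(pre_scaled - post_scaled) < 1e-6f);
        return 0;
    }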
@@ -8969,7 +9065,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)) , cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale , cb, il);
             }

             if (il == n_layer - 1) {
@@ -8980,6 +9076,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -9016,6 +9117,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }

+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -9035,6 +9141,12 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
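Note that the final step divides the lm_head output by f_logit_scale (the graph multiplies by 1.0f / f_logit_scale), so the logit scale acts as a divisor here; as with the other factors, the 0.0f default leaves the logits untouched. A tiny standalone sketch (the scale value is made up):

    #include <vector>

    // Mirrors the final ggml_scale above: logits are divided by f_logit_scale
    // when it is set; a zero value leaves them unchanged.
    static void scale_logits(std::vector<float> & logits, float f_logit_scale) {
        if (f_logit_scale == 0.0f) {
            return;
        }
        const float inv = 1.0f / f_logit_scale;
        for (float & x : logits) {
            x *= inv;
        }
    }

    int main() {
        std::vector<float> logits = {3.0f, -1.5f, 0.25f}; // made-up raw logits
        scale_logits(logits, 8.0f);                       // hypothetical logit scale
        return 0;
    }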
@@ -14032,6 +14144,8 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -17470,6 +17584,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2