
Commit 17d7f4a

Merge pull request #608 from gabe-l-hart/GraniteThreeSupport
Granite three support
2 parents 9d92413 + f44916f commit 17d7f4a

File tree

1 file changed: +117 -1 lines changed


llama.cpp/llama.cpp

Lines changed: 117 additions & 1 deletion
@@ -178,6 +178,8 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
+    LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };

@@ -225,6 +227,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -261,6 +265,8 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -275,6 +281,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -359,6 +366,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -373,6 +382,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
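Note: the "%s" placeholder in LLM_KV_NAMES is substituted with the architecture name registered in LLM_ARCH_NAMES, so for these models the loader looks up GGUF metadata keys such as granite.residual_scale or granitemoe.attention.scale. A minimal standalone sketch of how the names resolve (illustrative only, not part of the commit):

#include <cstdio>

int main() {
    // "%s" is replaced by the architecture name ("granite" or "granitemoe"),
    // producing the GGUF keys introduced in the hunk above.
    const char * archs[] = { "granite", "granitemoe" };
    const char * fmts[]  = { "%s.residual_scale", "%s.embedding_scale", "%s.attention.scale" };
    for (const char * arch : archs) {
        for (const char * fmt : fmts) {
            char key[128];
            snprintf(key, sizeof(key), fmt, arch);
            printf("%s\n", key); // e.g. granite.residual_scale
        }
    }
    return 0;
}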
@@ -1303,6 +1313,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1931,6 +1976,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -1987,6 +2037,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;

         return false;
     }
@@ -4998,6 +5051,22 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -5665,6 +5734,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5837,6 +5912,8 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -7669,6 +7746,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
@@ -8646,6 +8728,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -8698,7 +8781,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }

             if (il == n_layer - 1) {
@@ -8709,6 +8792,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -8745,6 +8833,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }

+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -8764,6 +8857,12 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
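Taken together, the forward-pass changes above reduce to four scalar multipliers. The standalone toy program below (values are hypothetical, chosen only for illustration; it is not part of the commit) walks through where each factor enters, mirroring llm_build_inp_embd and build_llama:

#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical scale values in the style a Granite GGUF might carry.
    const float f_embedding_scale = 12.0f;
    const float f_residual_scale  = 0.22f;
    const float f_attention_scale = 0.015625f;
    const float f_logit_scale     = 8.0f;
    const int   n_embd_head       = 128;

    float h = 1.0f;                    // stand-in for one embedding value
    h *= f_embedding_scale;            // llm_build_inp_embd: ggml_scale on inpL

    // build_llama: kq_scale falls back to 1/sqrt(d_head) when the key is absent (0.0f)
    const float kq_scale = f_attention_scale == 0.0f
        ? 1.0f / sqrtf((float) n_embd_head)
        : f_attention_scale;

    float attn_out = h * kq_scale;     // stand-in for the attention block output
    h += f_residual_scale * attn_out;  // scaled residual after attention

    float ffn_out = h;                 // stand-in for the FFN block output
    h += f_residual_scale * ffn_out;   // scaled residual after the FFN

    float logit = h / f_logit_scale;   // lm_head output divided by the logit scale
    printf("toy logit: %f\n", logit);
    return 0;
}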
@@ -13942,6 +14041,8 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -17195,6 +17296,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -19191,6 +19294,19 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
+    } else if (tmpl == "granite" || tmpl == "granitemoe" || tmpl_contains("<|start_of_role|>") || tmpl_contains("Granite")) {
+        // IBM Granite template
+        for (const auto & message : chat) {
+            std::string role(message->role);
+            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+            if (role == "assistant_tool_call") {
+                ss << "<|tool_call|>";
+            }
+            ss << message->content << "<|end_of_text|>\n";
+        }
+        if (add_ass) {
+            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+        }
     } else {
         // template not supported
         return -1;
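For reference, the standalone snippet below mirrors the Granite template branch above and prints the prompt format it produces (the msg struct is only a stand-in for llama_chat_message so that the example compiles on its own):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role, content; };   // stand-in for llama_chat_message

int main() {
    const std::vector<msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "What is 1 + 1?" },
    };
    const bool add_ass = true;               // append the assistant generation prompt

    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|start_of_role|>" << m.role << "<|end_of_role|>";
        if (m.role == "assistant_tool_call") {
            ss << "<|tool_call|>";
        }
        ss << m.content << "<|end_of_text|>\n";
    }
    if (add_ass) {
        ss << "<|start_of_role|>assistant<|end_of_role|>\n";
    }
    std::cout << ss.str();
    // Output:
    // <|start_of_role|>system<|end_of_role|>You are a helpful assistant.<|end_of_text|>
    // <|start_of_role|>user<|end_of_role|>What is 1 + 1?<|end_of_text|>
    // <|start_of_role|>assistant<|end_of_role|>
    return 0;
}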
