
Commit 06e77d5

Merge branch 'ggml-org:master' into mradermacher
2 parents: 243d354 + 4d37262

2 files changed: 22 additions & 0 deletions


src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
@@ -1628,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
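
Both keys are read with a trailing false, i.e. they are optional: a GGUF file that does not carry them leaves the hparams at their defaults, and the graph code in the deepseek2.cpp diff below only enables the feature when f_attn_temp_scale is non-zero. A minimal sketch of that gating; the struct, function name, and default values here are assumptions for illustration, not taken from the llama.cpp headers:

#include <cstdint>

// Sketch only: the two optional hyperparameters touched by this commit and the
// enable check mirrored from deepseek2.cpp. Defaults are assumed for
// illustration; a zero f_attn_temp_scale means the scaling path is skipped.
struct attn_temp_hparams {
    float    f_attn_temp_scale       = 0.0f; // assumed default: feature off
    uint32_t n_attn_temp_floor_scale = 8192; // assumed placeholder value
};

static bool attn_temp_enabled(const attn_temp_hparams & hp) {
    return hp.f_attn_temp_scale != 0.0f;     // same check as in the diff below
}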

src/models/deepseek2.cpp

Lines changed: 18 additions & 0 deletions
@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
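
The added blocks multiply the query tensor by a per-position scale coming from build_inp_attn_scale(), the graph input llama.cpp already uses for Llama 4 style attention temperature tuning. As a hedged sketch, the per-position value presumably follows the Llama 4 formula; the helper below is illustrative only and is not part of this commit:

#include <cmath>
#include <cstdint>

// Illustrative only: Llama-4-style attention temperature scale for a single
// token position. build_inp_attn_scale() is assumed to fill a tensor with
// this value per position, and ggml_mul(ctx0, Qcur, inp_attn_scale) then
// scales the query before build_attn().
static float attn_temp_scale(int pos, float f_attn_temp_scale, uint32_t n_attn_temp_floor_scale) {
    return 1.0f + f_attn_temp_scale *
        std::log(std::floor((pos + 1.0f) / (float) n_attn_temp_floor_scale) + 1.0f);
}

With f_attn_temp_scale equal to zero the scale is identically 1, which is why the graph only builds the input (and applies the ggml_mul) when the hparam is non-zero.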
