@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
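For context, `build_inp_attn_scale()` supplies a per-position multiplier that the later hunks fold into the query. A minimal standalone sketch of the Llama 4-style temperature schedule this appears to implement, assuming the usual `1 + scale * log(floor((pos + 1) / floor_scale) + 1)` form (the parameter names below are illustrative, not the actual hparams fields):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical sketch of a Llama 4-style attention temperature schedule:
// each token position gets a scale that grows logarithmically with position.
// Parameter names are illustrative, not the actual llama.cpp hparams fields.
std::vector<float> make_attn_scales(const std::vector<int> & positions,
                                    float attn_temp_scale,  // e.g. hparams.f_attn_temp_scale
                                    float floor_scale) {    // granularity of the position buckets
    std::vector<float> scales(positions.size());
    for (size_t i = 0; i < positions.size(); ++i) {
        const float pos = (float) positions[i];
        scales[i] = std::log(std::floor((pos + 1.0f) / floor_scale) + 1.0f) * attn_temp_scale + 1.0f;
    }
    return scales;
}

int main() {
    const std::vector<int> positions = {0, 1023, 8191, 65535};
    for (float s : make_attn_scales(positions, 0.1f, 8192.0f)) {
        std::printf("%.4f\n", s); // stays 1.0 for early positions, grows for late ones
    }
    return 0;
}
```

With a floor scale of 8192, the multiplier stays at 1.0 for the first 8191 positions and grows logarithmically afterwards, the idea being to keep attention sharp at long context.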
@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
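`ggml_mul` broadcasts its second operand across dimensions of size 1, so a scale tensor with one value per token multiplies every component of that token's query vector. A plain-C++ sketch of that broadcast semantics (shapes and names illustrative, not ggml's internals):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Sketch of the broadcast multiply above: every element of a token's query
// vector is scaled by that token's single attention-temperature factor.
static void scale_queries(std::vector<float> & q,            // n_tokens rows of q_dim values
                          const std::vector<float> & scales, // one scale per token
                          size_t q_dim) {
    for (size_t t = 0; t < scales.size(); ++t) {
        for (size_t d = 0; d < q_dim; ++d) {
            q[t * q_dim + d] *= scales[t];
        }
    }
}

int main() {
    std::vector<float> q(2 * 4, 1.0f);      // 2 tokens, q_dim = 4
    const std::vector<float> scales = {1.0f, 1.2f};
    scale_queries(q, scales, 4);
    std::printf("%.1f %.1f\n", q[0], q[4]); // 1.0 (token 0), 1.2 (token 1)
    return 0;
}
```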
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
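Note that the scaling block is duplicated verbatim in both branches, so the temperature tuning is applied to `Qcur` whether the MLA absorption path (MQA-style) or the plain path (MHA-style) is taken.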