Skip to content

Commit 4cf8a56

Browse files
Merge pull request #1 from nlasky2000-dot/fix-mistral3-attn-temp-scaling
fix: correct attention temperature scaling formula for Mistral3/Devstral
2 parents c00ff92 + 63bdc05 commit 4cf8a56

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

src/llama-graph.cpp

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -77,9 +77,11 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
7777
std::vector<float> attn_scale_data(n_tokens, 0.0f);
7878
for (int i = 0; i < n_tokens; ++i) {
7979
const float pos = ubatch->pos[i];
80-
attn_scale_data[i] = std::log(
81-
std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
82-
) * f_attn_temp_scale + 1.0;
80+
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/ministral3/modeling_ministral3.py#L101
81+
// scaling = 1 + beta * log(1 + floor(pos / max_position_embeddings))
82+
attn_scale_data[i] = 1.0f + f_attn_temp_scale * std::log(
83+
1.0f + std::floor(pos / n_attn_temp_floor_scale)
84+
);
8385
}
8486

8587
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));

0 commit comments

Comments (0)