@@ -172,6 +172,7 @@ struct clip_hparams {
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
+    int32_t spatial_merge_size = 0;
 };
 
 struct clip_layer {
@@ -232,6 +233,7 @@ struct clip_vision_model {
     struct ggml_tensor * projection;
 
     // LLaVA projection
+    struct ggml_tensor * mm_input_norm_w = nullptr;
     struct ggml_tensor * mm_0_w = nullptr;
     struct ggml_tensor * mm_0_b = nullptr;
     struct ggml_tensor * mm_2_w = nullptr;
@@ -311,6 +313,7 @@ struct clip_vision_model {
 
     // pixtral
     struct ggml_tensor * token_embd_img_break = nullptr;
+    struct ggml_tensor * mm_patch_merger_w = nullptr;
 };
 
 struct clip_ctx {
@@ -721,7 +724,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
         {
             ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
             ggml_tensor * up_proj   = ggml_mul_mat(ctx0, model.layers[il].ff_up_w,   cur);
-            gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
+            if (ctx->use_silu) {
+                gate_proj = ggml_silu(ctx0, gate_proj);
+            } else if (ctx->use_gelu) {
+                gate_proj = ggml_gelu(ctx0, gate_proj);
+            } else {
+                GGML_ABORT("Pixtral: Unsupported activation");
+            }
             cur = ggml_mul(ctx0, up_proj, gate_proj);
             cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
         }
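Note on the hunk above: this feed-forward block computes the gated FFN ff_down_w * (act(ff_gate_w * x) * (ff_up_w * x)). The change only makes the activation act follow the loader's ctx->use_silu / ctx->use_gelu flags instead of hard-coding SiLU, and aborts if neither flag is set.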
@@ -732,7 +741,18 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
         embeddings = cur;
     }
 
-    // LlavaMultiModalProjector (with GELU activation)
+    // mistral small 3.1 patch merger
+    // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
+    if (model.mm_patch_merger_w) {
+        GGML_ASSERT(hparams.spatial_merge_size > 0);
+
+        embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.mm_input_norm_w);
+
+        // reshape image tokens to 2D grid
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, n_patches_x, n_patches_y);
+        embeddings = ggml_permute(ctx0, embeddings, 1, 2, 0, 3); // [x, y, hidden_size]
+    }
+
+    // LlavaMultiModalProjector (always using GELU activation)
     {
         embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
@@ -1734,6 +1754,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_PIXTRAL:
                 {
                     hparams.rope_theta = 10000.0f;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
                 } break;
             case PROJECTOR_TYPE_QWEN25VL:
                 {
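Note: the trailing false makes the key optional, so existing Pixtral GGUFs without it keep the default of 0 and skip the merger path. When the key is present (reportedly 2 for Mistral Small 3.1), a merge size of k shrinks the image token count by a factor of k², e.g. a 32 x 32 patch grid becomes 16 x 16 = 256 tokens.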
@@ -1962,6 +1983,9 @@ struct clip_model_loader {
                     vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                     // [IMG_BREAK] token embedding
                     vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
+                    // for mistral small 3.1
+                    vision_model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM, false);
+                    vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
                 } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");