idefics3: scale_factor is an int (store the projector scale factor as uint32 instead of float32)

ngxson · ngxson · commit f60cc7e3cdd3 · 2025-04-21T20:24:48.000+02:00
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -1901,7 +1901,7 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
-        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2.0))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
         self.gguf_writer.add_vision_use_gelu(True)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -159,11 +159,11 @@ struct clip_hparams {
     int32_t projection_dim;
     int32_t n_head;
     int32_t n_layer;
+    int32_t proj_scale_factor = 0; // idefics3
 
     patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
 
     float eps;
-    float proj_scale_factor = 0.0; // idefics3
 
     std::vector<int32_t> image_grid_pinpoints;
     int32_t image_crop_resolution;
@@ -518,7 +518,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
         const int bsz    = 1; // batch size, always 1 for now since we don't support batching
         const int height = std::sqrt(seq);
         const int width  = std::sqrt(seq);
-        GGML_ASSERT(scale_factor != 0.0);
+        GGML_ASSERT(scale_factor != 0);
         cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
         cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
@@ -1277,7 +1277,7 @@ struct clip_model_loader {
         switch (ctx_clip.proj_type) {
             case PROJECTOR_TYPE_IDEFICS3:
                 {
-                    get_f32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             default:
                 break;
@@ -2386,7 +2386,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
     } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
         n_patches = 256;
     } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
-        n_patches /= (int)ctx->vision_model.hparams.proj_scale_factor;
+        n_patches /= ctx->vision_model.hparams.proj_scale_factor;
     }
 
     return n_patches;
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
@@ -975,8 +975,8 @@ def add_vision_use_gelu(self, value: bool) -> None:
     def add_vision_use_silu(self, value: bool) -> None:
         self.add_bool(Keys.ClipVision.USE_SILU, value)
 
-    def add_vision_projector_scale_factor(self, value: float) -> None:
-        self.add_float32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
+    def add_vision_projector_scale_factor(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
 
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''