@@ -159,11 +159,11 @@ struct clip_hparams {
159159 int32_t projection_dim;
160160 int32_t n_head;
161161 int32_t n_layer;
162+ int32_t proj_scale_factor = 0 ; // idefics3
162163
163164 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
164165
165166 float eps;
166- float proj_scale_factor = 0.0 ; // idefics3
167167
168168 std::vector<int32_t > image_grid_pinpoints;
169169 int32_t image_crop_resolution;
@@ -518,7 +518,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
518518 const int bsz = 1 ; // batch size, always 1 for now since we don't support batching
519519 const int height = std::sqrt (seq);
520520 const int width = std::sqrt (seq);
521- GGML_ASSERT (scale_factor != 0.0 );
521+ GGML_ASSERT (scale_factor != 0 );
522522 cur = ggml_reshape_4d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
523523 cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
524524 cur = ggml_reshape_4d (ctx0, ggml_cont (ctx0, cur),
@@ -1277,7 +1277,7 @@ struct clip_model_loader {
12771277 switch (ctx_clip.proj_type ) {
12781278 case PROJECTOR_TYPE_IDEFICS3:
12791279 {
1280- get_f32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
1280+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
12811281 } break ;
12821282 default :
12831283 break ;
@@ -2386,7 +2386,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
23862386 } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
23872387 n_patches = 256 ;
23882388 } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2389- n_patches /= ( int ) ctx->vision_model .hparams .proj_scale_factor ;
2389+ n_patches /= ctx->vision_model .hparams .proj_scale_factor ;
23902390 }
23912391
23922392 return n_patches;
0 commit comments