@@ -216,7 +216,11 @@ struct clip_hparams {
// Derives warmup_image_size from a requested token count.
// n_tok_per_side is the truncated square root of n_tokens; the assert below
// rejects any n_tokens that is not an exact square (n*n).
// Result: warmup_image_size = n_tok_per_side * patch_size * sqrt(proj_scale_factor) (truncated to int).
216216 void set_warmup_n_tokens (int n_tokens) {
217217 int n_tok_per_side = static_cast <int >(std::sqrt (n_tokens));
218218 GGML_ASSERT (n_tok_per_side * n_tok_per_side == n_tokens && " n_tokens must be n*n" );
// The next two records are the removed (-) and added (+) forms of the same assignment.
// The added form calls get_scale_factor_per_side(), which evaluates the same
// static_cast<int>(std::sqrt(proj_scale_factor)) expression that was inlined before.
219- warmup_image_size = n_tok_per_side * patch_size * static_cast <int >(std::sqrt (proj_scale_factor));
219+ warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side ();
220+ }
221+
// Returns the square root of proj_scale_factor, truncated to int.
// NOTE(review): presumably proj_scale_factor holds the total merge factor over both
// image dimensions (e.g. 4 for 2x2), making this the factor along one side — confirm.
// The cast truncates toward zero, so a value that is not a perfect square is silently
// rounded down (e.g. 2 -> 1), and 0 yields 0.
222+ int get_scale_factor_per_side () const {
223+ return static_cast <int >(std::sqrt (proj_scale_factor));
220224 }
221225};
222226
@@ -546,7 +550,7 @@ struct clip_graph {
546550 const int batch_size = 1 ;
547551 GGML_ASSERT (n_patches_x == n_patches_y);
548552 const int patches_per_image = n_patches_x;
549- const int kernel_size = hparams.proj_scale_factor ;
553+ const int kernel_size = hparams.get_scale_factor_per_side () ;
550554
551555 cur = ggml_transpose (ctx0, cur);
552556 cur = ggml_cont_4d (ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -568,13 +572,13 @@ struct clip_graph {
568572 } else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
569573 // pixel_shuffle
570574 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
571- const int scale_factor = model.hparams .proj_scale_factor ;
575+ const int scale_factor = model.hparams .get_scale_factor_per_side () ;
572576 cur = build_patch_merge_permute (cur, scale_factor);
573577 cur = ggml_mul_mat (ctx0, model.projection , cur);
574578
575579 } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
576580 // pixel unshuffle block
577- const int scale_factor = model.hparams .proj_scale_factor ;
581+ const int scale_factor = model.hparams .get_scale_factor_per_side () ;
578582 cur = build_patch_merge_permute (cur, scale_factor);
579583
580584 // projection
@@ -598,7 +602,7 @@ struct clip_graph {
598602 }
599603
600604 ggml_cgraph * build_pixtral () {
601- const int n_merge = hparams.proj_scale_factor ;
605+ const int n_merge = hparams.get_scale_factor_per_side () ;
602606
603607 // 2D input positions
604608 ggml_tensor * pos_h = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_patches);
@@ -940,7 +944,8 @@ struct clip_graph {
940944
941945 // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
942946 ggml_tensor * deepstack_features = nullptr ;
943- const int merge_factor = hparams.proj_scale_factor > 0 ? hparams.proj_scale_factor * hparams.proj_scale_factor : 4 ; // default 2x2=4 for qwen3vl
947+ const int merge_factor = hparams.proj_scale_factor > 0
948+ ? hparams.proj_scale_factor : 4 ; // default 2x2=4 for qwen3vl
944949
945950 // loop over layers
946951 for (int il = 0 ; il < n_layer; il++) {
@@ -2366,16 +2371,16 @@ struct clip_graph {
23662371
23672372 // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
23682373 // support dynamic resolution
2369- ggml_tensor * build_patch_merge_permute (ggml_tensor * cur, int scale_factor ) {
2370- GGML_ASSERT (scale_factor > 1 );
2374+ ggml_tensor * build_patch_merge_permute (ggml_tensor * cur, int kernel_size ) {
2375+ GGML_ASSERT (kernel_size > 1 );
23712376
23722377 const int n_embd = cur->ne [0 ];
23732378 int width = img.nx / patch_size;
23742379 int height = img.ny / patch_size;
23752380
23762381 // pad width and height to factor
2377- const int64_t pad_width = CLIP_ALIGN (width, scale_factor ) - width;
2378- const int64_t pad_height = CLIP_ALIGN (height, scale_factor ) - height;
2382+ const int64_t pad_width = CLIP_ALIGN (width, kernel_size ) - width;
2383+ const int64_t pad_height = CLIP_ALIGN (height, kernel_size ) - height;
23792384 cur = ggml_reshape_3d (ctx0, cur, n_embd, width, height);
23802385 if (pad_width || pad_height) {
23812386 cur = ggml_pad (ctx0, cur, 0 , pad_width, pad_height, 0 );
@@ -2384,11 +2389,11 @@ struct clip_graph {
23842389 }
23852390
23862391 // unshuffle h
2387- cur = ggml_reshape_3d (ctx0, cur, n_embd * scale_factor , width / scale_factor , height);
2392+ cur = ggml_reshape_3d (ctx0, cur, n_embd * kernel_size , width / kernel_size , height);
23882393 cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
23892394
23902395 // unshuffle w
2391- cur = ggml_cont_3d (ctx0, cur, n_embd * scale_factor * scale_factor , height / scale_factor , width / scale_factor );
2396+ cur = ggml_cont_3d (ctx0, cur, n_embd * kernel_size * kernel_size , height / kernel_size , width / kernel_size );
23922397 cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
23932398
23942399 cur = ggml_cont_2d (ctx0, cur, cur->ne [0 ], cur->ne [1 ] * cur->ne [2 ]);
@@ -3203,9 +3208,11 @@ struct clip_model_loader {
32033208 if (ctx_clip.model .modality == CLIP_MODALITY_VISION) {
32043209 img->nx = hparams.warmup_image_size ;
32053210 img->ny = hparams.warmup_image_size ;
3211+ LOG_INF (" %s: warmup with image size = %d x %d\n " , __func__, img->nx , img->ny );
32063212 } else {
32073213 img->nx = hparams.warmup_audio_size ;
32083214 img->ny = hparams.n_mel_bins ;
3215+ LOG_INF (" %s: warmup with audio size = %d\n " , __func__, img->nx );
32093216 }
32103217 batch.entries .push_back (std::move (img));
32113218
@@ -4020,7 +4027,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40204027 clip_image_u8 canvas;
40214028 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio (
40224029 original_size,
4023- params.patch_size * static_cast < int >( std::sqrt ( params.proj_scale_factor ) ),
4030+ params.patch_size * params.get_scale_factor_per_side ( ),
40244031 params.image_min_pixels ,
40254032 params.image_max_pixels );
40264033 canvas.nx = canvas_size.width ;
@@ -4119,10 +4126,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41194126 case PROJECTOR_TYPE_PIXTRAL:
41204127 case PROJECTOR_TYPE_LIGHTONOCR:
41214128 {
4129+ GGML_ASSERT (params.image_min_pixels && params.image_max_pixels );
41224130 clip_image_u8 resized_image;
41234131 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41244132 original_size,
4125- params.patch_size * static_cast < int >( std::sqrt ( params.proj_scale_factor ) ),
4133+ params.patch_size * params.get_scale_factor_per_side ( ),
41264134 params.image_min_pixels ,
41274135 params.image_max_pixels );
41284136 img_tool::resize (*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4150,9 +4158,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41504158 case PROJECTOR_TYPE_LFM2:
41514159 case PROJECTOR_TYPE_KIMIVL:
41524160 {
4161+ GGML_ASSERT (params.image_min_pixels && params.image_max_pixels );
41534162 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41544163 original_size,
4155- params.patch_size * static_cast < int >( std::sqrt ( params.proj_scale_factor ) ),
4164+ params.patch_size * params.get_scale_factor_per_side ( ),
41564165 params.image_min_pixels ,
41574166 params.image_max_pixels );
41584167 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
@@ -4339,15 +4348,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43394348 case PROJECTOR_TYPE_INTERNVL:
43404349 case PROJECTOR_TYPE_LLAMA4:
43414350 {
4342- // both X and Y are downscaled by the scale factor
4343- int scale_factor = ctx->model .hparams .proj_scale_factor ;
4344- n_patches /= (scale_factor * scale_factor);
4351+ n_patches /= ctx->model .hparams .proj_scale_factor ;
43454352 } break ;
43464353 case PROJECTOR_TYPE_LFM2:
43474354 case PROJECTOR_TYPE_KIMIVL:
43484355 {
43494356 // dynamic size
4350- int scale_factor = ctx-> model . hparams . proj_scale_factor ;
4357+ int scale_factor = params. get_scale_factor_per_side () ;
43514358 int out_patch_size = params.patch_size * scale_factor;
43524359 int x_patch = CLIP_ALIGN (img->nx , out_patch_size) / out_patch_size;
43534360 int y_patch = CLIP_ALIGN (img->ny , out_patch_size) / out_patch_size;
@@ -4357,7 +4364,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43574364 case PROJECTOR_TYPE_LIGHTONOCR:
43584365 {
43594366 // dynamic size
4360- int n_merge = params.proj_scale_factor ;
4367+ int n_merge = params.get_scale_factor_per_side () ;
43614368 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1 );
43624369 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1 );
43634370 if (ctx->model .token_embd_img_break ) {
0 commit comments