@@ -526,57 +526,16 @@ struct clip_graph {
526526 cur);
527527
528528 } else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
529+ // pixel_shuffle
529530 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
530-
531531 const int scale_factor = model.hparams .proj_scale_factor ;
532- const int n_embd = cur->ne [0 ];
533- const int seq = cur->ne [1 ];
534- const int bsz = 1 ; // batch size, always 1 for now since we don't support batching
535- const int height = std::sqrt (seq);
536- const int width = std::sqrt (seq);
537- GGML_ASSERT (scale_factor != 0 );
538- cur = ggml_reshape_4d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
539- cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
540- cur = ggml_cont_4d (ctx0, cur,
541- n_embd * scale_factor * scale_factor,
542- height / scale_factor,
543- width / scale_factor,
544- bsz);
545- cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
546- cur = ggml_cont_3d (ctx0, cur,
547- n_embd * scale_factor * scale_factor,
548- seq / (scale_factor * scale_factor),
549- bsz);
550-
532+ cur = build_pixel_shuffle (cur, scale_factor);
551533 cur = ggml_mul_mat (ctx0, model.projection , cur);
534+
552535 } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
553536 // pixel unshuffle block
554537 const int scale_factor = model.hparams .proj_scale_factor ;
555- GGML_ASSERT (scale_factor > 1 );
556-
557- const int n_embd = cur->ne [0 ];
558- int width = img.nx / patch_size;
559- int height = img.ny / patch_size;
560-
561- // pad width and height to factor
562- const int64_t pad_width = CLIP_ALIGN (width, scale_factor) - width;
563- const int64_t pad_height = CLIP_ALIGN (height, scale_factor) - height;
564- cur = ggml_reshape_3d (ctx0, cur, n_embd, width, height);
565- if (pad_width || pad_height) {
566- cur = ggml_pad (ctx0, cur, 0 , pad_width, pad_height, 0 );
567- width += pad_width;
568- height += pad_height;
569- }
570-
571- // unshuffle h
572- cur = ggml_reshape_3d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
573- cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
574-
575- // unshuffle w
576- cur = ggml_cont_3d (ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
577- cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
578-
579- cur = ggml_cont_2d (ctx0, cur, cur->ne [0 ], cur->ne [1 ] * cur->ne [2 ]);
538+ cur = build_pixel_shuffle (cur, scale_factor);
580539
581540 // projection
582541 cur = ggml_norm (ctx0, cur, 1e-5 ); // default nn.LayerNorm
@@ -1142,34 +1101,9 @@ struct clip_graph {
11421101 cb (cur, " vit_out" , -1 );
11431102
11441103 {
1145- // pixel unshuffle block
1104+ // patch_merger
11461105 const int scale_factor = model.hparams .proj_scale_factor ;
1147- GGML_ASSERT (scale_factor > 1 );
1148-
1149- const int n_embd = cur->ne [0 ];
1150- int width = img.nx / patch_size;
1151- int height = img.ny / patch_size;
1152-
1153- // pad width and height to factor
1154- const int64_t pad_width = CLIP_ALIGN (width, scale_factor) - width;
1155- const int64_t pad_height = CLIP_ALIGN (height, scale_factor) - height;
1156- cur = ggml_reshape_3d (ctx0, cur, n_embd, width, height);
1157- if (pad_width || pad_height) {
1158- cur = ggml_pad (ctx0, cur, 0 , pad_width, pad_height, 0 );
1159- width += pad_width;
1160- height += pad_height;
1161- }
1162-
1163- // unshuffle h
1164- cur = ggml_reshape_3d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
1165- cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
1166-
1167- // unshuffle w
1168- cur = ggml_cont_3d (ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
1169- cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
1170-
1171- cur = ggml_cont_2d (ctx0, cur, cur->ne [0 ], cur->ne [1 ] * cur->ne [2 ]);
1172- cb (cur, " pixel_unshuffle" , -1 );
1106+ cur = build_pixel_shuffle (cur, scale_factor);
11731107
11741108 // projection norm
11751109 int proj_inp_dim = cur->ne [0 ];
@@ -2107,6 +2041,39 @@ struct clip_graph {
21072041 return cur;
21082042 }
21092043
2044+ // aka pixel_unshuffle in Siglip2, aka patch_merger in Kimi
2045+ // supports dynamic resolution: the patch grid size is derived from img.nx / img.ny and patch_size
//
// Rearranges the patch embeddings in `cur` so that the result has
// n_embd * scale_factor * scale_factor values per token and
// (width / scale_factor) * (height / scale_factor) tokens, where width and
// height are the patch-grid dimensions after padding.
//
// cur          - input tensor; its first dimension (ne[0]) is taken as n_embd
// scale_factor - merge factor applied along both grid axes; must be > 1
// returns      - the rearranged 2D tensor (also passed to cb() before returning)
//
// NOTE(review): the first reshape assumes `cur` holds exactly
// (img.nx / patch_size) * (img.ny / patch_size) tokens (e.g. no extra
// non-patch tokens) -- confirm this holds for every caller.
2046+ ggml_tensor * build_pixel_shuffle (ggml_tensor * cur, int scale_factor) {
2047+ GGML_ASSERT (scale_factor > 1 ); // only factors >= 2 are accepted
2048+
2049+ const int n_embd = cur->ne [0 ];
2050+ int width = img.nx / patch_size; // patch-grid width (integer division)
2051+ int height = img.ny / patch_size; // patch-grid height (integer division)
2052+
2053+ // pad width and height to factor (amount computed via CLIP_ALIGN; presumably rounds up to a multiple of scale_factor -- TODO confirm)
2054+ const int64_t pad_width = CLIP_ALIGN (width, scale_factor) - width;
2055+ const int64_t pad_height = CLIP_ALIGN (height, scale_factor) - height;
2056+ cur = ggml_reshape_3d (ctx0, cur, n_embd, width, height); // view tokens as an n_embd x width x height grid
2057+ if (pad_width || pad_height) {
2058+ cur = ggml_pad (ctx0, cur, 0 , pad_width, pad_height, 0 ); // pad only the width and height dimensions
2059+ width += pad_width; // keep the local sizes in sync with the padded tensor
2060+ height += pad_height;
2061+ }
2062+
2063+ // unshuffle h: fold scale_factor consecutive entries along width into the embedding dimension, then swap dims 1 and 2
2064+ cur = ggml_reshape_3d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
2065+ cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
2066+
2067+ // unshuffle w: same folding along the other grid axis, then swap dims 1 and 2 back
2068+ cur = ggml_cont_3d (ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
2069+ cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
2070+
2071+ cur = ggml_cont_2d (ctx0, cur, cur->ne [0 ], cur->ne [1 ] * cur->ne [2 ]); // flatten the two grid dimensions into a single token dimension
2072+ cb (cur, " pixel_shuffle" , -1 );
2073+
2074+ return cur;
2075+ }
2076+
21102077};
21112078
21122079static ggml_cgraph * clip_image_build_graph (clip_ctx * ctx, const clip_image_f32_batch & imgs) {
@@ -2572,6 +2539,7 @@ struct clip_model_loader {
25722539 // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
25732540 // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
25742541 bool is_ffn_swapped = (
2542+ // only old models need this fix
25752543 model.proj_type == PROJECTOR_TYPE_MLP
25762544 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
25772545 || model.proj_type == PROJECTOR_TYPE_LDP
@@ -2580,6 +2548,8 @@ struct clip_model_loader {
25802548 || model.proj_type == PROJECTOR_TYPE_QWEN25VL
25812549 || model.proj_type == PROJECTOR_TYPE_GLM_EDGE
25822550 || model.proj_type == PROJECTOR_TYPE_GEMMA3
2551+ || model.proj_type == PROJECTOR_TYPE_IDEFICS3
2552+ || model.proj_type == PROJECTOR_TYPE_MINICPMV
25832553 ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w ->ne [0 ] == hparams.n_embd ;
25842554 if (is_ffn_swapped) {
25852555 // swap up and down weights
0 commit comments