Skip to content

Commit afd43a9

Browse files
committed
clean up
1 parent 56cf4ca commit afd43a9

File tree

1 file changed

+42
-72
lines changed

1 file changed

+42
-72
lines changed

tools/mtmd/clip.cpp

Lines changed: 42 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -526,57 +526,16 @@ struct clip_graph {
526526
cur);
527527

528528
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
529+
// pixel_shuffle
529530
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
530-
531531
const int scale_factor = model.hparams.proj_scale_factor;
532-
const int n_embd = cur->ne[0];
533-
const int seq = cur->ne[1];
534-
const int bsz = 1; // batch size, always 1 for now since we don't support batching
535-
const int height = std::sqrt(seq);
536-
const int width = std::sqrt(seq);
537-
GGML_ASSERT(scale_factor != 0);
538-
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
539-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
540-
cur = ggml_cont_4d(ctx0, cur,
541-
n_embd * scale_factor * scale_factor,
542-
height / scale_factor,
543-
width / scale_factor,
544-
bsz);
545-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
546-
cur = ggml_cont_3d(ctx0, cur,
547-
n_embd * scale_factor * scale_factor,
548-
seq / (scale_factor * scale_factor),
549-
bsz);
550-
532+
cur = build_pixel_shuffle(cur, scale_factor);
551533
cur = ggml_mul_mat(ctx0, model.projection, cur);
534+
552535
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
553536
// pixel unshuffle block
554537
const int scale_factor = model.hparams.proj_scale_factor;
555-
GGML_ASSERT(scale_factor > 1);
556-
557-
const int n_embd = cur->ne[0];
558-
int width = img.nx / patch_size;
559-
int height = img.ny / patch_size;
560-
561-
// pad width and height to factor
562-
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
563-
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
564-
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
565-
if (pad_width || pad_height) {
566-
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
567-
width += pad_width;
568-
height += pad_height;
569-
}
570-
571-
// unshuffle h
572-
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
573-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
574-
575-
// unshuffle w
576-
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
577-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
578-
579-
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
538+
cur = build_pixel_shuffle(cur, scale_factor);
580539

581540
// projection
582541
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
@@ -1142,34 +1101,9 @@ struct clip_graph {
11421101
cb(cur, "vit_out", -1);
11431102

11441103
{
1145-
// pixel unshuffle block
1104+
// patch_merger
11461105
const int scale_factor = model.hparams.proj_scale_factor;
1147-
GGML_ASSERT(scale_factor > 1);
1148-
1149-
const int n_embd = cur->ne[0];
1150-
int width = img.nx / patch_size;
1151-
int height = img.ny / patch_size;
1152-
1153-
// pad width and height to factor
1154-
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
1155-
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
1156-
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
1157-
if (pad_width || pad_height) {
1158-
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
1159-
width += pad_width;
1160-
height += pad_height;
1161-
}
1162-
1163-
// unshuffle h
1164-
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
1165-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
1166-
1167-
// unshuffle w
1168-
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
1169-
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
1170-
1171-
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
1172-
cb(cur, "pixel_unshuffle", -1);
1106+
cur = build_pixel_shuffle(cur, scale_factor);
11731107

11741108
// projection norm
11751109
int proj_inp_dim = cur->ne[0];
@@ -2107,6 +2041,39 @@ struct clip_graph {
21072041
return cur;
21082042
}
21092043

2044+
// Merge each scale_factor x scale_factor group of neighbouring patches into the embedding dim.
// aka pixel_unshuffle in Siglip2, aka patch_merger in Kimi
2045+
// supports dynamic resolution: the patch grid size is derived from img.nx / img.ny and
// patch_size (members of the enclosing clip_graph) and is padded when it is not divisible
// by scale_factor
//
// cur:          input tensor; it is reshaped to (ne[0], width, height), so it must hold
//               exactly that many elements
// scale_factor: merge factor per spatial axis, asserted to be > 1
// returns:      2D tensor with ne[0] = n_embd * scale_factor * scale_factor and
//               ne[1] = (width / scale_factor) * (height / scale_factor), after padding
2046+
ggml_tensor * build_pixel_shuffle(ggml_tensor * cur, int scale_factor) {
2047+
GGML_ASSERT(scale_factor > 1);
2048+
2049+
const int n_embd = cur->ne[0];
2050+
// patch grid dimensions of the current image
int width = img.nx / patch_size;
2051+
int height = img.ny / patch_size;
2052+
2053+
// pad width and height to factor
// NOTE(review): assumes CLIP_ALIGN(x, n) rounds x up to the next multiple of n - confirm
2054+
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
2055+
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
2056+
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
2057+
if (pad_width || pad_height) {
2058+
// pad only dim 1 (width) and dim 2 (height); dims 0 and 3 get no padding
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
2059+
// keep the local grid size in sync with the padded tensor
width += pad_width;
2060+
height += pad_height;
2061+
}
2062+
2063+
// unshuffle h
// fold scale_factor consecutive entries of dim 1 into dim 0, then swap dims 1 and 2
// so that the other spatial axis can be folded by the next reshape
2064+
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
2065+
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
2066+
2067+
// unshuffle w
// the permuted view is requested as a contiguous 3D tensor with the new shape, which
// folds scale_factor consecutive entries of the remaining spatial axis into dim 0;
// the second permute swaps dims 1 and 2 back
2068+
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
2069+
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
2070+
2071+
// flatten the two spatial dims into a single dim
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
2072+
// hand the result to the cb callback under the name "pixel_shuffle"
cb(cur, "pixel_shuffle", -1);
2073+
2074+
return cur;
2075+
}
2076+
21102077
};
21112078

21122079
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
@@ -2572,6 +2539,7 @@ struct clip_model_loader {
25722539
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
25732540
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
25742541
bool is_ffn_swapped = (
2542+
// only old models need this fix
25752543
model.proj_type == PROJECTOR_TYPE_MLP
25762544
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
25772545
|| model.proj_type == PROJECTOR_TYPE_LDP
@@ -2580,6 +2548,8 @@ struct clip_model_loader {
25802548
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
25812549
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
25822550
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
2551+
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
2552+
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
25832553
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
25842554
if (is_ffn_swapped) {
25852555
// swap up and down weights

0 commit comments

Comments
 (0)