Skip to content

Commit 7bd1a01

Browse files
committed
fix
1 parent 68b1507 commit 7bd1a01

File tree

1 file changed

+27
-20
lines changed

1 file changed

+27
-20
lines changed

tools/mtmd/clip.cpp

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,11 @@ struct clip_hparams {
216216
void set_warmup_n_tokens(int n_tokens) {
217217
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
218218
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
219-
warmup_image_size = n_tok_per_side * patch_size * static_cast<int>(std::sqrt(proj_scale_factor));
219+
warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side();
220+
}
221+
222+
int get_scale_factor_per_side() const {
223+
return static_cast<int>(std::sqrt(proj_scale_factor));
220224
}
221225
};
222226

@@ -546,7 +550,7 @@ struct clip_graph {
546550
const int batch_size = 1;
547551
GGML_ASSERT(n_patches_x == n_patches_y);
548552
const int patches_per_image = n_patches_x;
549-
const int kernel_size = hparams.proj_scale_factor;
553+
const int kernel_size = hparams.get_scale_factor_per_side();
550554

551555
cur = ggml_transpose(ctx0, cur);
552556
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -568,13 +572,13 @@ struct clip_graph {
568572
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
569573
// pixel_shuffle
570574
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
571-
const int scale_factor = model.hparams.proj_scale_factor;
575+
const int scale_factor = model.hparams.get_scale_factor_per_side();
572576
cur = build_patch_merge_permute(cur, scale_factor);
573577
cur = ggml_mul_mat(ctx0, model.projection, cur);
574578

575579
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
576580
// pixel unshuffle block
577-
const int scale_factor = model.hparams.proj_scale_factor;
581+
const int scale_factor = model.hparams.get_scale_factor_per_side();
578582
cur = build_patch_merge_permute(cur, scale_factor);
579583

580584
// projection
@@ -598,7 +602,7 @@ struct clip_graph {
598602
}
599603

600604
ggml_cgraph * build_pixtral() {
601-
const int n_merge = hparams.proj_scale_factor;
605+
const int n_merge = hparams.get_scale_factor_per_side();
602606

603607
// 2D input positions
604608
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -940,7 +944,8 @@ struct clip_graph {
940944

941945
// deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
942946
ggml_tensor * deepstack_features = nullptr;
943-
const int merge_factor = hparams.proj_scale_factor > 0 ? hparams.proj_scale_factor * hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
947+
const int merge_factor = hparams.proj_scale_factor > 0
948+
? hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
944949

945950
// loop over layers
946951
for (int il = 0; il < n_layer; il++) {
@@ -2366,16 +2371,16 @@ struct clip_graph {
23662371

23672372
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
23682373
// support dynamic resolution
2369-
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
2370-
GGML_ASSERT(scale_factor > 1);
2374+
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int kernel_size) {
2375+
GGML_ASSERT(kernel_size > 1);
23712376

23722377
const int n_embd = cur->ne[0];
23732378
int width = img.nx / patch_size;
23742379
int height = img.ny / patch_size;
23752380

23762381
// pad width and height to factor
2377-
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
2378-
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
2382+
const int64_t pad_width = CLIP_ALIGN(width, kernel_size) - width;
2383+
const int64_t pad_height = CLIP_ALIGN(height, kernel_size) - height;
23792384
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
23802385
if (pad_width || pad_height) {
23812386
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
@@ -2384,11 +2389,11 @@ struct clip_graph {
23842389
}
23852390

23862391
// unshuffle h
2387-
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
2392+
cur = ggml_reshape_3d(ctx0, cur, n_embd * kernel_size, width / kernel_size, height);
23882393
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
23892394

23902395
// unshuffle w
2391-
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
2396+
cur = ggml_cont_3d(ctx0, cur, n_embd * kernel_size * kernel_size, height / kernel_size, width / kernel_size);
23922397
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
23932398

23942399
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
@@ -3203,9 +3208,11 @@ struct clip_model_loader {
32033208
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
32043209
img->nx = hparams.warmup_image_size;
32053210
img->ny = hparams.warmup_image_size;
3211+
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
32063212
} else {
32073213
img->nx = hparams.warmup_audio_size;
32083214
img->ny = hparams.n_mel_bins;
3215+
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
32093216
}
32103217
batch.entries.push_back(std::move(img));
32113218

@@ -4020,7 +4027,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40204027
clip_image_u8 canvas;
40214028
const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
40224029
original_size,
4023-
params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
4030+
params.patch_size * params.get_scale_factor_per_side(),
40244031
params.image_min_pixels,
40254032
params.image_max_pixels);
40264033
canvas.nx = canvas_size.width;
@@ -4119,10 +4126,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41194126
case PROJECTOR_TYPE_PIXTRAL:
41204127
case PROJECTOR_TYPE_LIGHTONOCR:
41214128
{
4129+
GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
41224130
clip_image_u8 resized_image;
41234131
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
41244132
original_size,
4125-
params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
4133+
params.patch_size * params.get_scale_factor_per_side(),
41264134
params.image_min_pixels,
41274135
params.image_max_pixels);
41284136
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4150,9 +4158,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41504158
case PROJECTOR_TYPE_LFM2:
41514159
case PROJECTOR_TYPE_KIMIVL:
41524160
{
4161+
GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
41534162
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
41544163
original_size,
4155-
params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
4164+
params.patch_size * params.get_scale_factor_per_side(),
41564165
params.image_min_pixels,
41574166
params.image_max_pixels);
41584167
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4339,15 +4348,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43394348
case PROJECTOR_TYPE_INTERNVL:
43404349
case PROJECTOR_TYPE_LLAMA4:
43414350
{
4342-
// both X and Y are downscaled by the scale factor
4343-
int scale_factor = ctx->model.hparams.proj_scale_factor;
4344-
n_patches /= (scale_factor * scale_factor);
4351+
n_patches /= ctx->model.hparams.proj_scale_factor;
43454352
} break;
43464353
case PROJECTOR_TYPE_LFM2:
43474354
case PROJECTOR_TYPE_KIMIVL:
43484355
{
43494356
// dynamic size
4350-
int scale_factor = ctx->model.hparams.proj_scale_factor;
4357+
int scale_factor = params.get_scale_factor_per_side();
43514358
int out_patch_size = params.patch_size * scale_factor;
43524359
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
43534360
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
@@ -4357,7 +4364,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43574364
case PROJECTOR_TYPE_LIGHTONOCR:
43584365
{
43594366
// dynamic size
4360-
int n_merge = params.proj_scale_factor;
4367+
int n_merge = params.get_scale_factor_per_side();
43614368
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
43624369
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
43634370
if (ctx->model.token_embd_img_break) {

0 commit comments

Comments (0)