Skip to content

Commit 2c0d960

Browse files
committed
fix llama4 and lfm2
1 parent bfd03fb commit 2c0d960

File tree

1 file changed

+19
-5
lines changed

1 file changed

+19
-5
lines changed

tools/mtmd/clip.cpp

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -550,7 +550,7 @@ struct clip_graph {
550550
const int batch_size = 1;
551551
GGML_ASSERT(n_patches_x == n_patches_y);
552552
const int patches_per_image = n_patches_x;
553-
const int kernel_size = hparams.get_merge_kernel_size();
553+
const int kernel_size = hparams.proj_scale_factor;
554554

555555
cur = ggml_transpose(ctx0, cur);
556556
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -578,7 +578,7 @@ struct clip_graph {
578578

579579
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
580580
// pixel unshuffle block
581-
const int scale_factor = model.hparams.proj_scale_factor;
581+
const int scale_factor = model.hparams.get_merge_kernel_size();
582582
cur = build_patch_merge_permute(cur, scale_factor);
583583

584584
// projection
@@ -2715,9 +2715,12 @@ struct clip_model_loader {
27152715
} break;
27162716
case PROJECTOR_TYPE_LFM2:
27172717
{
2718+
// correct non-standard proj_scale_factor value
2719+
int spatial_merge = 2;
2720+
get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
2721+
hparams.proj_scale_factor = spatial_merge * spatial_merge;
27182722
// ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
27192723
hparams.set_limit_image_tokens(64, 256);
2720-
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
27212724
} break;
27222725
case PROJECTOR_TYPE_PIXTRAL:
27232726
case PROJECTOR_TYPE_LIGHTONOCR:
@@ -2765,7 +2768,10 @@ struct clip_model_loader {
27652768
case PROJECTOR_TYPE_LLAMA4:
27662769
{
27672770
hparams.rope_theta = 10000.0f;
2768-
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
2771+
// correct non-standard proj_scale_factor value
2772+
int spatial_merge = 2;
2773+
get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
2774+
hparams.proj_scale_factor = spatial_merge * spatial_merge;
27692775
set_llava_uhd_res_candidates(model, 3);
27702776
} break;
27712777
case PROJECTOR_TYPE_ULTRAVOX:
@@ -2785,6 +2791,14 @@ struct clip_model_loader {
27852791
break;
27862792
}
27872793

2794+
// sanity check
2795+
{
2796+
if (hparams.proj_scale_factor) {
2797+
const int n_merge = hparams.get_merge_kernel_size();
2798+
GGML_ASSERT(n_merge * n_merge == hparams.proj_scale_factor);
2799+
}
2800+
}
2801+
27882802
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
27892803
LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
27902804
LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -4359,7 +4373,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43594373
case PROJECTOR_TYPE_KIMIVL:
43604374
{
43614375
// dynamic size
4362-
int scale_factor = params.get_merge_kernel_size();
4376+
int scale_factor = ctx->model.hparams.get_merge_kernel_size();
43634377
int out_patch_size = params.patch_size * scale_factor;
43644378
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
43654379
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;

0 commit comments

Comments (0)