@@ -550,7 +550,7 @@ struct clip_graph {
550550 const int batch_size = 1 ;
551551 GGML_ASSERT (n_patches_x == n_patches_y);
552552 const int patches_per_image = n_patches_x;
553- const int kernel_size = hparams.get_merge_kernel_size () ;
553+ const int kernel_size = hparams.proj_scale_factor ;
554554
555555 cur = ggml_transpose (ctx0, cur);
556556 cur = ggml_cont_4d (ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -578,7 +578,7 @@ struct clip_graph {
578578
579579 } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
580580 // pixel unshuffle block
581- const int scale_factor = model.hparams .proj_scale_factor ;
581+ const int scale_factor = model.hparams .get_merge_kernel_size () ;
582582 cur = build_patch_merge_permute (cur, scale_factor);
583583
584584 // projection
@@ -2715,9 +2715,12 @@ struct clip_model_loader {
27152715 } break ;
27162716 case PROJECTOR_TYPE_LFM2:
27172717 {
2718+ // correct non-standard proj_scale_factor value
2719+ int spatial_merge = 2 ;
2720+ get_u32 (KEY_PROJ_SCALE_FACTOR, spatial_merge, false );
2721+ hparams.proj_scale_factor = spatial_merge * spatial_merge;
27182722 // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
27192723 hparams.set_limit_image_tokens (64 , 256 );
2720- get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
27212724 } break ;
27222725 case PROJECTOR_TYPE_PIXTRAL:
27232726 case PROJECTOR_TYPE_LIGHTONOCR:
@@ -2765,7 +2768,10 @@ struct clip_model_loader {
27652768 case PROJECTOR_TYPE_LLAMA4:
27662769 {
27672770 hparams.rope_theta = 10000.0f ;
2768- get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor );
2771+ // correct non-standard proj_scale_factor value
2772+ int spatial_merge = 2 ;
2773+ get_u32 (KEY_PROJ_SCALE_FACTOR, spatial_merge, false );
2774+ hparams.proj_scale_factor = spatial_merge * spatial_merge;
27692775 set_llava_uhd_res_candidates (model, 3 );
27702776 } break ;
27712777 case PROJECTOR_TYPE_ULTRAVOX:
@@ -2785,6 +2791,14 @@ struct clip_model_loader {
27852791 break ;
27862792 }
27872793
2794+ // sanity check
2795+ {
2796+ if (hparams.proj_scale_factor ) {
2797+ const int n_merge = hparams.get_merge_kernel_size ();
2798+ GGML_ASSERT (n_merge * n_merge == hparams.proj_scale_factor );
2799+ }
2800+ }
2801+
27882802 LOG_INF (" %s: projector: %s\n " , __func__, proj_type.c_str ());
27892803 LOG_INF (" %s: n_embd: %d\n " , __func__, hparams.n_embd );
27902804 LOG_INF (" %s: n_head: %d\n " , __func__, hparams.n_head );
@@ -4359,7 +4373,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43594373 case PROJECTOR_TYPE_KIMIVL:
43604374 {
43614375 // dynamic size
4362- int scale_factor = params .get_merge_kernel_size ();
4376+ int scale_factor = ctx-> model . hparams .get_merge_kernel_size ();
43634377 int out_patch_size = params.patch_size * scale_factor;
43644378 int x_patch = CLIP_ALIGN (img->nx , out_patch_size) / out_patch_size;
43654379 int y_patch = CLIP_ALIGN (img->ny , out_patch_size) / out_patch_size;