@@ -265,6 +265,7 @@ struct clip_model {
265
265
266
266
// LLaVA projection
267
267
ggml_tensor * mm_input_norm_w = nullptr ;
268
+ ggml_tensor * mm_input_norm_b = nullptr ;
268
269
ggml_tensor * mm_0_w = nullptr ;
269
270
ggml_tensor * mm_0_b = nullptr ;
270
271
ggml_tensor * mm_2_w = nullptr ;
@@ -488,11 +489,17 @@ struct clip_graph {
488
489
489
490
ggml_cgraph * build_siglip () {
490
491
ggml_tensor * inp = build_inp ();
492
+
493
+ ggml_tensor * learned_pos_embd = model.position_embeddings ;
494
+ if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
495
+ learned_pos_embd = resize_position_embeddings ();
496
+ }
497
+
491
498
ggml_tensor * cur = build_vit (
492
499
inp, n_patches,
493
500
NORM_TYPE_NORMAL,
494
501
hparams.ffn_op ,
495
- model. position_embeddings ,
502
+ learned_pos_embd ,
496
503
nullptr );
497
504
498
505
if (ctx->proj_type () == PROJECTOR_TYPE_GEMMA3) {
@@ -542,6 +549,45 @@ struct clip_graph {
542
549
bsz);
543
550
544
551
cur = ggml_mul_mat (ctx0, model.projection , cur);
552
+ } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
553
+ // pixel unshuffle block
554
+ const int scale_factor = model.hparams .proj_scale_factor ;
555
+ GGML_ASSERT (scale_factor > 1 );
556
+
557
+ const int n_embd = cur->ne [0 ];
558
+ int width = img.nx / patch_size;
559
+ int height = img.ny / patch_size;
560
+
561
+ // pad width and height to factor
562
+ const int64_t pad_width = CLIP_ALIGN (width, scale_factor) - width;
563
+ const int64_t pad_height = CLIP_ALIGN (height, scale_factor) - height;
564
+ cur = ggml_reshape_3d (ctx0, cur, n_embd, width, height);
565
+ if (pad_width || pad_height) {
566
+ cur = ggml_pad (ctx0, cur, 0 , pad_width, pad_height, 0 );
567
+ width += pad_width;
568
+ height += pad_height;
569
+ }
570
+
571
+ // unshuffle h
572
+ cur = ggml_reshape_3d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
573
+ cur = ggml_cont (ctx0, ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 ));
574
+
575
+ // unshuffle w
576
+ cur = ggml_reshape_3d (ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
577
+ cur = ggml_cont (ctx0, ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 ));
578
+
579
+ cur = ggml_reshape_2d (ctx0, cur, cur->ne [0 ], cur->ne [1 ] * cur->ne [2 ]);
580
+
581
+ // projection
582
+ cur = ggml_norm (ctx0, cur, 1e-5 ); // default nn.LayerNorm
583
+ cur = ggml_mul (ctx0, cur, model.mm_input_norm_w );
584
+ cur = ggml_add (ctx0, cur, model.mm_input_norm_b );
585
+
586
+ cur = ggml_mul_mat (ctx0, model.mm_1_w , cur);
587
+ cur = ggml_add (ctx0, cur, model.mm_1_b );
588
+ cur = ggml_gelu (ctx0, cur);
589
+ cur = ggml_mul_mat (ctx0, model.mm_2_w , cur);
590
+ cur = ggml_add (ctx0, cur, model.mm_2_b );
545
591
} else {
546
592
GGML_ABORT (" SigLIP: Unsupported projector type" );
547
593
}
@@ -1560,6 +1606,27 @@ struct clip_graph {
1560
1606
}
1561
1607
}
1562
1608
1609
+ // siglip2 naflex
1610
+ ggml_tensor * resize_position_embeddings () {
1611
+ ggml_tensor * pos_embd = model.position_embeddings ;
1612
+ const int height = img.ny / patch_size;
1613
+ const int width = img.nx / patch_size;
1614
+
1615
+ if (!pos_embd || height * width == pos_embd->ne [1 ]) {
1616
+ return pos_embd;
1617
+ }
1618
+
1619
+ const int n_pos_embd = std::sqrt (pos_embd->ne [1 ]);
1620
+ pos_embd = ggml_reshape_3d (ctx0, pos_embd, n_embd, n_pos_embd, n_pos_embd); // -> (n_embd, n_pos_embd, n_pos_embd)
1621
+ pos_embd = ggml_permute (ctx0, pos_embd, 2 , 0 , 1 , 3 ); // -> (n_pos_embd, n_pos_embd, n_embd)
1622
+ pos_embd = ggml_interpolate (ctx0, pos_embd, width, height, n_embd, 1 , 1 ); // -> (width, height, n_embd)
1623
+ pos_embd = ggml_reshape_2d (ctx0, pos_embd, height * width, n_embd); // -> (height * width, n_embd)
1624
+ pos_embd = ggml_transpose (ctx0, pos_embd); // -> (n_embd, height * width)
1625
+ pos_embd = ggml_cont (ctx0, pos_embd);
1626
+
1627
+ return pos_embd;
1628
+ }
1629
+
1563
1630
// build vision transformer (ViT) cgraph
1564
1631
// this function should cover most of the models
1565
1632
// if your model has specific features, you should probably duplicate this function
@@ -1966,6 +2033,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
1966
2033
switch (ctx->proj_type ()) {
1967
2034
case PROJECTOR_TYPE_GEMMA3:
1968
2035
case PROJECTOR_TYPE_IDEFICS3:
2036
+ case PROJECTOR_TYPE_LFM2:
1969
2037
{
1970
2038
res = graph.build_siglip ();
1971
2039
} break ;
@@ -2230,6 +2298,7 @@ struct clip_model_loader {
2230
2298
}
2231
2299
} break ;
2232
2300
case PROJECTOR_TYPE_IDEFICS3:
2301
+ case PROJECTOR_TYPE_LFM2:
2233
2302
case PROJECTOR_TYPE_INTERNVL:
2234
2303
{
2235
2304
get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
@@ -2533,6 +2602,15 @@ struct clip_model_loader {
2533
2602
{
2534
2603
model.projection = get_tensor (TN_MM_PROJECTOR);
2535
2604
} break ;
2605
+ case PROJECTOR_TYPE_LFM2:
2606
+ {
2607
+ model.mm_input_norm_w = get_tensor (TN_MM_INP_NORM);
2608
+ model.mm_input_norm_b = get_tensor (TN_MM_INP_NORM_B);
2609
+ model.mm_1_w = get_tensor (string_format (TN_LLAVA_PROJ, 1 , " weight" ));
2610
+ model.mm_1_b = get_tensor (string_format (TN_LLAVA_PROJ, 1 , " bias" ));
2611
+ model.mm_2_w = get_tensor (string_format (TN_LLAVA_PROJ, 2 , " weight" ));
2612
+ model.mm_2_b = get_tensor (string_format (TN_LLAVA_PROJ, 2 , " bias" ));
2613
+ } break ;
2536
2614
case PROJECTOR_TYPE_PIXTRAL:
2537
2615
{
2538
2616
model.mm_1_w = get_tensor (string_format (TN_LLAVA_PROJ, 1 , " weight" ));
@@ -3428,6 +3506,43 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3428
3506
res_imgs->grid_y = inst.grid_size .height ;
3429
3507
return true ;
3430
3508
3509
+ } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
3510
+ GGML_ASSERT (params.proj_scale_factor );
3511
+
3512
+ // smart resize
3513
+ const int width = img->nx ;
3514
+ const int height = img->ny ;
3515
+ const int total_factor = params.patch_size * params.proj_scale_factor ;
3516
+ constexpr int min_image_tokens = 64 ;
3517
+ constexpr int max_image_tokens = 256 ;
3518
+ const float min_pixels = min_image_tokens * total_factor * total_factor;
3519
+ const float max_pixels = max_image_tokens * total_factor * total_factor;
3520
+
3521
+ auto round_by_factor = [f = total_factor](float x) { return static_cast <int >(std::nearbyintf (x / static_cast <float >(f))) * f; };
3522
+ auto ceil_by_factor = [f = total_factor](float x) { return static_cast <int >(std::ceil (x / static_cast <float >(f))) * f; };
3523
+ auto floor_by_factor = [f = total_factor](float x) { return static_cast <int >(std::floor (x / static_cast <float >(f))) * f; };
3524
+
3525
+ int h_bar = std::max (total_factor, round_by_factor (height));
3526
+ int w_bar = std::max (total_factor, round_by_factor (width));
3527
+
3528
+ if (h_bar * w_bar > max_pixels) {
3529
+ const auto beta = std::sqrt ((height * width) / max_pixels);
3530
+ h_bar = std::max (total_factor, floor_by_factor (height / beta));
3531
+ w_bar = std::max (total_factor, floor_by_factor (width / beta));
3532
+ } else if (h_bar * w_bar < min_pixels) {
3533
+ const auto beta = std::sqrt (min_pixels / (height * width));
3534
+ h_bar = ceil_by_factor (height * beta);
3535
+ w_bar = ceil_by_factor (width * beta);
3536
+ }
3537
+
3538
+ const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
3539
+
3540
+ clip_image_u8 resized_img;
3541
+ image_manipulation::resize_and_pad_image (*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
3542
+ clip_image_f32_ptr res (clip_image_f32_init ());
3543
+ normalize_image_u8_to_f32 (resized_img, *res, params.image_mean , params.image_std );
3544
+ res_imgs->entries .push_back (std::move (res));
3545
+ return true ;
3431
3546
}
3432
3547
3433
3548
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3630,6 +3745,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
3630
3745
n_patches_sq /= 2 ;
3631
3746
}
3632
3747
} break ;
3748
+ case PROJECTOR_TYPE_LFM2:
3749
+ {
3750
+ n_patches_sq = (img->nx / (params.patch_size * params.proj_scale_factor )) * (img->ny / (params.patch_size * params.proj_scale_factor ));
3751
+ } break ;
3633
3752
default :
3634
3753
GGML_ABORT (" unsupported projector type" );
3635
3754
}
@@ -4034,6 +4153,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
4034
4153
case PROJECTOR_TYPE_INTERNVL:
4035
4154
case PROJECTOR_TYPE_QWEN2A:
4036
4155
case PROJECTOR_TYPE_ULTRAVOX:
4156
+ case PROJECTOR_TYPE_LFM2:
4037
4157
case PROJECTOR_TYPE_VOXTRAL:
4038
4158
{
4039
4159
// do nothing
@@ -4135,6 +4255,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
4135
4255
return ctx->model .mm_model_proj ->ne [1 ];
4136
4256
case PROJECTOR_TYPE_QWEN2A:
4137
4257
return ctx->model .mm_fc_w ->ne [1 ];
4258
+ case PROJECTOR_TYPE_LFM2:
4259
+ return ctx->model .mm_2_w ->ne [1 ];
4138
4260
default :
4139
4261
GGML_ABORT (" Unknown projector type" );
4140
4262
}
0 commit comments