@@ -265,6 +265,7 @@ struct clip_model {
265265
266266 // LLaVA projection
267267 ggml_tensor * mm_input_norm_w = nullptr ;
268+ ggml_tensor * mm_input_norm_b = nullptr ;
268269 ggml_tensor * mm_0_w = nullptr ;
269270 ggml_tensor * mm_0_b = nullptr ;
270271 ggml_tensor * mm_2_w = nullptr ;
@@ -542,6 +543,36 @@ struct clip_graph {
542543 bsz);
543544
544545 cur = ggml_mul_mat (ctx0, model.projection , cur);
546+ } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
547+ const int scale_factor = model.hparams .proj_scale_factor ;
548+ const int n_embd = cur->ne [0 ];
549+ const int seq = cur->ne [1 ];
550+ const int bsz = 1 ; // batch size, always 1 for now since we don't support batching
551+ const int height = std::sqrt (seq);
552+ const int width = std::sqrt (seq);
553+ GGML_ASSERT (scale_factor != 0 );
554+ cur = ggml_reshape_4d (ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
555+ cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
556+ cur = ggml_reshape_4d (ctx0, ggml_cont (ctx0, cur),
557+ n_embd * scale_factor * scale_factor,
558+ height / scale_factor,
559+ width / scale_factor,
560+ bsz);
561+ cur = ggml_permute (ctx0, cur, 0 , 2 , 1 , 3 );
562+ cur = ggml_reshape_3d (ctx0, ggml_cont (ctx0, cur),
563+ n_embd * scale_factor * scale_factor,
564+ seq / (scale_factor * scale_factor),
565+ bsz);
566+
567+ cur = ggml_norm (ctx0, cur, 1e-5 ); // default nn.LayerNorm
568+ cur = ggml_mul (ctx0, cur, model.mm_input_norm_w );
569+ cur = ggml_add (ctx0, cur, model.mm_input_norm_b );
570+
571+ cur = ggml_mul_mat (ctx0, model.mm_1_w , cur);
572+ cur = ggml_add (ctx0, cur, model.mm_1_b );
573+ cur = ggml_gelu (ctx0, cur);
574+ cur = ggml_mul_mat (ctx0, model.mm_2_w , cur);
575+ cur = ggml_add (ctx0, cur, model.mm_2_b );
545576 } else {
546577 GGML_ABORT (" SigLIP: Unsupported projector type" );
547578 }
@@ -1966,6 +1997,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
19661997 switch (ctx->proj_type ()) {
19671998 case PROJECTOR_TYPE_GEMMA3:
19681999 case PROJECTOR_TYPE_IDEFICS3:
2000+ case PROJECTOR_TYPE_LFM2:
19692001 {
19702002 res = graph.build_siglip ();
19712003 } break ;
@@ -2230,6 +2262,7 @@ struct clip_model_loader {
22302262 }
22312263 } break ;
22322264 case PROJECTOR_TYPE_IDEFICS3:
2265+ case PROJECTOR_TYPE_LFM2:
22332266 case PROJECTOR_TYPE_INTERNVL:
22342267 {
22352268 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
@@ -2533,6 +2566,15 @@ struct clip_model_loader {
25332566 {
25342567 model.projection = get_tensor (TN_MM_PROJECTOR);
25352568 } break ;
2569+ case PROJECTOR_TYPE_LFM2:
2570+ {
2571+ model.mm_input_norm_w = get_tensor (TN_MM_INP_NORM);
2572+ model.mm_input_norm_b = get_tensor (TN_MM_INP_NORM_B);
2573+ model.mm_1_w = get_tensor (string_format (TN_LLAVA_PROJ, 1 , " weight" ));
2574+ model.mm_1_b = get_tensor (string_format (TN_LLAVA_PROJ, 1 , " bias" ));
2575+ model.mm_2_w = get_tensor (string_format (TN_LLAVA_PROJ, 2 , " weight" ));
2576+ model.mm_2_b = get_tensor (string_format (TN_LLAVA_PROJ, 2 , " bias" ));
2577+ } break ;
25362578 case PROJECTOR_TYPE_PIXTRAL:
25372579 {
25382580 model.mm_1_w = get_tensor (string_format (TN_LLAVA_PROJ, 1 , " weight" ));
@@ -3591,6 +3633,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
35913633 n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
35923634 } break ;
35933635 case PROJECTOR_TYPE_IDEFICS3:
3636+ case PROJECTOR_TYPE_LFM2:
35943637 case PROJECTOR_TYPE_INTERNVL:
35953638 {
35963639 // both W and H are divided by proj_scale_factor
@@ -4034,6 +4077,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
40344077 case PROJECTOR_TYPE_INTERNVL:
40354078 case PROJECTOR_TYPE_QWEN2A:
40364079 case PROJECTOR_TYPE_ULTRAVOX:
4080+ case PROJECTOR_TYPE_LFM2:
40374081 case PROJECTOR_TYPE_VOXTRAL:
40384082 {
40394083 // do nothing
@@ -4135,6 +4179,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
41354179 return ctx->model .mm_model_proj ->ne [1 ];
41364180 case PROJECTOR_TYPE_QWEN2A:
41374181 return ctx->model .mm_fc_w ->ne [1 ];
4182+ case PROJECTOR_TYPE_LFM2:
4183+ return ctx->model .mm_2_w ->ne [1 ];
41384184 default :
41394185 GGML_ABORT (" Unknown projector type" );
41404186 }
0 commit comments