@@ -621,7 +621,7 @@ struct clip_graph {
         }
 
         // arrangement of the [IMG_BREAK] token
-        {
+        if (model.token_embd_img_break) {
             // not efficient, but works
             // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
             // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
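The comment in the hunk above describes the row-wise [IMG_BREAK] insertion. A minimal sketch of that idea in ggml terms, assuming `cur` holds the patch embeddings as [n_embd, n_patches] and that `n_embd`, `n_patches_per_row` and `n_rows` are already known (illustrative names, not necessarily the exact variables in build_pixtral):

```cpp
// Sketch only: view the embeddings as 3D, append one [IMG_BREAK] embedding per row,
// then flatten back to 2D. The new guard skips all of this when the model ships
// no [IMG_BREAK] embedding (the LightOnOCR case).
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, cur->type, n_embd, 1, n_rows);
tok = ggml_scale(ctx0, tok, 0.0f);                      // zero-fill the placeholder
tok = ggml_add(ctx0, tok, model.token_embd_img_break);  // broadcast the break embedding into every row
ggml_tensor * cur3d = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_per_row, n_rows);
cur3d = ggml_concat(ctx0, cur3d, tok, 1);               // each row gains one trailing token
cur = ggml_reshape_2d(ctx0, cur3d, n_embd, (n_patches_per_row + 1) * n_rows);
```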
@@ -2095,6 +2095,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 res = graph.build_siglip();
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 res = graph.build_pixtral();
             } break;
@@ -2380,6 +2381,7 @@ struct clip_model_loader {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_PIXTRAL:
+            case PROJECTOR_TYPE_LIGHTONOCR:
                 {
                     hparams.rope_theta = 10000.0f;
                     hparams.warmup_image_size = hparams.patch_size * 8;
@@ -2722,6 +2724,15 @@ struct clip_model_loader {
                     model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
                     model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
                 } break;
+            case PROJECTOR_TYPE_LIGHTONOCR:
+                {
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                } break;
             case PROJECTOR_TYPE_ULTRAVOX:
                 {
                     model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
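For LightOnOCR the loader above picks up the same LLaVA-style projector tensors that the Pixtral case uses, with the two biases loaded as optional. A minimal sketch of the projection those tensors presumably feed, under the assumption that the graph simply reuses the Pixtral path and `cur` holds the already normalized and merged patch embeddings:

```cpp
// Sketch only; the real path is whatever build_pixtral() does with these tensors.
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
if (model.mm_1_b) {
    cur = ggml_add(ctx0, cur, model.mm_1_b);  // loaded with required = false, may be absent
}
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
if (model.mm_2_b) {
    cur = ggml_add(ctx0, cur, model.mm_2_b);
}
```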
@@ -3622,7 +3633,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
 
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
+            || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
+            ) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
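This branch resizes to an aspect-preserving size snapped to the patch grid rather than a fixed square. A rough standalone sketch of that kind of computation, for illustration only (the real image_manipulation::calc_size_preserved_ratio may scale and round differently):

```cpp
#include <algorithm>
#include <cmath>

struct img_size { int width, height; };

// Hypothetical helper: fit the longer side within max_dim (no upscaling) and
// round each side up to a whole number of patches.
static img_size preserved_ratio_sketch(img_size orig, int patch_size, int max_dim) {
    const float scale = std::min(1.0f, (float) max_dim / (float) std::max(orig.width, orig.height));
    auto align_up = [&](float v) {
        return std::max(patch_size, (int) std::ceil(v / patch_size) * patch_size);
    };
    return { align_up(orig.width * scale), align_up(orig.height * scale) };
}
```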
@@ -3865,12 +3878,17 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
                 int n_merge = params.spatial_merge_size;
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
-                n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+                if (ctx->model.token_embd_img_break) {
+                    n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+                } else {
+                    n_patches = n_patches_y * n_patches_x;
+                }
             } break;
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:
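A quick worked example of the token count above, using hypothetical numbers:

```cpp
// Hypothetical 1024x512 input, patch_size = 16, spatial_merge_size = 2.
int n_merge     = 2;
int n_patches_x = 1024 / 16 / n_merge;  // 32
int n_patches_y =  512 / 16 / n_merge;  // 16
int with_break    = n_patches_y * n_patches_x + n_patches_y - 1;  // 527: one [IMG_BREAK] per row except the last
int without_break = n_patches_y * n_patches_x;                    // 512: the LightOnOCR case (no [IMG_BREAK] embedding)
```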
@@ -4247,6 +4265,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // set the 2D positions
                 int n_patches_per_col = image_size_width / patch_size;
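For these projectors the positions fed to the 2D RoPE are per-patch row/column indices rather than a single flat index. A minimal sketch of how such a position buffer could be filled (assumed names `pos_h`/`pos_w`; the actual encode path may differ in detail):

```cpp
#include <vector>

// Sketch: derive each patch's row/column from its flat index, given
// n_patches_per_col = image_size_width / patch_size patches per row.
std::vector<int> pos_h(n_patches), pos_w(n_patches);
for (int i = 0; i < n_patches; ++i) {
    pos_h[i] = i / n_patches_per_col;  // row of the patch
    pos_w[i] = i % n_patches_per_col;  // column within that row
}
```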
@@ -4377,6 +4396,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_peg_0_b->ne[0];
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_MLP_NORM:
             return ctx->model.mm_3_b->ne[0];