@@ -2038,6 +2038,16 @@ struct clip_model_loader {
20382038 {
20392039 hparams.rope_theta = 10000 .0f ;
20402040 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor );
2041+
2042+ // borrowed from llava-1.6: flat list of size pairs, each entry a multiple of psize. NOTE(review): the per-line values below (336, 672, 1008) only hold if psize == 336 -- confirm hparams.patch_size (rather than the image size) is the intended base
2043+ const int psize = hparams.patch_size ;
2044+ hparams.image_grid_pinpoints = {
2045+ psize, psize*2 , // 336, 672
2046+ psize*2 , psize, // 672, 336
2047+ psize*2 , psize*2 , // 672, 672
2048+ psize*3 , psize, // 1008, 336
2049+ psize, psize*3 , // 336, 1008
2050+ };
20412051 } break ;
20422052 default :
20432053 break ;
@@ -3091,15 +3101,29 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30913101 normalize_image_u8_to_f32 (resized_image, *img_f32, ctx->image_mean , ctx->image_std );
30923102 res_imgs->entries .push_back (std::move (img_f32));
30933103 return true ;
3094- }
3095- else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3104+
3105+ } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
30963106 clip_image_u8 resized_image;
30973107 auto new_size = image_manipulation::calc_size_preserved_ratio (original_size, params.patch_size , params.image_size );
30983108 image_manipulation::bilinear_resize (*img, resized_image, new_size.width , new_size.height );
30993109 clip_image_f32_ptr img_f32 (clip_image_f32_init ());
31003110 normalize_image_u8_to_f32 (resized_image, *img_f32, ctx->image_mean , ctx->image_std );
31013111 res_imgs->entries .push_back (std::move (img_f32));
31023112 return true ;
3113+
3114+ } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3115+ GGML_ASSERT (!params.image_grid_pinpoints .empty ());
3116+ auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
3117+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
3118+
3119+ for (size_t i = 0 ; i < imgs.size (); ++i) {
3120+ clip_image_f32_ptr res (clip_image_f32_init ());
3121+ normalize_image_u8_to_f32 (*imgs[i], *res, ctx->image_mean , ctx->image_std );
3122+ res_imgs->entries .push_back (std::move (res));
3123+ }
3124+
3125+ return true ;
3126+
31033127 }
31043128
31053129 // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
0 commit comments