@@ -229,21 +229,15 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
229229 return patch;
230230}
231231
232- static bool encode_image_with_clip (clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
233- // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
234- clip_image_f32_batch_ptr img_res_v (clip_image_f32_batch_init ());
235- if (!clip_image_preprocess (ctx_clip, img, img_res_v.get ())) {
236- LOG_ERR (" %s: unable to preprocess image\n " , __func__);
237- return false ;
238- }
232+ static bool encode_image_with_clip (clip_ctx * ctx_clip, int n_threads, struct clip_image_f32_batch * preprocessed_img, float * image_embd, int * n_img_pos) {
239233
240234 const int64_t t_img_enc_start_us = ggml_time_us ();
241235
242236 const char * mm_patch_merge_type = clip_patch_merge_type (ctx_clip);
243237
244- const size_t n_imgs = clip_image_f32_batch_n_images (img_res_v. get () );
238+ const size_t n_imgs = clip_image_f32_batch_n_images (preprocessed_img );
245239
246- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v. get () , 0 );
240+ clip_image_f32 * img_res = clip_image_f32_get_img (preprocessed_img , 0 );
247241 *n_img_pos = clip_n_output_tokens (ctx_clip, img_res);
248242 bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
249243 if (!encoded) {
@@ -282,9 +276,25 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
282276 num_max_patches = 1 ;
283277 }
284278 float * image_embd;
279+ clip_image_f32_batch_ptr preprocessed_img (clip_image_f32_batch_init ());
280+ if (!clip_image_preprocess (ctx_clip, img, preprocessed_img.get ())) {
281+ LOG_ERR (" %s: unable to preprocess image\n " , __func__);
282+ return false ;
283+ }
284+
285285 if (clip_is_qwen2vl (ctx_clip)) {
286286 // qwen2vl doesn't split the image into chunks, so `num_max_patches` is not needed.
287- image_embd = (float *)malloc (clip_embd_nbytes_by_img (ctx_clip, img->nx , img->ny ));
287+ // preprocessing may resize the image to be LARGER than the original (padding up), so size the buffer for the largest of the original and all preprocessed entries
288+ int max_nx = img->nx ;
289+ int max_ny = img->ny ;
290+ for (int i=0 ;i<preprocessed_img->entries .size ();++i)
291+ {
292+ int a = preprocessed_img->entries [i].get ()->nx ;
293+ int b = preprocessed_img->entries [i].get ()->ny ;
294+ max_nx = std::max (max_nx,a);
295+ max_ny = std::max (max_ny,b);
296+ }
297+ image_embd = (float *)malloc (clip_embd_nbytes_by_img (ctx_clip, max_nx, max_ny));
288298 } else {
289299 image_embd = (float *)malloc (clip_embd_nbytes (ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
290300 }
@@ -294,7 +304,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
294304 }
295305
296306 int n_img_pos;
297- if (!encode_image_with_clip (ctx_clip, n_threads, img , image_embd, &n_img_pos)) {
307+ if (!encode_image_with_clip (ctx_clip, n_threads, preprocessed_img. get () , image_embd, &n_img_pos)) {
298308 LOG_ERR (" %s: cannot encode image, aborting\n " , __func__);
299309 free (image_embd);
300310 return false ;
0 commit comments