@@ -275,140 +275,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
275275
276276 const size_t n_imgs = clip_image_f32_batch_n_images (img_res_v.get ());
277277
278- if (clip_is_minicpmv (ctx_clip) || clip_is_qwen2vl (ctx_clip)) {
279- std::vector<float *> image_embd_v;
280- image_embd_v.resize (n_imgs);
281- clip_image_size load_image_size;
282-
283- for (size_t i = 0 ; i < n_imgs; i++) {
284- const int64_t t_img_enc_step_start_us = ggml_time_us ();
285- int nx = clip_image_f32_batch_nx (img_res_v.get (), i);
286- int ny = clip_image_f32_batch_ny (img_res_v.get (), i);
287- image_embd_v[i] = (float *)malloc (clip_embd_nbytes_by_img (ctx_clip, nx, ny));
288- int patch_size = 14 ;
289- load_image_size.width = nx;
290- load_image_size.height = ny;
291- clip_add_load_image_size (ctx_clip, &load_image_size);
292-
293- bool encoded = false ;
294- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), i);
295- if (clip_is_qwen2vl (ctx_clip)) {
296- encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd_v[i]);
297- }
298- else {
299- encoded = clip_image_encode (ctx_clip, n_threads, reshape_by_patch (img_res, patch_size), image_embd_v[i]);
300- }
301-
302- if (!encoded) {
303- LOG_ERR (" Unable to encode image - spatial_unpad - subimage %d of %d\n " , (int ) i+1 , (int ) n_imgs);
304- return false ;
305- }
306- const int64_t t_img_enc_steop_batch_us = ggml_time_us ();
307- LOG_INF (" %s: step %d of %d encoded in %8.2f ms\n " , __func__, (int )i+1 , (int )n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0 );
308- }
309- const int64_t t_img_enc_batch_us = ggml_time_us ();
310- LOG_INF (" %s: all %d segments encoded in %8.2f ms\n " , __func__, (int )n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0 );
311-
312- int n_img_pos_out = 0 ;
313- for (size_t i = 0 ; i < image_embd_v.size (); i++) {
314- int nx = clip_image_f32_batch_nx (img_res_v.get (), i);
315- int ny = clip_image_f32_batch_ny (img_res_v.get (), i);
316- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), i);
317- std::memcpy (
318- image_embd + n_img_pos_out * clip_n_mmproj_embd (ctx_clip),
319- image_embd_v[i],
320- clip_embd_nbytes_by_img (ctx_clip, nx, ny));
321- n_img_pos_out += clip_n_output_tokens (ctx_clip, img_res);
322- }
323- *n_img_pos = n_img_pos_out;
324- for (size_t i = 0 ; i < image_embd_v.size (); i++) {
325- free (image_embd_v[i]);
326- }
327- image_embd_v.clear ();
328- load_image_size.width = img->nx ;
329- load_image_size.height = img->ny ;
330- clip_add_load_image_size (ctx_clip, &load_image_size);
331- LOG_INF (" %s: load_image_size %d %d\n " , __func__, load_image_size.width , load_image_size.height );
332- }
333- else if (clip_is_glm (ctx_clip)){
334- struct clip_image_size * load_image_size = clip_image_size_init ();
335- load_image_size->width = clip_image_f32_batch_nx (img_res_v.get (), 0 );
336- load_image_size->height = clip_image_f32_batch_ny (img_res_v.get (), 0 );
337- clip_add_load_image_size (ctx_clip, load_image_size);
338-
339- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), 0 );
340- bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd);
341- int pos = int (load_image_size->width /clip_get_patch_size (ctx_clip)/2 );
342- *n_img_pos = (pos * pos + 2 );
343- if (!encoded){
344- LOG_ERR (" Unable to encode image \n " );
345- return false ;
346- }
347- }
348- else if (clip_is_pixtral (ctx_clip)){
349- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), 0 );
350- *n_img_pos = clip_n_output_tokens (ctx_clip, img_res);
351- bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
352- if (!encoded) {
353- LOG_ERR (" Unable to encode image\n " );
354-
355- return false ;
356- }
357- }
358- else if (strcmp (mm_patch_merge_type, " spatial_unpad" ) != 0 ) {
359- // flat / default llava-1.5 type embedding
360- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), 0 );
361- *n_img_pos = clip_n_output_tokens (ctx_clip, img_res);
362- bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
363- if (!encoded) {
364- LOG_ERR (" Unable to encode image\n " );
365-
366- return false ;
367- }
368- }
369- else {
370- // spatial_unpad llava-1.6 type embedding
371- // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
372- std::vector<float *> image_embd_v;
373- image_embd_v.resize (n_imgs);
374- for (size_t i = 0 ; i < n_imgs; i++) {
375- clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), i);
376- image_embd_v[i] = (float *)malloc (clip_embd_nbytes (ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
377- const bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
378- if (!encoded) {
379- LOG_ERR (" Unable to encode image - spatial_unpad - subimage %d of %d\n " , (int ) i+1 , (int ) n_imgs);
380- return false ;
381- }
382- }
383- const int64_t t_img_enc_batch_us = ggml_time_us ();
384- LOG_INF (" %s: %d segments encoded in %8.2f ms\n " , __func__, (int )n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0 );
385-
386- const int32_t * image_grid = clip_image_grid (ctx_clip);
387- const size_t num_gridpoints = get_clip_image_grid_size (ctx_clip);
388-
389- std::vector<std::pair<int , int >> grid_pinpoints;
390- for (size_t i = 0 ; i < num_gridpoints; i += 2 ) {
391- grid_pinpoints.push_back ({image_grid[i], image_grid[i+1 ]});
392- }
393-
394- const int32_t image_size = clip_get_image_size (ctx_clip);
395-
396- struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape ({img->nx ,img->ny }, grid_pinpoints, image_size);
397-
398- int n_img_pos_out;
399- clip_image_f32 * img_input = clip_image_f32_get_img (img_res_v.get (), 0 );
400- clip_llava_handle_patches (ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
401- *n_img_pos = n_img_pos_out;
402-
403- for (size_t i = 0 ; i < image_embd_v.size (); i++) {
404- free (image_embd_v[i]);
405- }
406- image_embd_v.clear ();
407-
408- // debug image/segment/normalization content:
409- // clip_image_u8 * tmp = clip_image_u8_init();
410- // clip_image_convert_f32_to_u8(*image_feature, *tmp);
411- // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
278+ clip_image_f32 * img_res = clip_image_f32_get_img (img_res_v.get (), 0 );
279+ *n_img_pos = clip_n_output_tokens (ctx_clip, img_res);
280+ bool encoded = clip_image_encode (ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
281+ if (!encoded) {
282+ LOG_ERR (" Unable to encode image\n " );
283+ return false ;
412284 }
413285
414286 LOG_INF (" %s: image embedding created: %d tokens\n " , __func__, *n_img_pos);
0 commit comments