Skip to content

Commit c0edde6

Browse files
committed
hey what do you know, it worked
1 parent d04b4ee commit c0edde6

File tree

1 file changed

+6
-134
lines changed

1 file changed

+6
-134
lines changed

tools/mtmd/llava.cpp

Lines changed: 6 additions & 134 deletions
Original file line number | Diff line number | Diff line change
@@ -275,140 +275,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
275 275

276 276
const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
277 277

278-
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
279-
std::vector<float *> image_embd_v;
280-
image_embd_v.resize(n_imgs);
281-
clip_image_size load_image_size;
282-
283-
for (size_t i = 0; i < n_imgs; i++) {
284-
const int64_t t_img_enc_step_start_us = ggml_time_us();
285-
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
286-
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
287-
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
288-
int patch_size = 14;
289-
load_image_size.width = nx;
290-
load_image_size.height = ny;
291-
clip_add_load_image_size(ctx_clip, &load_image_size);
292-
293-
bool encoded = false;
294-
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
295-
if (clip_is_qwen2vl(ctx_clip)) {
296-
encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
297-
}
298-
else {
299-
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
300-
}
301-
302-
if (!encoded) {
303-
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
304-
return false;
305-
}
306-
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
307-
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
308-
}
309-
const int64_t t_img_enc_batch_us = ggml_time_us();
310-
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
311-
312-
int n_img_pos_out = 0;
313-
for (size_t i = 0; i < image_embd_v.size(); i++) {
314-
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
315-
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
316-
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
317-
std::memcpy(
318-
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
319-
image_embd_v[i],
320-
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
321-
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
322-
}
323-
*n_img_pos = n_img_pos_out;
324-
for (size_t i = 0; i < image_embd_v.size(); i++) {
325-
free(image_embd_v[i]);
326-
}
327-
image_embd_v.clear();
328-
load_image_size.width = img->nx;
329-
load_image_size.height = img->ny;
330-
clip_add_load_image_size(ctx_clip, &load_image_size);
331-
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
332-
}
333-
else if (clip_is_glm(ctx_clip)){
334-
struct clip_image_size * load_image_size = clip_image_size_init();
335-
load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
336-
load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
337-
clip_add_load_image_size(ctx_clip, load_image_size);
338-
339-
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
340-
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
341-
int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
342-
*n_img_pos = (pos * pos + 2);
343-
if (!encoded){
344-
LOG_ERR("Unable to encode image \n");
345-
return false;
346-
}
347-
}
348-
else if (clip_is_pixtral(ctx_clip)){
349-
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
350-
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
351-
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
352-
if (!encoded) {
353-
LOG_ERR("Unable to encode image\n");
354-
355-
return false;
356-
}
357-
}
358-
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
359-
// flat / default llava-1.5 type embedding
360-
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
361-
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
362-
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
363-
if (!encoded) {
364-
LOG_ERR("Unable to encode image\n");
365-
366-
return false;
367-
}
368-
}
369-
else {
370-
// spatial_unpad llava-1.6 type embedding
371-
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
372-
std::vector<float *> image_embd_v;
373-
image_embd_v.resize(n_imgs);
374-
for (size_t i = 0; i < n_imgs; i++) {
375-
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
376-
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
377-
const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
378-
if (!encoded) {
379-
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
380-
return false;
381-
}
382-
}
383-
const int64_t t_img_enc_batch_us = ggml_time_us();
384-
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
385-
386-
const int32_t * image_grid = clip_image_grid(ctx_clip);
387-
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
388-
389-
std::vector<std::pair<int, int>> grid_pinpoints;
390-
for (size_t i = 0; i < num_gridpoints; i += 2) {
391-
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
392-
}
393-
394-
const int32_t image_size = clip_get_image_size(ctx_clip);
395-
396-
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
397-
398-
int n_img_pos_out;
399-
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
400-
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
401-
*n_img_pos = n_img_pos_out;
402-
403-
for (size_t i = 0; i < image_embd_v.size(); i++) {
404-
free(image_embd_v[i]);
405-
}
406-
image_embd_v.clear();
407-
408-
// debug image/segment/normalization content:
409-
// clip_image_u8 * tmp = clip_image_u8_init();
410-
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
411-
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
278+
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
279+
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
280+
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
281+
if (!encoded) {
282+
LOG_ERR("Unable to encode image\n");
283+
return false;
412 284
}
413 285

414 286
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

0 commit comments

Comments
 (0)