@@ -501,15 +501,14 @@ struct llama_vision_processor_uhd : llama_vision_processor {
501501 llama_image_u8 source_image;
502502 bicubic_resize (img, source_image, best_size.first , best_size.second );
503503 // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
504- images[images.size ()-1 ].push_back (source_image);
505- }
506- else if (multiple > 1 ) {
504+ images.back ().push_back (source_image);
505+ } else if (multiple > 1 ) {
507506 auto best_size = find_best_resize (original_size, scale_resolution, patch_size);
508507 llama_image_u8 source_image;
509508 bicubic_resize (img, source_image, best_size.first , best_size.second );
510509 // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
511510 LLAMA_LOG_DEBUG (" %s: image_size: %d %d; source_image size: %d %d\n " , __func__, img.nx , img.ny , best_size.first , best_size.second );
512- images[images. size ()- 1 ] .push_back (source_image);
511+ images. back () .push_back (source_image);
513512
514513 std::pair<int , int > best_grid = find_best_grid (max_slice_nums, multiple, log_ratio);
515514 LLAMA_LOG_DEBUG (" %s: image_size: %d %d; best_grid: %d %d\n " , __func__, img.nx , img.ny , best_grid.first , best_grid.second );
@@ -541,7 +540,7 @@ struct llama_vision_processor_uhd : llama_vision_processor {
541540 patch.buf [j+2 ] = refine_image.buf [i+2 ];
542541 }
543542 }
544- images[images. size ()- 1 ] .push_back (patch);
543+ images. back () .push_back (patch);
545544 }
546545 }
547546 }
@@ -948,7 +947,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
948947 // set raw input
949948 {
950949 struct ggml_tensor * inp_raw = ggml_graph_get_tensor (gf, " inp_raw" );
951- float * data = ( float *) malloc ( ggml_nbytes (inp_raw));
950+ std::vector< float > inp_buf ( ggml_nelements (inp_raw));
952951
953952 for (int i = 0 ; i < batch_size; i++) {
954953 const int nx = inp.px * inp.n_px ;
@@ -959,48 +958,71 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
959958 for (int k = 0 ; k < 3 ; k++) {
960959 for (int y = 0 ; y < ny; y++) {
961960 for (int x = 0 ; x < nx; x++) {
962- data [(b * 3 * n) + k * n + y * nx + x] = inp.buf [b][3 * (y * nx + x) + k];
961+ inp_buf [(b * 3 * n) + k * n + y * nx + x] = inp.buf [b][3 * (y * nx + x) + k];
963962 }
964963 }
965964 }
966965 }
967966 }
968- ggml_backend_tensor_set (inp_raw, data, 0 , ggml_nbytes (inp_raw));
969- free (data);
967+ ggml_backend_tensor_set (inp_raw, inp_buf.data (), 0 , ggml_nbytes (inp_raw));
970968 }
971969
972970 if (model.class_embedding ) {
973971 struct ggml_tensor * inp_embd = ggml_graph_get_tensor (gf, " inp_embd" );
974972 ggml_set_zero (inp_embd);
975973 }
976974
977- {
975+ if (hparams.arch == LLM_ARCH_VISION_MINICPMV) {
976+ // inspired from siglip:
977+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
978+ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
978979 struct ggml_tensor * positions = ggml_graph_get_tensor (gf, " inp_pos" );
980+ std::vector<int > pos_buf (ggml_nelements (positions));
981+ GGML_ASSERT (num_positions == (int )pos_buf.size ());
979982
980- int * positions_data = (int *)malloc (ggml_nbytes (positions));
983+ int bucket_coords_h[70 ];
984+ int bucket_coords_w[70 ];
985+ for (size_t i = 0 ; i < inp.n_py ; i++) {
986+ bucket_coords_h[i] = std::floor (70.0 *i/inp.n_py );
987+ }
988+ for (size_t i = 0 ; i < inp.n_px ; i++) {
989+ bucket_coords_w[i] = std::floor (70.0 *i/inp.n_px );
990+ }
991+ for (size_t i = 0 , id = 0 ; i < inp.n_py ; i++){
992+ for (size_t j = 0 ; j < inp.n_px ; j++){
993+ pos_buf[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
994+ }
995+ }
996+ ggml_backend_tensor_set (positions, pos_buf.data (), 0 , ggml_nbytes (positions));
997+
998+ } else {
999+ struct ggml_tensor * positions = ggml_graph_get_tensor (gf, " inp_pos" );
1000+ std::vector<int > pos_buf (ggml_nelements (positions));
1001+ GGML_ASSERT (num_positions == (int )pos_buf.size ());
9811002 for (int i = 0 ; i < num_positions; i++) {
982- positions_data [i] = i;
1003+ pos_buf [i] = i;
9831004 }
984- ggml_backend_tensor_set (positions, positions_data, 0 , ggml_nbytes (positions));
985- free (positions_data);
1005+ ggml_backend_tensor_set (positions, pos_buf.data (), 0 , ggml_nbytes (positions));
9861006 }
9871007
9881008 struct ggml_tensor * patches = ggml_graph_get_tensor (gf, " inp_patches" );
9891009 if (patches) {
990- int * patches_data = (int *)malloc (ggml_nbytes (patches));
1010+ std::vector<int > patches_buf (ggml_nelements (patches));
1011+ GGML_ASSERT (num_patches == (int )patches_buf.size ());
9911012 for (int i = 0 ; i < num_patches; i++) {
992- patches_data [i] = i + 1 ;
1013+ patches_buf [i] = i + 1 ;
9931014 }
994- ggml_backend_tensor_set (patches, patches_data, 0 , ggml_nbytes (patches));
995- free (patches_data);
1015+ ggml_backend_tensor_set (patches, patches_buf.data (), 0 , ggml_nbytes (patches));
9961016 }
9971017
9981018 // compute
1019+ int64_t t_start = ggml_time_ms ();
9991020 ggml_backend_sched_graph_compute (ctx.sched , gf);
10001021
10011022 // the last node is the embedding tensor
10021023 struct ggml_tensor * output_node = ggml_graph_node (gf, -1 );
10031024 // LLAMA_LOG_INFO("%s: output tensor shape = %lld %lld %lld %lld\n", __func__, output->ne[0], output->ne[1], output->ne[2], output->ne[3]);
1025+ LLAMA_LOG_DEBUG (" %s: compute time = %lld ms\n " , __func__, ggml_time_ms () - t_start);
10041026
10051027 // copy output node to context
10061028 if (ctx.ctx_ggml ) {
0 commit comments