@@ -718,6 +718,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         else if (ctx->minicpmv_version == 3) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
         }
+        else if (ctx->minicpmv_version == 4) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
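Note: version 4 presumably corresponds to MiniCPM-o 2.6, and it reuses the 3584-wide positional-embedding tensor of version 3 (MiniCPM-V 2.6). The same 3584 constant recurs in three more hunks below; see the helper sketched after the resampler hunk.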
@@ -1053,6 +1056,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             n_head = hidden_size/d_head;
             num_query = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            hidden_size = 3584;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
 
         struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
         Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
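Since the per-version embedding width now appears in four near-identical else-if chains, a small helper could centralize it. A minimal sketch; `minicpmv_embed_dim` is a hypothetical name, not part of this patch, and the version-2 value is an assumption taken from the existing (unshown) branch:

    // Hypothetical helper (not in this patch): one place for the per-version
    // resampler embedding width used by clip_image_build_graph,
    // clip_image_batch_encode and clip_n_mmproj_embd.
    static int minicpmv_embed_dim(int version) {
        switch (version) {
            case 2:  return 4096; // assumption: value of the existing version-2 branch
            case 3:  return 3584; // MiniCPM-V 2.6
            case 4:  return 3584; // new in this patch, same width as version 3
            default: return 0;    // unknown version
        }
    }

Each `else if (ctx->minicpmv_version == ...)` chain touched here would then collapse to a single call.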
@@ -2041,6 +2049,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                 images[images.size()-1].push_back(patch);
             }
         }
+        clip_image_u8_free(refine_image);
     }
     return images;
 }
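Note on this fix: the slicing loop copies pixel data out of `refine_image` into the per-slice `patch` images, so the intermediate refine image is safe to free once slicing finishes, plugging what appears to have been a per-call leak.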
@@ -2079,6 +2088,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
                 clip_image_f32_free(res);
             }
         }
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                if (imgs[i][j] != nullptr) {
+                    clip_image_u8_free(imgs[i][j]);
+                }
+            }
+        }
         return true;
     }
     else if (ctx->has_qwen2vl_merger) {
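Note: by this point each u8 slice returned by `uhd_slice_image` has been converted into a freshly allocated f32 image, so the u8 slices are no longer needed and can be freed before returning; the nullptr check guards against slots the slicer may have left empty.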
@@ -2335,6 +2351,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
     } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         int patch_size = params.patch_size * 2;
         int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2514,8 +2533,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
             int * positions_data = (int *)malloc(ggml_nbytes(positions));
-            int bucket_coords_h[70];
-            int bucket_coords_w[70];
+            int bucket_coords_h[1024];
+            int bucket_coords_w[1024];
             for (int i = 0; i < pos_h; i++){
                 bucket_coords_h[i] = std::floor(70.0*i/pos_h);
             }
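Note: the bucket values themselves stay in [0, 70) because of the `std::floor(70.0*i/pos_h)` mapping, but the arrays are indexed up to `pos_h`/`pos_w`, which can exceed 70 for large inputs; hence the enlargement to 1024. A defensive bound check could make the new limit explicit. A minimal sketch (an assumption, not in the patch):

    // Hypothetical guard: the coordinate arrays above are sized 1024.
    GGML_ASSERT(pos_h <= 1024 && pos_w <= 1024);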
@@ -2543,6 +2562,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             else if (ctx->minicpmv_version == 3) {
                 embed_dim = 3584;
             }
+            else if (ctx->minicpmv_version == 4) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
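Note: `embed_dim` must match the width of the `pos_embed` tensor allocated in `clip_image_build_graph` (3584 for versions 3 and 4), since `get_2d_sincos_pos_embed` produces exactly that many floats per position and the result is copied into the tensor's buffer.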
@@ -2786,6 +2808,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             return 3584;
         }
+        else if (ctx->minicpmv_version == 4) {
+            return 3584;
+        }
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         return ctx->vision_model.mm_1_b->ne[0];
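Note: this is the fourth occurrence of the per-version 3584 constant; the hypothetical `minicpmv_embed_dim` helper sketched earlier would cover this return value as well.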