@@ -81,6 +81,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
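+// GGUF key for the MiniCPM-V version; the resampler shapes below depend on it (2 -> 4096-dim, 3 -> 3584-dim)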
+#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
@@ -526,6 +527,7 @@ struct clip_ctx {
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
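+    // defaults to 2 so GGUF files converted before this key existed keep the previous (v2) shapes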
+    int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -641,7 +643,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
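+        // the positional-embedding width must match the projector output dim: 4096 for version 2, 3584 for version 3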
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
@@ -768,8 +775,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_gelu(ctx0, embeddings);
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
         // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -949,10 +956,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     { // attention
-        const int hidden_size = 4096;
+        int hidden_size = 4096;
         const int d_head = 128;
-        const int n_head = hidden_size/d_head;
-        const int num_query = 96;
+        int n_head = hidden_size/d_head;
+        int num_query = 96;
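+        // per-version resampler geometry: hidden_size sets the attention width,
+        // num_query the number of output embeddings per image (cf. clip_n_patches)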
+        if (ctx->minicpmv_version == 2) {
+            hidden_size = 4096;
+            n_head = hidden_size/d_head;
+            num_query = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            hidden_size = 3584;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
 
         struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
         Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1149,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
     }
 
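+    // the version key is optional; when it is missing, minicpmv_version keeps its default of 2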
+    idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
+    if (idx != -1) {
+        new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
+    }
+
     // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
 
     GGML_ASSERT(new_clip->has_vision_encoder);
@@ -1910,10 +1932,12 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
-    if (clip_is_minicpmv(ctx)) {
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+
+    if (clip_is_minicpmv(ctx)){
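+        // uhd_slice_image now takes an explicit cap on the number of image slices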
+        int max_slice_nums = 9;
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
         res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i) {
+        for (size_t i = 0; i < imgs.size(); ++i){
             res_imgs->size += imgs[i].size();
         }
         res_imgs->data = new clip_image_f32[res_imgs->size];
@@ -2146,7 +2170,12 @@ int clip_n_patches(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
     } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        n_patches = 96;
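+        // must agree with num_query in the resampler attention block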
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
     }
 
     return n_patches;
@@ -2282,6 +2311,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
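+    // hoisted from the pos_embed block further down: the bucketed position ids below also need the patch grid size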
+    if (ctx->load_image_size == nullptr) {
+        ctx->load_image_size = clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -2316,8 +2350,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
             int * positions_data = (int *)malloc(ggml_nbytes(positions));
-            for (int i = 0; i < num_positions; i++) {
-                positions_data[i] = std::floor(70.0*i/num_positions);
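+            // replace the flat 1D interpolation with 2D bucketing: each axis of the pos_h x pos_w patch grid
+            // is mapped into [0, 70) and the position id becomes row_bucket*70 + col_bucket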
+            int bucket_coords_h[70];
+            int bucket_coords_w[70];
+            for (int i = 0; i < pos_h; i++){
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++){
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++){
+                for (int j = 0; j < pos_w; j++){
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
             }
             ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
             free(positions_data);
@@ -2328,12 +2372,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
             // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
             struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-            if (ctx->load_image_size == nullptr) {
-                ctx->load_image_size = clip_image_size_init();
-            }
-            int pos_w = ctx->load_image_size->width/patch_size;
-            int pos_h = ctx->load_image_size->height/patch_size;
             int embed_dim = 4096;
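+            // the sincos table width must match the projector output dim, same values as clip_n_mmproj_embd()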
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2346,7 +2391,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
             free(pos_embed_data);
         }
-    } else {
+    }
+    else {
         {
             if (ctx->has_class_embedding) {
                 struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@@ -2548,13 +2594,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        return 4096;
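+        // resampler output width per MiniCPM-V version, mirroring hidden_size in the attention block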
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
     }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
 
-bool clip_is_minicpmv(const struct clip_ctx * ctx) {
-    return ctx->has_minicpmv_projector;
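+// now reports the MiniCPM-V version (2 or 3) instead of a bool; 0 means "not MiniCPM-V", so existing truthiness checks keep working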
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
 }