@@ -1114,15 +1114,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
11141114 if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
11151115 int pos_w = image_size_width/patch_size;
11161116 int pos_h = image_size_height/patch_size;
1117- if (ctx->minicpmv_version == 2 ) {
1118- pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 4096 , pos_w * pos_h, 1 );
1119- }
1120- else if (ctx->minicpmv_version == 3 ) {
1121- pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 3584 , pos_w * pos_h, 1 );
1122- }
1123- else if (ctx->minicpmv_version == 4 ) {
1124- pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 3584 , pos_w * pos_h, 1 );
1125- }
1117+ int n_output_dim = clip_n_mmproj_embd (ctx);
1118+ pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1 );
11261119 ggml_set_name (pos_embed, " pos_embed" );
11271120 ggml_set_input (pos_embed);
11281121 }
@@ -1460,23 +1453,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
14601453 }
14611454
14621455 { // attention
1463- int hidden_size = 4096 ;
1456+ int hidden_size = clip_n_mmproj_embd (ctx) ;
14641457 const int d_head = 128 ;
14651458 int n_head = hidden_size/d_head;
14661459 int num_query = 96 ;
14671460 if (ctx->minicpmv_version == 2 ) {
1468- hidden_size = 4096 ;
1469- n_head = hidden_size/d_head;
14701461 num_query = 96 ;
14711462 }
14721463 else if (ctx->minicpmv_version == 3 ) {
1473- hidden_size = 3584 ;
1474- n_head = hidden_size/d_head;
14751464 num_query = 64 ;
14761465 }
14771466 else if (ctx->minicpmv_version == 4 ) {
1478- hidden_size = 3584 ;
1479- n_head = hidden_size/d_head;
14801467 num_query = 64 ;
14811468 }
14821469
@@ -3136,19 +3123,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31363123 // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
31373124 // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
31383125 struct ggml_tensor * pos_embed = ggml_graph_get_tensor (gf, " pos_embed" );
3139- int embed_dim = 4096 ;
3140- if (ctx->minicpmv_version == 2 ) {
3141- embed_dim = 4096 ;
3142- }
3143- else if (ctx->minicpmv_version == 3 ) {
3144- embed_dim = 3584 ;
3145- }
3146- else if (ctx->minicpmv_version == 4 ) {
3147- embed_dim = 3584 ;
3148- }
3149- else {
3150- GGML_ABORT (" Unknown minicpmv version" );
3151- }
3126+ int embed_dim = clip_n_mmproj_embd (ctx);
31523127
31533128 // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
31543129 auto pos_embed_t = get_2d_sincos_pos_embed (embed_dim, std::make_pair (pos_w, pos_h));
0 commit comments