@@ -919,112 +919,6 @@ struct clip_graph {
919919 return gf;
920920 }
921921
// Build the compute graph for the MiniCPM-V ViT embedding stage:
// patch input + learned position embeddings + optional pre-layernorm.
// The "positions" tensor is declared as a graph input and is filled in
// by the caller at eval time; returns gf with the result expanded into it.
ggml_cgraph * build_minicpmv_embedding() {
    // this path requires a model without a class (CLS) embedding
    GGML_ASSERT(model.class_embedding == nullptr);
    const int n_pos = n_patches;

    // for selecting learned pos embd, used by ViT
    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // gather one learned position embedding row per patch index
    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    ggml_tensor * inp = build_inp();
    if (learned_pos_embd) {
        inp = ggml_add(ctx0, inp, learned_pos_embd);
        cb(inp, "pos_embed", -1);
    }
    ggml_tensor * embeddings = inp;

    // pre-layernorm (optional: only present in some checkpoints)
    if (model.pre_ln_w) {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "pre_ln");
        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
    }

    ggml_build_forward_expand(gf, embeddings);
    return gf;
}
950-
// Build the compute graph for the MiniCPM-V resampler projector: a single
// cross-attention block that maps the ViT patch embeddings onto a fixed set
// of learned query vectors, followed by a post-layernorm and a final linear
// projection. Both "embeddings" (ViT output) and "pos_embed" are declared as
// graph inputs and are filled in by the caller at eval time.
ggml_cgraph * build_minicpmv_resampler() {
    const int batch_size = 1;

    // this path requires a model without a class (CLS) embedding
    GGML_ASSERT(model.class_embedding == nullptr);
    const int n_pos = n_patches;

    const int image_size_width  = img.nx;
    const int image_size_height = img.ny;
    const int patch_size  = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));

    // position embeddings for the projector (not for ViT)
    int n_output_dim = clip_n_mmproj_embd(ctx);
    ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
    ggml_set_name(pos_embed, "pos_embed");
    ggml_set_input(pos_embed);

    // ViT output, fed in as a graph input
    // NOTE(review): 1152 is a hardcoded per-patch embedding dim — confirm it
    // matches the loaded checkpoint's ViT hidden size for all supported variants
    struct ggml_tensor * embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1152, num_patches);
    ggml_set_name(embeddings, "embeddings");
    ggml_set_input(embeddings);

    // resampler projector (it is just another transformer)

    ggml_tensor * q = model.mm_model_query;            // learned queries
    ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);

    // norm
    q = build_norm(q, model.mm_model_ln_q_w,  model.mm_model_ln_q_b,  NORM_TYPE_NORMAL, eps, -1);
    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

    // k = v + pos_embed
    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

    // attention
    {
        int n_embd = clip_n_mmproj_embd(ctx);
        const int d_head = 128;           // fixed head dim used by this resampler
        int n_head = n_embd/d_head;
        // Use actual config value if available, otherwise fall back to hardcoded values
        int num_query = ctx->model.hparams.minicpmv_query_num;

        // project q/k/v with bias
        ggml_tensor * Q = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
            model.mm_model_attn_q_b);
        ggml_tensor * K = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
            model.mm_model_attn_k_b);
        ggml_tensor * V = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
            model.mm_model_attn_v_b);

        // split heads: queries attend over all n_pos patch positions
        Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
        K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
        V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);

        cb(Q, "resampler_Q", -1);
        cb(K, "resampler_K", -1);
        cb(V, "resampler_V", -1);

        embeddings = build_attn(
            model.mm_model_attn_o_w,
            model.mm_model_attn_o_b,
            Q, K, V, nullptr, kq_scale, -1);
        cb(embeddings, "resampler_attn_out", -1);
    }
    // layernorm
    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);

    // projection
    embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
1027-
1028922 ggml_cgraph * build_internvl () {
1029923 GGML_ASSERT (model.class_embedding != nullptr );
1030924 GGML_ASSERT (model.position_embeddings != nullptr );
@@ -3371,10 +3265,9 @@ struct llava_uhd {
33713265 const int original_width = original_size.width ;
33723266 const int original_height = original_size.height ;
33733267
3374- bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
3268+ const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
33753269 const bool has_pinpoints = !ctx->model .hparams .image_res_candidates .empty ();
33763270
3377- // has_slices = false;
33783271 if (!has_slices) {
33793272 // skip slicing logic
33803273 res.overview_size = clip_image_size{slice_size, slice_size};
0 commit comments