@@ -1083,16 +1083,24 @@ struct clip_graph {
10831083 }
10841084
10851085 ggml_cgraph * build_minicpmv () {
1086- const int batch_size = 1 ;
1087-
10881086 GGML_ASSERT (model.class_embedding == nullptr );
1089- const int n_pos = n_patches;
1087+ const int n_pos = n_patches;
1088+ const int n_embd_proj = clip_n_mmproj_embd (ctx);
10901089
10911090 // position embeddings for the projector (not for ViT)
1092- int n_output_dim = clip_n_mmproj_embd (ctx);
1093- ggml_tensor * pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
1094- ggml_set_name (pos_embed, " pos_embed" );
1095- ggml_set_input (pos_embed);
1091+ // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
1092+ // base frequency omega
1093+ ggml_tensor * omega = ggml_new_tensor_1d (ctx0, GGML_TYPE_F32, n_embd_proj / 4 );
1094+ ggml_set_name (omega, " omega" );
1095+ ggml_set_input (omega);
1096+
1097+ // 2D input positions (using float for sinusoidal embeddings)
1098+ ggml_tensor * pos_h = ggml_new_tensor_2d (ctx0, GGML_TYPE_F32, 1 , n_pos);
1099+ ggml_set_name (pos_h, " pos_h" );
1100+ ggml_set_input (pos_h);
1101+ ggml_tensor * pos_w = ggml_new_tensor_2d (ctx0, GGML_TYPE_F32, 1 , n_pos);
1102+ ggml_set_name (pos_w, " pos_w" );
1103+ ggml_set_input (pos_w);
10961104
10971105 // for selecting learned pos embd, used by ViT
10981106 struct ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
@@ -1103,7 +1111,7 @@ struct clip_graph {
11031111
11041112 ggml_tensor * inp = build_inp ();
11051113 ggml_tensor * embeddings = build_vit (
1106- inp, n_patches ,
1114+ inp, n_pos ,
11071115 NORM_TYPE_NORMAL,
11081116 hparams.ffn_op ,
11091117 learned_pos_embd,
@@ -1115,17 +1123,39 @@ struct clip_graph {
11151123 ggml_tensor * v = ggml_mul_mat (ctx0, model.mm_model_kv_proj , embeddings);
11161124
11171125 // norm
1118- q = build_norm (q, model.mm_model_ln_q_w , model.mm_model_ln_q_b , NORM_TYPE_NORMAL, eps, -1 );
1126+ q = build_norm (q, model.mm_model_ln_q_w , model.mm_model_ln_q_b , NORM_TYPE_NORMAL, eps, -1 );
11191127 v = build_norm (v, model.mm_model_ln_kv_w , model.mm_model_ln_kv_b , NORM_TYPE_NORMAL, eps, -1 );
11201128
1129+ // calculate sinusoidal pos embd
1130+ ggml_tensor * pos_embed = nullptr ;
1131+ {
1132+ // outer product
1133+ ggml_tensor * omega_b = ggml_repeat_4d (ctx0, omega, omega->ne [0 ], n_pos, 1 , 1 ); // n_pos rows
1134+ ggml_tensor * theta_x = ggml_mul (ctx0, omega_b, pos_w);
1135+ ggml_tensor * theta_y = ggml_mul (ctx0, omega_b, pos_h);
1136+ // sin and cos
1137+ ggml_tensor * pos_embd_x = ggml_concat (
1138+ ctx0,
1139+ ggml_sin (ctx0, theta_x),
1140+ ggml_cos (ctx0, theta_x),
1141+ 0 // concat on first dim
1142+ );
1143+ ggml_tensor * pos_embd_y = ggml_concat (
1144+ ctx0,
1145+ ggml_sin (ctx0, theta_y),
1146+ ggml_cos (ctx0, theta_y),
1147+ 0 // concat on first dim
1148+ );
1149+ pos_embed = ggml_concat (ctx0, pos_embd_x, pos_embd_y, 0 );
1150+ }
1151+
11211152 // k = v + pos_embed
11221153 ggml_tensor * k = ggml_add (ctx0, v, pos_embed);
11231154
11241155 // attention
11251156 {
1126- int n_embd = clip_n_mmproj_embd (ctx);
11271157 const int d_head = 128 ;
1128- int n_head = n_embd /d_head;
1158+ int n_head = n_embd_proj /d_head;
11291159 // Use actual config value if available, otherwise fall back to hardcoded values
11301160 int num_query = ctx->model .hparams .minicpmv_query_num ;
11311161 ggml_tensor * Q = ggml_add (ctx0,
@@ -4564,92 +4594,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
45644594 return n_patches;
45654595}
45664596
4567- static std::vector<std::vector<std::vector<float >>> get_1d_sincos_pos_embed_from_grid_new (int embed_dim, const std::vector<std::vector<float >> & pos) { // removed helper: for each pos[h][w], returns emb[h][w] = [sin(pos * omega) | cos(pos * omega)], a vector of length embed_dim
4568- assert (embed_dim % 2 == 0 ); // the output vector is split into a sin half and a cos half
4569- int H = pos.size ();
4570- int W = pos[0 ].size (); // reads pos[0], so pos must contain at least one row
4571-
4572- std::vector<float > omega (embed_dim / 2 );
4573- for (int i = 0 ; i < embed_dim / 2 ; ++i) {
4574- omega[i] = 1.0 / pow (10000.0 , static_cast <float >(i) / (embed_dim / 2 )); // frequency i = 10000^(-i / (embed_dim / 2))
4575- }
4576-
4577- std::vector<std::vector<std::vector<float >>> emb (H, std::vector<std::vector<float >>(W, std::vector<float >(embed_dim))); // H x W x embed_dim, zero-initialized
4578- for (int h = 0 ; h < H; ++h) {
4579- for (int w = 0 ; w < W; ++w) {
4580- for (int d = 0 ; d < embed_dim / 2 ; ++d) {
4581- float out_value = pos[h][w] * omega[d]; // angle = position * frequency
4582- emb[h][w][d] = sin (out_value); // first half: sin
4583- emb[h][w][d + embed_dim / 2 ] = cos (out_value); // second half: cos
4584- }
4585- }
4586- }
4587-
4588- return emb;
4589- }
4590-
4591- static std::vector<std::vector<std::vector<float >>> get_2d_sincos_pos_embed_from_grid (int embed_dim, const std::vector<std::vector<std::vector<float >>> & grid) { // removed helper: embeds grid[0] and grid[1] separately with embed_dim / 2 each, then joins them per cell
4592- assert (embed_dim % 2 == 0 ); // each of the two grids gets exactly half of the output vector
4593- std::vector<std::vector<std::vector<float >>> emb_h = get_1d_sincos_pos_embed_from_grid_new (embed_dim / 2 , grid[0 ]); // (H, W, D/2)
4594- std::vector<std::vector<std::vector<float >>> emb_w = get_1d_sincos_pos_embed_from_grid_new (embed_dim / 2 , grid[1 ]); // (H, W, D/2)
4595-
4596- int H = emb_h.size ();
4597- int W = emb_h[0 ].size (); // reads emb_h[0], so the grids must contain at least one row
4598- std::vector<std::vector<std::vector<float >>> emb (H, std::vector<std::vector<float >>(W, std::vector<float >(embed_dim)));
4599-
4600- for (int h = 0 ; h < H; ++h) {
4601- for (int w = 0 ; w < W; ++w) {
4602- for (int d = 0 ; d < embed_dim / 2 ; ++d) {
4603- emb[h][w][d] = emb_h[h][w][d]; // first half comes from grid[0]
4604- emb[h][w][d + embed_dim / 2 ] = emb_w[h][w][d]; // second half comes from grid[1]
4605- }
4606- }
4607- }
4608- return emb;
4609- }
4610-
4611- static std::vector<std::vector<float >> get_2d_sincos_pos_embed (int embed_dim, const std::pair<int , int > image_size) { // removed helper: builds an integer coordinate grid of image_size.first x image_size.second cells and returns one embed_dim-long sin/cos vector per cell
4612- int grid_h_size = image_size.first ; // NOTE(review): the call site visible in this diff passes (pos_w, pos_h), so `first` is the caller's pos_w
4613- int grid_w_size = image_size.second ;
4614-
4615- std::vector<float > grid_h (grid_h_size);
4616- std::vector<float > grid_w (grid_w_size);
4617-
4618- for (int i = 0 ; i < grid_h_size; ++i) {
4619- grid_h[i] = static_cast <float >(i); // coordinates 0 .. grid_h_size - 1
4620- }
4621- for (int i = 0 ; i < grid_w_size; ++i) {
4622- grid_w[i] = static_cast <float >(i); // coordinates 0 .. grid_w_size - 1
4623- }
4624-
4625- std::vector<std::vector<float >> grid (grid_h_size, std::vector<float >(grid_w_size));
4626- for (int h = 0 ; h < grid_h_size; ++h) {
4627- for (int w = 0 ; w < grid_w_size; ++w) {
4628- grid[h][w] = grid_w[w]; // only fixes the shape: both copies in grid_2d are fully overwritten below
4629- }
4630- }
4631- std::vector<std::vector<std::vector<float >>> grid_2d = {grid, grid}; // two planes of shape (grid_h_size, grid_w_size)
4632- for (int h = 0 ; h < grid_h_size; ++h) {
4633- for (int w = 0 ; w < grid_w_size; ++w) {
4634- grid_2d[0 ][h][w] = grid_h[h]; // plane 0 holds the h coordinate of each cell
4635- grid_2d[1 ][h][w] = grid_w[w]; // plane 1 holds the w coordinate of each cell
4636- }
4637- }
4638-
4639- std::vector<std::vector<std::vector<float >>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid (embed_dim, grid_2d); // indexed [h][w][d]
4640-
4641- int H = image_size.first ;
4642- int W = image_size.second ;
4643- std::vector<std::vector<float >> pos_embed_2d (H * W, std::vector<float >(embed_dim));
4644- for (int h = 0 ; h < H; ++h) {
4645- for (int w = 0 ; w < W; ++w) {
4646- pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; // flattened with w as the slow index: output row = w * H + h
4647- }
4648- }
4649-
4650- return pos_embed_2d;
4651- }
4652-
46534597bool clip_image_encode (struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
46544598 clip_image_f32_batch imgs;
46554599 clip_image_f32_ptr img_copy (clip_image_f32_init ());
@@ -4788,22 +4732,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
47884732 }
47894733 set_input_i32 (" positions" , positions);
47904734
4791- // inspired from resampler of Qwen-VL:
4792- // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
4793- // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
4794- int embed_dim = clip_n_mmproj_embd (ctx);
4795-
4796- // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
4797- auto pos_embed_t = get_2d_sincos_pos_embed (embed_dim, std::make_pair (pos_w, pos_h));
4798-
4799- std::vector<float > pos_embed (embed_dim * pos_w * pos_h);
4800- for (int i = 0 ; i < pos_w * pos_h; ++i){
4801- for (int j = 0 ; j < embed_dim; ++j){
4802- pos_embed[i * embed_dim + j] = pos_embed_t [i][j];
4803- }
4735+ // inputs for resampler projector
4736+ // set the 2D positions (using float for sinusoidal embedding)
4737+ int n_patches_per_col = image_size_width / patch_size;
4738+ std::vector<float > pos_data (n_pos);
4739+ // dimension H
4740+ for (int i = 0 ; i < n_pos; i++) {
4741+ pos_data[i] = static_cast <float >(i / n_patches_per_col);
48044742 }
4805-
4806- set_input_f32 (" pos_embed" , pos_embed);
4743+ set_input_f32 (" pos_h" , pos_data);
4744+ // dimension W
4745+ for (int i = 0 ; i < n_pos; i++) {
4746+ pos_data[i] = static_cast <float >(i % n_patches_per_col);
4747+ }
4748+ set_input_f32 (" pos_w" , pos_data);
4749+ // base frequency omega
4750+ const float base_freq = 10000 .0f ;
4751+ const int n_embd_proj = clip_n_mmproj_embd (ctx);
4752+ std::vector<float > omega (n_embd_proj / 4 );
4753+ for (int i = 0 ; i < n_embd_proj / 4 ; ++i) {
4754+ omega[i] = 1 .0f / std::pow (base_freq, static_cast <float >(i) / (n_embd_proj / 4 ));
4755+ }
4756+ set_input_f32 (" omega" , omega);
48074757 } break ;
48084758 case PROJECTOR_TYPE_QWEN2VL:
48094759 case PROJECTOR_TYPE_QWEN3VL:
0 commit comments