Skip to content

Commit 4882f0f

Browse files
authored
clip: implement minicpm-v sinusoidal embd using GGML (ggml-org#17036)
* clip: implement minicpm-v sinusoidal embd using GGML * fix repeat op
1 parent 9d7c518 commit 4882f0f

File tree

1 file changed

+62
-112
lines changed

1 file changed

+62
-112
lines changed

tools/mtmd/clip.cpp

Lines changed: 62 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,16 +1083,24 @@ struct clip_graph {
10831083
}
10841084

10851085
ggml_cgraph * build_minicpmv() {
1086-
const int batch_size = 1;
1087-
10881086
GGML_ASSERT(model.class_embedding == nullptr);
1089-
const int n_pos = n_patches;
1087+
const int n_pos = n_patches;
1088+
const int n_embd_proj = clip_n_mmproj_embd(ctx);
10901089

10911090
// position embeddings for the projector (not for ViT)
1092-
int n_output_dim = clip_n_mmproj_embd(ctx);
1093-
ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
1094-
ggml_set_name(pos_embed, "pos_embed");
1095-
ggml_set_input(pos_embed);
1091+
// see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
1092+
// base frequency omega
1093+
ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
1094+
ggml_set_name(omega, "omega");
1095+
ggml_set_input(omega);
1096+
1097+
// 2D input positions (using float for sinusoidal embeddings)
1098+
ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
1099+
ggml_set_name(pos_h, "pos_h");
1100+
ggml_set_input(pos_h);
1101+
ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
1102+
ggml_set_name(pos_w, "pos_w");
1103+
ggml_set_input(pos_w);
10961104

10971105
// for selecting learned pos embd, used by ViT
10981106
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
@@ -1103,7 +1111,7 @@ struct clip_graph {
11031111

11041112
ggml_tensor * inp = build_inp();
11051113
ggml_tensor * embeddings = build_vit(
1106-
inp, n_patches,
1114+
inp, n_pos,
11071115
NORM_TYPE_NORMAL,
11081116
hparams.ffn_op,
11091117
learned_pos_embd,
@@ -1115,17 +1123,39 @@ struct clip_graph {
11151123
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
11161124

11171125
// norm
1118-
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
1126+
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
11191127
v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
11201128

1129+
// calculate sinusoidal pos embd
1130+
ggml_tensor * pos_embed = nullptr;
1131+
{
1132+
// outer product
1133+
ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
1134+
ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
1135+
ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
1136+
// sin and cos
1137+
ggml_tensor * pos_embd_x = ggml_concat(
1138+
ctx0,
1139+
ggml_sin(ctx0, theta_x),
1140+
ggml_cos(ctx0, theta_x),
1141+
0 // concat on first dim
1142+
);
1143+
ggml_tensor * pos_embd_y = ggml_concat(
1144+
ctx0,
1145+
ggml_sin(ctx0, theta_y),
1146+
ggml_cos(ctx0, theta_y),
1147+
0 // concat on first dim
1148+
);
1149+
pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
1150+
}
1151+
11211152
// k = v + pos_embed
11221153
ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
11231154

11241155
// attention
11251156
{
1126-
int n_embd = clip_n_mmproj_embd(ctx);
11271157
const int d_head = 128;
1128-
int n_head = n_embd/d_head;
1158+
int n_head = n_embd_proj/d_head;
11291159
// Use actual config value if available, otherwise fall back to hardcoded values
11301160
int num_query = ctx->model.hparams.minicpmv_query_num;
11311161
ggml_tensor * Q = ggml_add(ctx0,
@@ -4564,92 +4594,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
45644594
return n_patches;
45654595
}
45664596

4567-
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
4568-
assert(embed_dim % 2 == 0);
4569-
int H = pos.size();
4570-
int W = pos[0].size();
4571-
4572-
std::vector<float> omega(embed_dim / 2);
4573-
for (int i = 0; i < embed_dim / 2; ++i) {
4574-
omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
4575-
}
4576-
4577-
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
4578-
for (int h = 0; h < H; ++h) {
4579-
for (int w = 0; w < W; ++w) {
4580-
for (int d = 0; d < embed_dim / 2; ++d) {
4581-
float out_value = pos[h][w] * omega[d];
4582-
emb[h][w][d] = sin(out_value);
4583-
emb[h][w][d + embed_dim / 2] = cos(out_value);
4584-
}
4585-
}
4586-
}
4587-
4588-
return emb;
4589-
}
4590-
4591-
static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
4592-
assert(embed_dim % 2 == 0);
4593-
std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
4594-
std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
4595-
4596-
int H = emb_h.size();
4597-
int W = emb_h[0].size();
4598-
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
4599-
4600-
for (int h = 0; h < H; ++h) {
4601-
for (int w = 0; w < W; ++w) {
4602-
for (int d = 0; d < embed_dim / 2; ++d) {
4603-
emb[h][w][d] = emb_h[h][w][d];
4604-
emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
4605-
}
4606-
}
4607-
}
4608-
return emb;
4609-
}
4610-
4611-
static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
4612-
int grid_h_size = image_size.first;
4613-
int grid_w_size = image_size.second;
4614-
4615-
std::vector<float> grid_h(grid_h_size);
4616-
std::vector<float> grid_w(grid_w_size);
4617-
4618-
for (int i = 0; i < grid_h_size; ++i) {
4619-
grid_h[i] = static_cast<float>(i);
4620-
}
4621-
for (int i = 0; i < grid_w_size; ++i) {
4622-
grid_w[i] = static_cast<float>(i);
4623-
}
4624-
4625-
std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
4626-
for (int h = 0; h < grid_h_size; ++h) {
4627-
for (int w = 0; w < grid_w_size; ++w) {
4628-
grid[h][w] = grid_w[w];
4629-
}
4630-
}
4631-
std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
4632-
for (int h = 0; h < grid_h_size; ++h) {
4633-
for (int w = 0; w < grid_w_size; ++w) {
4634-
grid_2d[0][h][w] = grid_h[h];
4635-
grid_2d[1][h][w] = grid_w[w];
4636-
}
4637-
}
4638-
4639-
std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
4640-
4641-
int H = image_size.first;
4642-
int W = image_size.second;
4643-
std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
4644-
for (int h = 0; h < H; ++h) {
4645-
for (int w = 0; w < W; ++w) {
4646-
pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
4647-
}
4648-
}
4649-
4650-
return pos_embed_2d;
4651-
}
4652-
46534597
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
46544598
clip_image_f32_batch imgs;
46554599
clip_image_f32_ptr img_copy(clip_image_f32_init());
@@ -4788,22 +4732,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
47884732
}
47894733
set_input_i32("positions", positions);
47904734

4791-
// inspired from resampler of Qwen-VL:
4792-
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
4793-
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
4794-
int embed_dim = clip_n_mmproj_embd(ctx);
4795-
4796-
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
4797-
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
4798-
4799-
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
4800-
for(int i = 0; i < pos_w * pos_h; ++i){
4801-
for(int j = 0; j < embed_dim; ++j){
4802-
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
4803-
}
4735+
// inputs for resampler projector
4736+
// set the 2D positions (using float for sinusoidal embedding)
4737+
int n_patches_per_col = image_size_width / patch_size;
4738+
std::vector<float> pos_data(n_pos);
4739+
// dimension H
4740+
for (int i = 0; i < n_pos; i++) {
4741+
pos_data[i] = static_cast<float>(i / n_patches_per_col);
48044742
}
4805-
4806-
set_input_f32("pos_embed", pos_embed);
4743+
set_input_f32("pos_h", pos_data);
4744+
// dimension W
4745+
for (int i = 0; i < n_pos; i++) {
4746+
pos_data[i] = static_cast<float>(i % n_patches_per_col);
4747+
}
4748+
set_input_f32("pos_w", pos_data);
4749+
// base frequency omega
4750+
const float base_freq = 10000.0f;
4751+
const int n_embd_proj = clip_n_mmproj_embd(ctx);
4752+
std::vector<float> omega(n_embd_proj / 4);
4753+
for (int i = 0; i < n_embd_proj / 4; ++i) {
4754+
omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
4755+
}
4756+
set_input_f32("omega", omega);
48074757
} break;
48084758
case PROJECTOR_TYPE_QWEN2VL:
48094759
case PROJECTOR_TYPE_QWEN3VL:

0 commit comments

Comments (0)