Commit ba3f546

fix qwen2vl
1 parent bfd5794 commit ba3f546

File tree

1 file changed: +93 −85 lines changed


tools/mtmd/clip.cpp

Lines changed: 93 additions & 85 deletions
@@ -416,13 +416,6 @@ struct clip_graph {
         gf = ggml_new_graph(ctx0);
     }
 
-    ggml_tensor * build_inp_raw() {
-        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
-        ggml_set_name(inp_raw, "inp_raw");
-        ggml_set_input(inp_raw);
-        return inp_raw;
-    }
-
     ggml_cgraph * build_siglip() {
         ggml_tensor * inp_raw = build_inp_raw();
         ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
@@ -571,74 +564,6 @@ struct clip_graph {
         return gf;
     }
 
-    // implementation of the 2D RoPE without adding a new op in ggml
-    // this is not efficient (use double the memory), but works on all backends
-    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
-    static ggml_tensor * build_rope_2d(
-        ggml_context * ctx0,
-        ggml_tensor * cur,
-        ggml_tensor * pos_h,
-        ggml_tensor * pos_w,
-        const float freq_base
-    ) {
-        const int64_t n_dim = cur->ne[0];
-        const int64_t n_head = cur->ne[1];
-        const int64_t n_pos = cur->ne[2];
-
-        // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
-        // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
-        // first half of cur will use 1e-0, 1e-2 (even)
-        // second half of cur will use 1e-1, 1e-3 (odd)
-        // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
-        // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
-        // then for the second half, we use freq_scale to shift the inv_freq
-        // ^ why? replace (2i) with (2i+1) in the above equation
-        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
-
-        // first half
-        ggml_tensor * first;
-        {
-            first = ggml_view_3d(ctx0, cur,
-                n_dim/2, n_head, n_pos,
-                ggml_row_size(cur->type, n_dim),
-                ggml_row_size(cur->type, n_dim*n_head),
-                0);
-            first = ggml_rope_ext(
-                ctx0,
-                first,
-                pos_h, // positions
-                nullptr, // freq factors
-                n_dim/2, // n_dims
-                0, 0, freq_base,
-                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
-            );
-        }
-
-        // second half
-        ggml_tensor * second;
-        {
-            second = ggml_view_3d(ctx0, cur,
-                n_dim/2, n_head, n_pos,
-                ggml_row_size(cur->type, n_dim),
-                ggml_row_size(cur->type, n_dim*n_head),
-                n_dim/2 * ggml_element_size(cur));
-            second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
-            second = ggml_rope_ext(
-                ctx0,
-                second,
-                pos_w, // positions
-                nullptr, // freq factors
-                n_dim/2, // n_dims
-                0, 0, freq_base,
-                freq_scale_odd,
-                0.0f, 1.0f, 0.0f, 0.0f
-            );
-        }
-
-        cur = ggml_concat(ctx0, first, second, 0);
-        return cur;
-    }
-
     ggml_cgraph * build_pixtral() {
         const int n_merge = hparams.spatial_merge_size;
 
@@ -799,7 +724,7 @@ struct clip_graph {
         const int batch_size = 1;
         const bool use_window_attn = hparams.n_wa_pattern > 0;
         const int n_wa_pattern = hparams.n_wa_pattern;
-        const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
+        const int n_pos = n_patches;
         const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
 
         int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
@@ -873,6 +798,9 @@ struct clip_graph {
             // rmsnorm1
             cur = ggml_rms_norm(ctx0, cur, eps);
             cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
+            if (model.layers[il].ln_1_b) {
+                cur = ggml_add(ctx0, cur, model.layers[il].ln_1_b);
+            }
 
             // self-attention
             {
@@ -930,21 +858,16 @@ struct clip_graph {
             // rms norm2
             cur = ggml_rms_norm(ctx0, cur, eps);
             cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
+            if (model.layers[il].ln_2_b) {
+                cur = ggml_add(ctx0, cur, model.layers[il].ln_2_b);
+            }
 
             // mlp
 
             if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
                 cur = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
                 cur = ggml_add(ctx0, cur, model.layers[il].ff_up_b);
-
-                if (ctx->use_gelu) {
-                    cur = ggml_gelu_inplace(ctx0, cur);
-                } else if (ctx->use_silu) {
-                    cur = ggml_silu_inplace(ctx0, cur);
-                } else {
-                    cur = ggml_gelu_quick_inplace(ctx0, cur);
-                }
-
+                cur = ggml_gelu_quick_inplace(ctx0, cur);
                 cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
                 cur = ggml_add(ctx0, cur, model.layers[il].ff_down_b);
 
@@ -983,6 +906,10 @@ struct clip_graph {
             ggml_set_name(embeddings, "post_ln");
 
             embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
+
+            if (model.post_ln_b) {
+                embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
+            }
         }
 
         embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
@@ -1458,6 +1385,87 @@ struct clip_graph {
 
         return gf;
     }
+
+private:
+    //
+    // utility functions
+    //
+
+    ggml_tensor * build_inp_raw() {
+        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
+        ggml_set_name(inp_raw, "inp_raw");
+        ggml_set_input(inp_raw);
+        return inp_raw;
+    }
+
+    // implementation of the 2D RoPE without adding a new op in ggml
+    // this is not efficient (use double the memory), but works on all backends
+    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    static ggml_tensor * build_rope_2d(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * pos_h,
+        ggml_tensor * pos_w,
+        const float freq_base
+    ) {
+        const int64_t n_dim = cur->ne[0];
+        const int64_t n_head = cur->ne[1];
+        const int64_t n_pos = cur->ne[2];
+
+        // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
+        // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
+        // first half of cur will use 1e-0, 1e-2 (even)
+        // second half of cur will use 1e-1, 1e-3 (odd)
+        // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
+        // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
+        // then for the second half, we use freq_scale to shift the inv_freq
+        // ^ why? replace (2i) with (2i+1) in the above equation
+        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
+
+        // first half
+        ggml_tensor * first;
+        {
+            first = ggml_view_3d(ctx0, cur,
+                n_dim/2, n_head, n_pos,
+                ggml_row_size(cur->type, n_dim),
+                ggml_row_size(cur->type, n_dim*n_head),
+                0);
+            first = ggml_rope_ext(
+                ctx0,
+                first,
+                pos_h, // positions
+                nullptr, // freq factors
+                n_dim/2, // n_dims
+                0, 0, freq_base,
+                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
+            );
+        }
+
+        // second half
+        ggml_tensor * second;
+        {
+            second = ggml_view_3d(ctx0, cur,
+                n_dim/2, n_head, n_pos,
+                ggml_row_size(cur->type, n_dim),
+                ggml_row_size(cur->type, n_dim*n_head),
+                n_dim/2 * ggml_element_size(cur));
+            second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
+            second = ggml_rope_ext(
+                ctx0,
+                second,
+                pos_w, // positions
+                nullptr, // freq factors
+                n_dim/2, // n_dims
+                0, 0, freq_base,
+                freq_scale_odd,
+                0.0f, 1.0f, 0.0f, 0.0f
+            );
+        }
+
+        cur = ggml_concat(ctx0, first, second, 0);
+        return cur;
+    }
+
 };
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
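
For reference, a minimal sketch of the arithmetic behind the comments in build_rope_2d, assuming the standard RoPE inverse-frequency definition and that ggml's freq_scale multiplies into the rotation angle (the symbols theta_i and j below are only introduced here, they do not appear in the code). The frequency of pair index i is

    \theta_i = \mathrm{freq\_base}^{-2i/n\_dim}, \qquad i = 0, \dots, n\_dim/2 - 1

so with freq_base = 1e4 and n_dim = 8 the four inv_freq values are 1e-0, 1e-1, 1e-2, 1e-3, exactly the list in the comment. Applying an ordinary RoPE to a view of only n_dim/2 dimensions gives pair index j the frequency

    \mathrm{freq\_base}^{-2j/(n\_dim/2)} = \mathrm{freq\_base}^{-2(2j)/n\_dim} = \theta_{2j}

i.e. only the even-indexed frequencies, which is the comment's identity -2(2i) / n_dim == -2i / (n_dim/2). Scaling every angle by freq_scale_odd = freq_base^{-2/n_dim} then shifts these to the odd-indexed frequencies:

    \theta_{2j} \cdot \mathrm{freq\_base}^{-2/n\_dim} = \mathrm{freq\_base}^{-2(2j+1)/n\_dim} = \theta_{2j+1}

which is why the first half of the view is roped with pos_h at the default scale and the second half with pos_w and freq_scale_odd before the two halves are concatenated back together.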
