@@ -244,8 +244,6 @@ struct clip_vision_model {
244244 // GLMV-Edge projection
245245 struct ggml_tensor * mm_model_adapter_conv_w = nullptr ;
246246 struct ggml_tensor * mm_model_adapter_conv_b = nullptr ;
247- struct ggml_tensor * boi_w = nullptr ;
248- struct ggml_tensor * eoi_w = nullptr ;
249247
250248 // MobileVLM projection
251249 struct ggml_tensor * mm_model_mlp_1_w = nullptr ;
@@ -556,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
556554}
557555
558556// implementation of the 2D RoPE without adding a new op in ggml
557+ // this is not efficient (it uses double the memory), but it works on all backends
558+ // TODO: there was a more efficient implementation which relies on ggml_view and ggml_rope_ext_inplace, but the in-place rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
559559static ggml_tensor * build_rope_2d (
560- ggml_cgraph * gf,
561560 ggml_context * ctx0,
562561 ggml_tensor * cur,
563562 ggml_tensor * pos_h,
564563 ggml_tensor * pos_w,
565564 const float freq_base
566565) {
567- ggml_tensor * tmp;
568566 const int64_t n_dim = cur->ne [0 ];
569567 const int64_t n_head = cur->ne [1 ];
570568 const int64_t n_pos = cur->ne [2 ];
@@ -573,18 +571,23 @@ static ggml_tensor * build_rope_2d(
573571 // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
574572 // first half of cur will use 1e-0, 1e-2 (even)
575573 // second half of cur will use 1e-1, 1e-3 (odd)
576- //
577- // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even
574+ // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
578575 // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
579576 // then for the second half, we use freq_scale to shift the inv_freq
580577 // ^ why? replace (2i) with (2i+1) in the above equation
581578 const float freq_scale_odd = std::pow (freq_base, (float )-2 /n_dim);
582579
583580 // first half
581+ ggml_tensor * first;
584582 {
585- cur = ggml_rope_ext_inplace (
583+ first = ggml_view_3d (ctx0, cur,
584+ n_dim/2 , n_head, n_pos,
585+ ggml_row_size (cur->type , n_dim),
586+ ggml_row_size (cur->type , n_dim*n_head),
587+ 0 );
588+ first = ggml_rope_ext (
586589 ctx0,
587- cur ,
590+ first ,
588591 pos_h, // positions
589592 nullptr , // freq factors
590593 n_dim/2 , // n_dims
@@ -594,26 +597,27 @@ static ggml_tensor * build_rope_2d(
594597 }
595598
596599 // second half
600+ ggml_tensor * second;
597601 {
598- tmp = ggml_view_3d (ctx0, cur,
602+ second = ggml_view_3d (ctx0, cur,
599603 n_dim/2 , n_head, n_pos,
600604 ggml_row_size (cur->type , n_dim),
601605 ggml_row_size (cur->type , n_dim*n_head),
602606 n_dim/2 * ggml_element_size (cur));
603- tmp = ggml_rope_ext_inplace (
607+ second = ggml_cont (ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
608+ second = ggml_rope_ext (
604609 ctx0,
605- tmp ,
610+ second ,
606611 pos_w, // positions
607612 nullptr , // freq factors
608613 n_dim/2 , // n_dims
609614 0 , 0 , freq_base,
610615 freq_scale_odd,
611616 0 .0f , 1 .0f , 0 .0f , 0 .0f
612617 );
613- // calculate inplace (modify cur directly)
614- ggml_build_forward_expand (gf, tmp);
615618 }
616619
620+ cur = ggml_concat (ctx0, first, second, 0 );
617621 return cur;
618622}
619623
@@ -682,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
682686 struct ggml_tensor * Q = ggml_mul_mat (ctx0, model.layers [il].q_w , cur);
683687
684688 Q = ggml_reshape_3d (ctx0, Q, d_head, n_head, num_patches);
685- Q = build_rope_2d (gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta );
689+ Q = build_rope_2d (ctx0, Q, pos_h, pos_w, hparams.rope_theta );
686690 Q = ggml_cont (ctx0, ggml_permute (ctx0, Q, 0 , 2 , 1 , 3 ));
687691
688692 struct ggml_tensor * K = ggml_mul_mat (ctx0, model.layers [il].k_w , cur);
689693
690694 K = ggml_reshape_3d (ctx0, K, d_head, n_head, num_patches);
691- K = build_rope_2d (gf, ctx0, K, pos_h, pos_w, hparams.rope_theta );
695+ K = build_rope_2d (ctx0, K, pos_h, pos_w, hparams.rope_theta );
692696 K = ggml_cont (ctx0, ggml_permute (ctx0, K, 0 , 2 , 1 , 3 ));
693697
694698 struct ggml_tensor * V = ggml_mul_mat (ctx0, model.layers [il].v_w , cur);
@@ -1697,8 +1701,6 @@ struct clip_model_loader {
16971701 vision_model.mm_model_mlp_1_w = get_tensor (string_format (TN_GLM_ADAPTER_D_H_2_4H," weight" ));
16981702 vision_model.mm_model_mlp_2_w = get_tensor (string_format (TN_GLM_ADAPTER_GATE," weight" ));
16991703 vision_model.mm_model_mlp_3_w = get_tensor (string_format (TN_GLM_ADAPTER_D_4H_2_H," weight" ));
1700- vision_model.boi_w = get_tensor (TN_GLM_BOI_W);
1701- vision_model.eoi_w = get_tensor (TN_GLM_EOI_W);
17021704 } break ;
17031705 case PROJECTOR_TYPE_MERGER:
17041706 {
@@ -2593,8 +2595,7 @@ void clip_free(clip_ctx * ctx) {
25932595}
25942596
25952597size_t clip_embd_nbytes (const struct clip_ctx * ctx) {
2596- int extra_tokens = ctx->has_glm_projector ? 2 : 0 ;
2597- return (clip_n_patches (ctx) + extra_tokens) * clip_n_mmproj_embd (ctx) * sizeof (float );
2598+ return clip_n_patches (ctx) * clip_n_mmproj_embd (ctx) * sizeof (float );
25982599}
25992600
26002601size_t clip_embd_nbytes_by_img (const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2790,9 +2791,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
27902791 }
27912792 if (ctx->has_glm_projector ) {
27922793 GGML_ASSERT (batch_size == 1 );
2793- ggml_tensor * boi = ctx->vision_model .boi_w ;
2794- ggml_backend_tensor_get (boi,vec,0 ,ggml_nbytes (boi));
2795- vec = (float *)(vec+ggml_nelements (boi)); // offset for boi
27962794 }
27972795
27982796 // build the inference graph
@@ -2804,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
28042802 const auto & model = ctx->vision_model ;
28052803 const auto & hparams = model.hparams ;
28062804
2805+ // TODO @ngxson : this is ugly, need to refactor later
2806+ bool support_dynamic_size = ctx->has_minicpmv_projector
2807+ || ctx->has_qwen2vl_merger
2808+ || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
2809+
28072810 const int image_size = hparams.image_size ;
28082811 int image_size_width = image_size;
28092812 int image_size_height = image_size;
2810- if (ctx-> has_minicpmv_projector | ctx-> has_qwen2vl_merger ) {
2813+ if (support_dynamic_size ) {
28112814 image_size_width = imgs.entries [0 ]->nx ;
28122815 image_size_height = imgs.entries [0 ]->ny ;
28132816 }
@@ -2819,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
28192822
28202823 {
28212824 struct ggml_tensor * inp_raw = ggml_graph_get_tensor (gf, " inp_raw" );
2822- float * data = (float *)malloc (ggml_nbytes (inp_raw));
2825+ std::vector<float > inp_data (ggml_nelements (inp_raw));
2826+ float * data = inp_data.data ();
2827+
2828+ // layout of data (note: the channel dim is unrolled to better visualize the layout):
2829+ //
2830+ // ┌──W──┐
2831+ // │ H │ channel = R
2832+ // ├─────┤ │
2833+ // │ H │ channel = G
2834+ // ├─────┤ │
2835+ // │ H │ channel = B
2836+ // └─────┘ │
2837+ // ──────┘ x B
28232838
2824- // TODO @ngxson : this whole code block is ugly, will need to be refactored
28252839 for (size_t i = 0 ; i < imgs.entries .size (); i++) {
28262840 const int nx = imgs.entries [i]->nx ;
28272841 const int ny = imgs.entries [i]->ny ;
@@ -2836,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
28362850 const int n = nx * ny;
28372851
28382852 for (int b = 0 ; b < batch_size; b++) {
2839- for (int k = 0 ; k < 3 ; k++) {
2840- for (int y = 0 ; y < ny; y++) {
2841- for (int x = 0 ; x < nx; x++) {
2842- data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries [b]->buf [3 * (y * nx + x) + k];
2843- }
2853+ float * batch_entry = data + b * (3 *n);
2854+ for (int y = 0 ; y < ny; y++) {
2855+ for (int x = 0 ; x < nx; x++) {
2856+ size_t base_src = 3 *(y * nx + x); // idx of the first channel
2857+ size_t base_dst = y * nx + x; // idx of the first channel
2858+ batch_entry[ base_dst] = imgs.entries [b]->buf [base_src ];
2859+ batch_entry[1 *n + base_dst] = imgs.entries [b]->buf [base_src + 1 ];
2860+ batch_entry[2 *n + base_dst] = imgs.entries [b]->buf [base_src + 2 ];
28442861 }
28452862 }
28462863 }
28472864 }
28482865 ggml_backend_tensor_set (inp_raw, data, 0 , ggml_nbytes (inp_raw));
2849- free (data);
28502866 }
28512867 if (ctx->has_minicpmv_projector ) {
28522868 {
@@ -3001,13 +3017,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
30013017 // copy the embeddings to the location passed by the user
30023018 ggml_backend_tensor_get (embeddings, vec, 0 , ggml_nbytes (embeddings));
30033019
3004- if (ctx->has_glm_projector ) {
3005- // eoi
3006- ggml_tensor * eoi = ctx->vision_model .eoi_w ;
3007- int offset = ggml_nelements (embeddings);
3008- ggml_backend_tensor_get (eoi, vec+offset, 0 , ggml_nbytes (eoi));
3009- }
3010-
30113020 return true ;
30123021}
30133022
0 commit comments