@@ -554,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
554554}
555555
556556// implementation of the 2D RoPE without adding a new op in ggml
557+ // this is not efficient (use double the memory), but works on all backends
558+ // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
557559static ggml_tensor * build_rope_2d (
558- ggml_cgraph * gf,
559560 ggml_context * ctx0,
560561 ggml_tensor * cur,
561562 ggml_tensor * pos_h,
562563 ggml_tensor * pos_w,
563564 const float freq_base
564565) {
565- ggml_tensor * tmp;
566566 const int64_t n_dim = cur->ne [0 ];
567567 const int64_t n_head = cur->ne [1 ];
568568 const int64_t n_pos = cur->ne [2 ];
@@ -571,18 +571,23 @@ static ggml_tensor * build_rope_2d(
571571 // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
572572 // first half of cur will use 1e-0, 1e-2 (even)
573573 // second half of cur will use 1e-1, 1e-3 (odd)
574- //
575- // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even
574+ // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
576575 // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
577576 // then for the second half, we use freq_scale to shift the inv_freq
578577 // ^ why? replace (2i) with (2i+1) in the above equation
579578 const float freq_scale_odd = std::pow (freq_base, (float )-2 /n_dim);
580579
581580 // first half
581+ ggml_tensor * first;
582582 {
583- cur = ggml_rope_ext_inplace (
583+ first = ggml_view_3d (ctx0, cur,
584+ n_dim/2 , n_head, n_pos,
585+ ggml_row_size (cur->type , n_dim),
586+ ggml_row_size (cur->type , n_dim*n_head),
587+ 0 );
588+ first = ggml_rope_ext (
584589 ctx0,
585- cur ,
590+ first ,
586591 pos_h, // positions
587592 nullptr , // freq factors
588593 n_dim/2 , // n_dims
@@ -592,26 +597,27 @@ static ggml_tensor * build_rope_2d(
592597 }
593598
594599 // second half
600+ ggml_tensor * second;
595601 {
596- tmp = ggml_view_3d (ctx0, cur,
602+ second = ggml_view_3d (ctx0, cur,
597603 n_dim/2 , n_head, n_pos,
598604 ggml_row_size (cur->type , n_dim),
599605 ggml_row_size (cur->type , n_dim*n_head),
600606 n_dim/2 * ggml_element_size (cur));
601- tmp = ggml_rope_ext_inplace (
607+ second = ggml_cont (ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
608+ second = ggml_rope_ext (
602609 ctx0,
603- tmp ,
610+ second ,
604611 pos_w, // positions
605612 nullptr , // freq factors
606613 n_dim/2 , // n_dims
607614 0 , 0 , freq_base,
608615 freq_scale_odd,
609616 0 .0f , 1 .0f , 0 .0f , 0 .0f
610617 );
611- // calculate inplace (modify cur directly)
612- ggml_build_forward_expand (gf, tmp);
613618 }
614619
620+ cur = ggml_concat (ctx0, first, second, 0 );
615621 return cur;
616622}
617623
@@ -680,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
680686 struct ggml_tensor * Q = ggml_mul_mat (ctx0, model.layers [il].q_w , cur);
681687
682688 Q = ggml_reshape_3d (ctx0, Q, d_head, n_head, num_patches);
683- Q = build_rope_2d (gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta );
689+ Q = build_rope_2d (ctx0, Q, pos_h, pos_w, hparams.rope_theta );
684690 Q = ggml_cont (ctx0, ggml_permute (ctx0, Q, 0 , 2 , 1 , 3 ));
685691
686692 struct ggml_tensor * K = ggml_mul_mat (ctx0, model.layers [il].k_w , cur);
687693
688694 K = ggml_reshape_3d (ctx0, K, d_head, n_head, num_patches);
689- K = build_rope_2d (gf, ctx0, K, pos_h, pos_w, hparams.rope_theta );
695+ K = build_rope_2d (ctx0, K, pos_h, pos_w, hparams.rope_theta );
690696 K = ggml_cont (ctx0, ggml_permute (ctx0, K, 0 , 2 , 1 , 3 ));
691697
692698 struct ggml_tensor * V = ggml_mul_mat (ctx0, model.layers [il].v_w , cur);
@@ -2796,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
27962802 const auto & model = ctx->vision_model ;
27972803 const auto & hparams = model.hparams ;
27982804
2805+ // TODO @ngxson : this is ugly, need to refactor later
2806+ bool support_dynamic_size = ctx->has_minicpmv_projector
2807+ || ctx->has_qwen2vl_merger
2808+ || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
2809+
27992810 const int image_size = hparams.image_size ;
28002811 int image_size_width = image_size;
28012812 int image_size_height = image_size;
2802- if (ctx-> has_minicpmv_projector | ctx-> has_qwen2vl_merger ) {
2813+ if (support_dynamic_size ) {
28032814 image_size_width = imgs.entries [0 ]->nx ;
28042815 image_size_height = imgs.entries [0 ]->ny ;
28052816 }
@@ -2811,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
28112822
28122823 {
28132824 struct ggml_tensor * inp_raw = ggml_graph_get_tensor (gf, " inp_raw" );
2814- float * data = (float *)malloc (ggml_nbytes (inp_raw));
2825+ std::vector<float > inp_data (ggml_nelements (inp_raw));
2826+ float * data = inp_data.data ();
2827+
2828+ // layout of data (note: the channel dim is unrolled to better visualize the layout):
2829+ //
2830+ // ┌──W──┐
2831+ // │ H │ channel = R
2832+ // ├─────┤ │
2833+ // │ H │ channel = G
2834+ // ├─────┤ │
2835+ // │ H │ channel = B
2836+ // └─────┘ │
2837+ // ──────┘ x B
28152838
2816- // TODO @ngxson : this whole code block is ugly, will need to be refactored
28172839 for (size_t i = 0 ; i < imgs.entries .size (); i++) {
28182840 const int nx = imgs.entries [i]->nx ;
28192841 const int ny = imgs.entries [i]->ny ;
@@ -2828,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
28282850 const int n = nx * ny;
28292851
28302852 for (int b = 0 ; b < batch_size; b++) {
2831- for (int k = 0 ; k < 3 ; k++) {
2832- for (int y = 0 ; y < ny; y++) {
2833- for (int x = 0 ; x < nx; x++) {
2834- data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries [b]->buf [3 * (y * nx + x) + k];
2835- }
2853+ float * batch_entry = data + b * (3 *n);
2854+ for (int y = 0 ; y < ny; y++) {
2855+ for (int x = 0 ; x < nx; x++) {
2856+ size_t base_src = 3 *(y * nx + x); // idx of the first channel
2857+ size_t base_dst = y * nx + x; // idx of the first channel
2858+ batch_entry[ base_dst] = imgs.entries [b]->buf [base_src ];
2859+ batch_entry[1 *n + base_dst] = imgs.entries [b]->buf [base_src + 1 ];
2860+ batch_entry[2 *n + base_dst] = imgs.entries [b]->buf [base_src + 2 ];
28362861 }
28372862 }
28382863 }
28392864 }
28402865 ggml_backend_tensor_set (inp_raw, data, 0 , ggml_nbytes (inp_raw));
2841- free (data);
28422866 }
28432867 if (ctx->has_minicpmv_projector ) {
28442868 {
0 commit comments