@@ -659,19 +659,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
659659 if (ctx->has_qwen2vl_merger ) {
660660 GGML_ASSERT (image_size_width % (patch_size * 2 ) == 0 );
661661 GGML_ASSERT (image_size_height % (patch_size * 2 ) == 0 );
662-
662+
663663 auto inp_1 = ggml_conv_2d (ctx0, model.patch_embeddings_1 , inp_raw, patch_size, patch_size, 0 , 0 , 1 , 1 );
664664 inp = ggml_add (ctx0, inp, inp_1);
665665 inp = ggml_cont (ctx0, ggml_permute (ctx0, inp, 1 , 2 , 0 , 3 )); // [w, h, c, b] -> [c, w, h, b]
666666 inp = ggml_reshape_4d (
667- ctx0, inp,
667+ ctx0, inp,
668668 hidden_size * 2 , patches_w / 2 , patches_h, batch_size);
669669 inp = ggml_reshape_4d (
670- ctx0, inp,
670+ ctx0, inp,
671671 hidden_size * 2 , patches_w / 2 , 2 , batch_size * (patches_h / 2 ));
672672 inp = ggml_cont (ctx0, ggml_permute (ctx0, inp, 0 , 2 , 1 , 3 ));
673673 inp = ggml_reshape_3d (
674- ctx0, inp,
674+ ctx0, inp,
675675 hidden_size, patches_w * patches_h, batch_size);
676676 }
677677 else {
@@ -756,7 +756,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
756756 Q = ggml_reshape_4d (ctx0, Q, d_head, n_head, num_positions, batch_size);
757757 if (ctx->has_qwen2vl_merger ) {
758758 Q = ggml_rope_multi (
759- ctx0, Q, positions, nullptr ,
759+ ctx0, Q, positions, nullptr ,
760760 d_head/2 , mrope_sections, GGML_ROPE_TYPE_VISION, 32768 , 10000 , 1 , 0 , 1 , 32 , 1 );
761761 }
762762 Q = ggml_scale_inplace (ctx0, Q, 1 .0f / sqrt ((float )d_head));
@@ -769,7 +769,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
769769 K = ggml_reshape_4d (ctx0, K, d_head, n_head, num_positions, batch_size);
770770 if (ctx->has_qwen2vl_merger ) {
771771 K = ggml_rope_multi (
772- ctx0, K, positions, nullptr ,
772+ ctx0, K, positions, nullptr ,
773773 d_head/2 , mrope_sections, GGML_ROPE_TYPE_VISION, 32768 , 10000 , 1 , 0 , 1 , 32 , 1 );
774774 }
775775 K = ggml_cont (ctx0, ggml_permute (ctx0, K, 0 , 2 , 1 , 3 ));
@@ -1286,7 +1286,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
12861286
12871287 idx = get_key_idx (ctx, KEY_USE_GELU);
12881288 new_clip->use_gelu = gguf_get_val_bool (ctx, idx);
1289-
1289+
12901290 try {
12911291 idx = get_key_idx (ctx, KEY_USE_SILU);
12921292 new_clip->use_silu = gguf_get_val_bool (ctx, idx);
@@ -2079,14 +2079,14 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
20792079 }
20802080 }
20812081 return true ;
2082- }
2082+ }
20832083 else if (ctx->has_qwen2vl_merger ) {
20842084 clip_image_u8 * resized = clip_image_u8_init ();
20852085 auto patch_size = clip_patch_size (ctx) * 2 ;
20862086 int nx = ceil ((float )img->nx / patch_size) * patch_size;
20872087 int ny = ceil ((float )img->ny / patch_size) * patch_size;
20882088 bicubic_resize (*img, *resized, nx, ny);
2089-
2089+
20902090 res_imgs->data = new clip_image_f32[1 ];
20912091 // clip_image_f32 * res = clip_image_f32_init();
20922092 normalize_image_u8_to_f32 (resized, res_imgs->data , ctx->image_mean , ctx->image_std );
@@ -2573,7 +2573,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
25732573 const int pw = image_size_width / patch_size;
25742574 const int ph = image_size_height / patch_size;
25752575 int * positions_data = (int *)malloc (ggml_nbytes (positions));
2576-
2576+
25772577 int ptr = 0 ;
25782578 for (int y = 0 ; y < ph; y+=2 )
25792579 {
@@ -2590,7 +2590,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
25902590 }
25912591 }
25922592 }
2593-
2593+
25942594 ggml_backend_tensor_set (positions, positions_data, 0 , ggml_nbytes (positions));
25952595 free (positions_data);
25962596 }
0 commit comments