@@ -3026,7 +3026,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
30263026 const int patch_size = hparams.patch_size ;
30273027 const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
30283028 const int num_positions = num_patches + (model.class_embedding ? 1 : 0 );
3029- const int pos_w = ctx->load_image_size .width / patch_size;
3029+ const int pos_w = ctx->load_image_size .width / patch_size;
30303030 const int pos_h = ctx->load_image_size .height / patch_size;
30313031
30323032 const bool use_window_attn = hparams.n_wa_pattern > 0 ; // for qwen2.5vl
@@ -3138,13 +3138,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31383138 } break ;
31393139 case PROJECTOR_TYPE_QWEN2VL:
31403140 {
3141+ const int merge_ratio = 2 ;
31413142 const int pw = image_size_width / patch_size;
31423143 const int ph = image_size_height / patch_size;
31433144 std::vector<int > positions (num_positions * 4 );
3144-
31453145 int ptr = 0 ;
3146- for (int y = 0 ; y < ph; y += 2 ) {
3147- for (int x = 0 ; x < pw; x += 2 ) {
3146+ for (int y = 0 ; y < ph; y += merge_ratio ) {
3147+ for (int x = 0 ; x < pw; x += merge_ratio ) {
31483148 for (int dy = 0 ; dy < 2 ; dy++) {
31493149 for (int dx = 0 ; dx < 2 ; dx++) {
31503150 positions[ ptr] = y + dy;
@@ -3180,10 +3180,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31803180 std::vector<float > mask (pow (ipw * iph, 2 ), std::numeric_limits<float >::lowest ());
31813181 int mask_row = 0 ;
31823182
3183- for (int y = 0 ; y < ph; y += grid_window)
3184- {
3185- for (int x = 0 ; x < pw; x += grid_window)
3186- {
3183+ for (int y = 0 ; y < ph; y += grid_window) {
3184+ for (int x = 0 ; x < pw; x += grid_window) {
31873185 const int win_h = std::min (grid_window, ph - y);
31883186 const int win_w = std::min (grid_window, pw - x);
31893187 const int dst_0 = dst;
0 commit comments