@@ -1720,8 +1720,7 @@ struct clip_model_loader {
17201720 get_u32 (KEY_IMAGE_SIZE, hparams.image_size );
17211721 get_u32 (KEY_PATCH_SIZE, hparams.patch_size );
17221722 get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
1723- get_u32 (KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size , false );
1724- get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern , false );
1723+ get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern , ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2_5_VL);
17251724 get_arr_int (KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints , false );
17261725
17271726 {
@@ -3210,12 +3209,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
32103209 std::vector<int > idx (ph * pw);
32113210 std::vector<int > inv_idx (ph * pw);
32123211
3213- if (hparams.attn_window_size > 0 ) {
3212+ if (use_window_attn) {
3213+ const int attn_window_size = 112 ;
32143214 struct ggml_tensor * window_idx = ggml_graph_get_tensor (gf, " window_idx" );
32153215 struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor (gf, " inv_window_idx" );
32163216 struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
32173217
3218- const int grid_window = hparams. attn_window_size / patch_size / merge_ratio;
3218+ const int grid_window = attn_window_size / patch_size / merge_ratio;
32193219 int dst = 0 ;
32203220 // [num_vision_tokens, num_vision_tokens] attention mask tensor
32213221 std::vector<float > mask (pow (ipw * iph, 2 ), std::numeric_limits<float >::lowest ());
@@ -3342,9 +3342,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33423342 struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
33433343
33443344 const int merge_ratio = 2 ;
3345+ const int attn_window_size = 112 ;
33453346 const int pw = image_size_width / patch_size / merge_ratio;
33463347 const int ph = image_size_height / patch_size / merge_ratio;
3347- const int grid_window = hparams. attn_window_size / patch_size / merge_ratio;
3348+ const int grid_window = attn_window_size / patch_size / merge_ratio;
33483349 const int ipw = image_size_width / patch_size;
33493350 const int iph = image_size_height / patch_size;
33503351 /*
0 commit comments