@@ -171,7 +171,7 @@ struct clip_hparams {
     int32_t image_crop_resolution;
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size;
-    std::vector<int32_t> full_attn_layers;
+    int32_t n_wa_pattern;
 };

 struct clip_layer {
@@ -799,7 +799,8 @@ static ggml_cgraph * clip_image_build_graph_qwen2_5_vl(clip_ctx * ctx, const cli
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
     const float eps = hparams.eps;
-    const bool use_window_attn = hparams.full_attn_layers.size() > 0;
+    const int n_wa_pattern = hparams.n_wa_pattern;
+    const bool use_window_attn = hparams.n_wa_pattern > 0;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

     const int batch_size = imgs.entries.size();
@@ -926,8 +927,7 @@ static ggml_cgraph * clip_image_build_graph_qwen2_5_vl(clip_ctx * ctx, const cli
     V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);

     struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-    const bool inlist = std::find(hparams.full_attn_layers.begin(), hparams.full_attn_layers.end(), il) != hparams.full_attn_layers.end();
-    const bool full_attn = use_window_attn ? inlist : true;
+    const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
     if (full_attn) {
         KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
     } else {
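Note on the hunk above: the modulo test replaces the explicit lookup in hparams.full_attn_layers. A minimal standalone sketch, assuming Qwen2.5-VL's 32 vision layers and fullatt_block_indexes = {7, 15, 23, 31} from the model config (neither value appears in this patch), checking that the old list and n_wa_pattern = 8 select the same full-attention layers:

// Sketch only -- verifies that (il + 1) % n_wa_pattern == 0 reproduces the
// explicit index list the removed code searched with std::find.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<int32_t> full_attn_layers = {7, 15, 23, 31}; // old representation (assumed config values)
    const int n_wa_pattern = 8;  // new representation: one full-attention layer per 8
    const int n_layer      = 32; // assumed Qwen2.5-VL vision encoder depth

    for (int il = 0; il < n_layer; il++) {
        const bool inlist    = std::find(full_attn_layers.begin(), full_attn_layers.end(), il) != full_attn_layers.end();
        const bool full_attn = (il + 1) % n_wa_pattern == 0;
        if (inlist != full_attn) {
            printf("mismatch at layer %d\n", il);
            return 1;
        }
    }
    printf("old list and n_wa_pattern agree on all %d layers\n", n_layer);
    return 0;
}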
@@ -1721,8 +1721,8 @@ struct clip_model_loader {
     get_u32(KEY_PATCH_SIZE, hparams.patch_size);
     get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
     get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, false);
+    get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, false);
     get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
-    get_arr_int(KEY_FULLATTN_BLK_IDX, hparams.full_attn_layers, false);

     {
         std::string mm_patch_merge_type;
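The trailing false marks KEY_WIN_ATTN_PATTERN as optional. A hedged sketch of the contract assumed here (the real get_u32 reads GGUF metadata; the helper and map below are illustrative stand-ins): when an optional key is absent, the destination keeps its default, so n_wa_pattern stays 0 and window attention stays disabled for models that do not carry the key.

// Illustrative stand-in for the loader's assumed optional-key behavior.
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>

static void get_u32_sketch(const std::map<std::string, uint32_t> & metadata,
                           const std::string & key, int32_t & dst, bool required = true) {
    auto it = metadata.find(key);
    if (it == metadata.end()) {
        if (required) {
            throw std::runtime_error("key not found in model file: " + key);
        }
        return; // optional key absent: leave dst at its default
    }
    dst = (int32_t) it->second;
}

int main() {
    std::map<std::string, uint32_t> metadata; // model file without the window-attention key
    int32_t n_wa_pattern = 0;
    get_u32_sketch(metadata, "clip.vision.n_wa_pattern", n_wa_pattern, /*required=*/false); // key name assumed
    return n_wa_pattern; // still 0: use_window_attn will evaluate to false
}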
@@ -3074,6 +3074,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     bool support_dynamic_size = ctx->has_minicpmv_projector
         || ctx->has_qwen2vl_merger
         || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
+    const bool use_window_attn = hparams.n_wa_pattern > 0;

     const int image_size = hparams.image_size;
     int image_size_width = image_size;
@@ -3335,7 +3336,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }

-    if (hparams.attn_window_size > 0 && ctx->proj_type == PROJECTOR_TYPE_QWEN2_5_VL) {
+    if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN2_5_VL) {
         struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
         struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
         struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
@@ -3388,9 +3389,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             }
         }

-        if (window_idx) ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
-        if (inv_window_idx) ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
-        if (window_mask) ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
+        ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
+        ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
+        ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
     }

     ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
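The null checks on the three tensor uploads could be dropped because graph construction and this upload path are now gated by the same use_window_attn predicate: whenever this branch runs, clip_image_build_graph_qwen2_5_vl has created window_idx, inv_window_idx, and window_mask, so ggml_graph_get_tensor cannot return nullptr for them. A defensive guard, if one were still wanted, could use ggml's standard assertion macro: GGML_ASSERT(window_idx && inv_window_idx && window_mask);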