@@ -2791,14 +2791,8 @@ struct clip_model_loader {
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                     // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
-                    // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
-                    // but we set a lower value to avoid OOM
-                    // TODO: make it configurable by user
-                    // TODO (2): bbox coordinates become inaccurate with small number of tokens,
-                    //           therefore we need to increase the min_tokens
-                    //           see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
-                    hparams.set_limit_image_tokens(8, 2048);
-                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                    hparams.set_limit_image_tokens(8, 4096);
+                    hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                     const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                     if (hparams.image_min_pixels < warn_min_pixels) {
                         LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
@@ -4814,7 +4808,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                const int merge_ratio = 2;
+                const int merge_ratio = hparams.n_merge;
                 const int pw = image_size_width / patch_size;
                 const int ph = image_size_height / patch_size;
                 std::vector<int> positions(n_pos * 4);
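
For context, a minimal standalone sketch (not part of the patch) of the arithmetic behind the new limits: it reproduces the 12845056/14/14/2/2/4 = 4096 calculation from the removed comment and the 46*46 warmup grid used in set_warmup_n_tokens. The constant names below are local to the sketch, not identifiers from clip.cpp, and the trailing /4 factor is taken verbatim from the original comment.

    // illustrative only: recompute the token limits set by this commit
    #include <cstdio>

    int main() {
        const int max_pixels = 12845056; // Qwen2.5-VL preprocessor_config.json max_pixels
        const int patch      = 14;       // ViT patch size
        const int merge      = 2;        // 2x2 spatial merge -> 1 token
        // 12845056 / 14 / 14 / 2 / 2 / 4 = 4096, the value now passed to set_limit_image_tokens(8, 4096)
        const int max_tokens = max_pixels / patch / patch / merge / merge / 4;
        // warmup now allocates for a 46x46 token grid = 2116 tokens (set_warmup_n_tokens(46*46))
        const int warmup_tokens = 46 * 46;
        std::printf("max_tokens = %d, warmup_tokens = %d\n", max_tokens, warmup_tokens);
        return 0;
    }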