@@ -2038,13 +2038,13 @@ struct clip_model_loader {
20382038 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor );
20392039
20402040 // borrowed from llava-1.6
2041- const int psize = hparams.patch_size ;
2041+ const int isize = hparams.image_size ; // grid pinpoints are multiples of the image size (336 in the table below), not the patch size
20422042 hparams.image_grid_pinpoints = {
2043- psize , psize *2 , // 336, 672
2044- psize *2 , psize , // 672, 336
2045- psize *2 , psize *2 , // 672, 672
2046- psize *3 , psize , // 1008, 336
2047- psize , psize *3 , // 336, 1008
2043+ isize , isize *2 , // 336, 672
2044+ isize *2 , isize , // 672, 336
2045+ isize *2 , isize *2 , // 672, 672
2046+ isize *3 , isize , // 1008, 336
2047+ isize , isize *3 , // 336, 1008
20482048 };
20492049 } break ;
20502050 default :
@@ -2968,7 +2968,7 @@ struct llava_uhd {
29682968
29692969 // used by llava 1.6 with custom list of pinpoints
29702970 static clip_image_size select_best_resolution (const std::vector<int32_t > & pinpoints, const clip_image_size & original_size) {
2971- std::vector<clip_image_size> possible_resolutions;
2971+ std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
29722972 for (size_t i = 0 ; i < pinpoints.size (); i += 2 ) {
29732973 possible_resolutions.push_back (clip_image_size{pinpoints[i], pinpoints[i+1 ]});
29742974 }
@@ -3077,7 +3077,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30773077 else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
30783078 || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
30793079 || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3
3080- || ctx->proj_type == PROJECTOR_TYPE_LLAMA4
30813080 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
30823081 ) {
30833082 clip_image_u8 resized_image;
0 commit comments