@@ -3426,24 +3426,18 @@ struct img_tool {
34263426 // RESIZE_ALGO_LANCZOS, // TODO
34273427 };
34283428
3429- enum resize_pad {
3430- RESIZE_PAD_NONE,
3431- RESIZE_PAD_AROUND,
3432- RESIZE_PAD_BOTTOM_RIGHT,
3433- };
3434-
34353429 static void resize (
34363430 const clip_image_u8 & src,
34373431 clip_image_u8 & dst,
34383432 const clip_image_size & target_resolution,
34393433 resize_algo algo,
3440- resize_pad pad_mode = RESIZE_PAD_AROUND,
3434+ bool add_padding = true , // TODO: define the behavior for add_padding = false
34413435 std::array<uint8_t , 3 > pad_color = {0 , 0 , 0 }) {
34423436 dst.nx = target_resolution.width ;
34433437 dst.ny = target_resolution.height ;
34443438 dst.buf .resize (3 * dst.nx * dst.ny );
34453439
3446- if (pad_mode == RESIZE_PAD_NONE ) {
3440+ if (!add_padding ) {
34473441 // direct resize
34483442 switch (algo) {
34493443 case RESIZE_ALGO_BILINEAR:
@@ -3478,15 +3472,8 @@ struct img_tool {
34783472 // fill dst with pad_color
34793473 fill (dst, pad_color);
34803474
3481- int offset_x = 0 ;
3482- int offset_y = 0 ;
3483- if (pad_mode == RESIZE_PAD_AROUND) {
3484- offset_x = (target_resolution.width - new_width) / 2 ;
3485- offset_y = (target_resolution.height - new_height) / 2 ;
3486- } else if (pad_mode == RESIZE_PAD_BOTTOM_RIGHT) {
3487- offset_x = target_resolution.width - new_width;
3488- offset_y = target_resolution.height - new_height;
3489- }
3475+ int offset_x = (target_resolution.width - new_width) / 2 ;
3476+ int offset_y = (target_resolution.height - new_height) / 2 ;
34903477
34913478 composite (dst, resized_image, offset_x, offset_y);
34923479 }
@@ -3523,8 +3510,9 @@ struct img_tool {
35233510 float target_width_f = static_cast <float >(inp_size.width ) * scale;
35243511 float target_height_f = static_cast <float >(inp_size.height ) * scale;
35253512
3526- int aligned_width = CLIP_ALIGN ((int )target_width_f, align_size);
3527- int aligned_height = CLIP_ALIGN ((int )target_height_f, align_size);
3513+ auto ceil_by_factor = [f = align_size](float x) { return static_cast <int >(std::ceil (x / static_cast <float >(f))) * f; };
3514+ int aligned_width = ceil_by_factor (target_width_f);
3515+ int aligned_height = ceil_by_factor (target_height_f);
35283516
35293517 return {aligned_width, aligned_height};
35303518 }
@@ -3852,7 +3840,7 @@ struct llava_uhd {
38523840 } else {
38533841 // only algo bicubic preserves the ratio; old models rely on this behavior
38543842 // TODO: do we need to support other algos here?
3855- img_tool::resize (*img, *refined_img, inst.refined_size , img_tool::RESIZE_ALGO_BICUBIC, img_tool::RESIZE_PAD_NONE );
3843+ img_tool::resize (*img, *refined_img, inst.refined_size , img_tool::RESIZE_ALGO_BICUBIC, false );
38563844 }
38573845
38583846 // create slices
@@ -4022,35 +4010,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40224010 case PROJECTOR_TYPE_QWEN3VL:
40234011 {
40244012 // step 1: make a blank canvas which aligns to the grid
4025- clip_image_u8 canvas ;
4026- const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio (
4013+ clip_image_u8 resized ;
4014+ const clip_image_size new_size = img_tool::calc_size_preserved_ratio (
40274015 original_size,
4028- params.patch_size * params. n_merge ,
4016+ params.patch_size * 2 ,
40294017 params.image_min_pixels ,
40304018 params.image_max_pixels );
4031- canvas.nx = canvas_size.width ;
4032- canvas.ny = canvas_size.height ;
4033- canvas.buf .resize (3 * canvas.nx * canvas.ny );
4034- img_tool::fill (canvas, {0 , 0 , 0 });
4035-
4036- // step 2: composite resized image onto the canvas, top-left corner
4037- if (original_size.height > canvas.ny || original_size.width > canvas.nx ) {
4038- // need to resize original image first
4039- clip_image_u8 resized;
4040- const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio (
4041- original_size,
4042- 1 , // no need to align here since we will composite onto canvas
4043- std::min (canvas.nx , canvas.ny )); // fit into the canvas
4044- img_tool::resize (*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
4045- img_tool::composite (canvas, resized, 0 , 0 );
4046- } else {
4047- // no resizing needed
4048- img_tool::composite (canvas, *img, 0 , 0 );
4049- }
4050-
4019+ img_tool::resize (*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false );
4020+ // clip_image_save_to_bmp(resized, "preproc.bmp");
40514021 clip_image_f32_ptr img_f32 (clip_image_f32_init ());
40524022 // clip_image_f32_ptr res(clip_image_f32_init());
4053- normalize_image_u8_to_f32 (canvas , *img_f32, params.image_mean , params.image_std );
4023+ normalize_image_u8_to_f32 (resized , *img_f32, params.image_mean , params.image_std );
40544024 // res_imgs->data[0] = *res;
40554025 res_imgs->entries .push_back (std::move (img_f32));
40564026 } break ;
@@ -4163,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41634133 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
41644134
41654135 clip_image_u8 resized_img;
4166- img_tool::resize (*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND , pad_color);
4136+ img_tool::resize (*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true , pad_color);
41674137 clip_image_f32_ptr res (clip_image_f32_init ());
41684138 normalize_image_u8_to_f32 (resized_img, *res, params.image_mean , params.image_std );
41694139 res_imgs->entries .push_back (std::move (res));
@@ -4195,7 +4165,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41954165 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
41964166
41974167 // resize the image to the target_size
4198- img_tool::resize (*img, *temp, clip_image_size{params.image_size , params.image_size }, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND , pad_color);
4168+ img_tool::resize (*img, *temp, clip_image_size{params.image_size , params.image_size }, img_tool::RESIZE_ALGO_BILINEAR, true , pad_color);
41994169
42004170 clip_image_f32_ptr res (clip_image_f32_init ());
42014171 normalize_image_u8_to_f32 (*temp, *res, params.image_mean , params.image_std );
@@ -4268,15 +4238,15 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
42684238 const auto & params = ctx->model .hparams ;
42694239 const int n_total = clip_n_output_tokens (ctx, img);
42704240 if (ctx->proj_type () == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type () == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type () == PROJECTOR_TYPE_QWEN3VL) {
4271- return img->nx / (params.patch_size * 2 ) + ( int )(img-> nx % params. patch_size > 0 ) ;
4241+ return img->nx / (params.patch_size * 2 );
42724242 }
42734243 return n_total;
42744244}
42754245
42764246int clip_n_output_tokens_y (const struct clip_ctx * ctx, struct clip_image_f32 * img) {
42774247 const auto & params = ctx->model .hparams ;
42784248 if (ctx->proj_type () == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type () == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type () == PROJECTOR_TYPE_QWEN3VL) {
4279- return img->ny / (params.patch_size * 2 ) + ( int )(img-> ny % params. patch_size > 0 ) ;
4249+ return img->ny / (params.patch_size * 2 );
42804250 }
42814251 return 1 ;
42824252}
@@ -4334,9 +4304,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43344304 case PROJECTOR_TYPE_QWEN3VL:
43354305 {
43364306 // dynamic size (2 conv, so double patch size)
4337- int patch_size = params.patch_size * 2 ;
4338- int x_patch = img->nx / patch_size + (int )(img->nx % patch_size > 0 );
4339- int y_patch = img->ny / patch_size + (int )(img->ny % patch_size > 0 );
4307+ int x_patch = img->nx / (params.patch_size * 2 );
4308+ int y_patch = img->ny / (params.patch_size * 2 );
43404309 n_patches = x_patch * y_patch;
43414310 } break ;
43424311 case PROJECTOR_TYPE_GEMMA3:
0 commit comments