Skip to content

Commit 66d5c43

Browse files
committed
implement min/max pixels
1 parent 13cd204 commit 66d5c43

File tree

1 file changed

+57
-40
lines changed

1 file changed

+57
-40
lines changed

tools/mtmd/clip.cpp

Lines changed: 57 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,9 @@ struct clip_hparams {
171171
int32_t n_head;
172172
int32_t n_layer;
173173
// idefics3
174-
int32_t preproc_image_size = 0; // aka max_dimension
174+
int32_t image_longest_edge = 0;
175+
int32_t image_min_pixels = 0;
176+
int32_t image_max_pixels = 0;
175177
int32_t proj_scale_factor = 0;
176178

177179
float image_mean[3];
@@ -204,6 +206,13 @@ struct clip_hparams {
204206
bool has_llava_projector = false;
205207
int minicpmv_version = 0;
206208
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
209+
210+
// used by LFM2 and KIMI-VL
211+
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
212+
const int total_factor = patch_size * proj_scale_factor;
213+
image_min_pixels = n_tokens_min * total_factor * total_factor;
214+
image_max_pixels = n_tokens_max * total_factor * total_factor;
215+
}
207216
};
208217

209218
struct clip_layer {
@@ -2577,7 +2586,7 @@ struct clip_model_loader {
25772586

25782587
if (is_vision) {
25792588
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2580-
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
2589+
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
25812590
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
25822591
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
25832592
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -2686,26 +2695,29 @@ struct clip_model_loader {
26862695
hparams.minicpmv_version = 2; // default to 2 if not set
26872696
}
26882697
} break;
2698+
case PROJECTOR_TYPE_INTERNVL:
2699+
{
2700+
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2701+
} break;
26892702
case PROJECTOR_TYPE_IDEFICS3:
26902703
case PROJECTOR_TYPE_LFM2:
2691-
case PROJECTOR_TYPE_INTERNVL:
26922704
{
2705+
hparams.set_limit_image_tokens(64, 1024);
26932706
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
26942707
} break;
26952708
case PROJECTOR_TYPE_PIXTRAL:
26962709
case PROJECTOR_TYPE_LIGHTONOCR:
26972710
{
26982711
hparams.rope_theta = 10000.0f;
26992712
hparams.warmup_image_size = hparams.patch_size * 8;
2700-
// Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
2701-
// ref: https://github.com/ggml-org/llama.cpp/issues/14310
2702-
hparams.image_size = 1024;
2713+
hparams.set_limit_image_tokens(64, 1024);
27032714
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
27042715
} break;
27052716
case PROJECTOR_TYPE_KIMIVL:
27062717
{
27072718
hparams.rope_theta = 10000.0f;
27082719
hparams.warmup_image_size = hparams.patch_size * 8;
2720+
hparams.set_limit_image_tokens(64, 1024);
27092721
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
27102722
} break;
27112723
case PROJECTOR_TYPE_GEMMA3:
@@ -3494,14 +3506,14 @@ struct img_tool {
34943506

34953507
// calculate the size of the **resized** image, while preserving the aspect ratio
34963508
// the calculated size will be aligned to the nearest multiple of align_size
3497-
// if H or W size is larger than max_dimension, it will be resized to max_dimension
3498-
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
3499-
if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
3509+
// if H or W size is larger than longest_edge, it will be resized to longest_edge
3510+
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
3511+
if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0) {
35003512
return {0, 0};
35013513
}
35023514

3503-
float scale = std::min(static_cast<float>(max_dimension) / inp_size.width,
3504-
static_cast<float>(max_dimension) / inp_size.height);
3515+
float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
3516+
static_cast<float>(longest_edge) / inp_size.height);
35053517

35063518
float target_width_f = static_cast<float>(inp_size.width) * scale;
35073519
float target_height_f = static_cast<float>(inp_size.height) * scale;
@@ -3512,6 +3524,33 @@ struct img_tool {
35123524
return {aligned_width, aligned_height};
35133525
}
35143526

3527+
// calculate the size of the **resized** image, while preserving the aspect ratio
3528+
// the calculated size will have min_pixels <= W*H <= max_pixels
3529+
// this is referred as "smart_resize" in transformers code
3530+
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
3531+
const int width = inp_size.width;
3532+
const int height = inp_size.height;
3533+
3534+
auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
3535+
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
3536+
auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
3537+
3538+
int h_bar = std::max(align_size, round_by_factor(height));
3539+
int w_bar = std::max(align_size, round_by_factor(width));
3540+
3541+
if (h_bar * w_bar > max_pixels) {
3542+
const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
3543+
h_bar = std::max(align_size, floor_by_factor(height / beta));
3544+
w_bar = std::max(align_size, floor_by_factor(width / beta));
3545+
} else if (h_bar * w_bar < min_pixels) {
3546+
const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
3547+
h_bar = ceil_by_factor(height * beta);
3548+
w_bar = ceil_by_factor(width * beta);
3549+
}
3550+
3551+
return {w_bar, h_bar};
3552+
}
3553+
35153554
private:
35163555
// draw src image into dst image at offset (offset_x, offset_y)
35173556
static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
@@ -3982,7 +4021,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
39824021
//
39834022
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
39844023
const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
3985-
original_size, params.image_size, params.preproc_image_size);
4024+
original_size, params.image_size, params.image_longest_edge);
39864025
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
39874026
// __func__, original_size.width, original_size.height,
39884027
// refined_size.width, refined_size.height);
@@ -4064,37 +4103,15 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40644103
case PROJECTOR_TYPE_LFM2:
40654104
case PROJECTOR_TYPE_KIMIVL:
40664105
{
4067-
GGML_ASSERT(params.proj_scale_factor);
4068-
// smart resize
4069-
const int width = img->nx;
4070-
const int height = img->ny;
4071-
const int total_factor = params.patch_size * params.proj_scale_factor;
4072-
constexpr int min_image_tokens = 64;
4073-
constexpr int max_image_tokens = 1024;
4074-
const float min_pixels = min_image_tokens * total_factor * total_factor;
4075-
const float max_pixels = max_image_tokens * total_factor * total_factor;
4076-
4077-
auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
4078-
auto ceil_by_factor = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
4079-
auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
4080-
4081-
int h_bar = std::max(total_factor, round_by_factor(height));
4082-
int w_bar = std::max(total_factor, round_by_factor(width));
4083-
4084-
if (h_bar * w_bar > max_pixels) {
4085-
const auto beta = std::sqrt((height * width) / max_pixels);
4086-
h_bar = std::max(total_factor, floor_by_factor(height / beta));
4087-
w_bar = std::max(total_factor, floor_by_factor(width / beta));
4088-
} else if (h_bar * w_bar < min_pixels) {
4089-
const auto beta = std::sqrt(min_pixels / (height * width));
4090-
h_bar = ceil_by_factor(height * beta);
4091-
w_bar = ceil_by_factor(width * beta);
4092-
}
4093-
4106+
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
4107+
original_size,
4108+
params.patch_size * params.proj_scale_factor,
4109+
params.image_min_pixels,
4110+
params.image_max_pixels);
40944111
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
40954112

40964113
clip_image_u8 resized_img;
4097-
img_tool::resize(*img, resized_img, clip_image_size{w_bar, h_bar}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
4114+
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
40984115
clip_image_f32_ptr res(clip_image_f32_init());
40994116
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
41004117
res_imgs->entries.push_back(std::move(res));

0 commit comments

Comments
 (0)