Skip to content

Commit b37cd47

Browse files
ngxson authored and theo77186 committed
mtmd : fix idefics3 preprocessing (ggml-org#16806)
* mtmd : fix idefics3 preprocessing
* disable granite test
* fix test for granite
1 parent c2767da commit b37cd47

File tree

2 files changed

+12
-5
lines changed

2 files changed

+12
-5
lines changed

tools/mtmd/clip.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ struct clip_hparams {
171171
int32_t n_head;
172172
int32_t n_layer;
173173
// idefics3
174-
int32_t preproc_image_size = 0;
174+
int32_t preproc_image_size = 0; // aka max_dimension
175175
int32_t proj_scale_factor = 0;
176176

177177
float image_mean[3];
@@ -3480,8 +3480,8 @@ struct image_manipulation {
34803480
return {0, 0};
34813481
}
34823482

3483-
float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
3484-
static_cast<float>(max_dimension) / inp_size.height));
3483+
float scale = std::min(static_cast<float>(max_dimension) / inp_size.width,
3484+
static_cast<float>(max_dimension) / inp_size.height);
34853485

34863486
float target_width_f = static_cast<float>(inp_size.width) * scale;
34873487
float target_height_f = static_cast<float>(inp_size.height) * scale;
@@ -3644,7 +3644,7 @@ struct llava_uhd {
36443644

36453645
// resize to overview size
36463646
clip_image_u8_ptr resized_img(clip_image_u8_init());
3647-
image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
3647+
image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size);
36483648
output.push_back(std::move(resized_img));
36493649
if (inst.slices.empty()) {
36503650
// no slices, just return the resized image
@@ -3846,6 +3846,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
38463846
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
38473847
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
38483848
original_size, params.image_size, params.preproc_image_size);
3849+
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
3850+
// __func__, original_size.width, original_size.height,
3851+
// refined_size.width, refined_size.height);
38493852

38503853
llava_uhd::slice_instructions instructions;
38513854
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
@@ -3856,6 +3859,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
38563859
};
38573860
for (int y = 0; y < refined_size.height; y += params.image_size) {
38583861
for (int x = 0; x < refined_size.width; x += params.image_size) {
3862+
// LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
38593863
instructions.slices.push_back(llava_uhd::slice_coordinates{
38603864
/* x */x,
38613865
/* y */y,

tools/mtmd/tests.sh

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,10 @@ for i in "${!arr_hf[@]}"; do
139139

140140
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
141141

142-
if echo "$output" | grep -iq "new york"; then
142+
# either contains "new york" or both "men" and "walk"
143+
if echo "$output" | grep -iq "new york" \
144+
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
145+
then
143146
result="$prefix \033[32mOK\033[0m: $bin $hf"
144147
else
145148
result="$prefix \033[31mFAIL\033[0m: $bin $hf"

0 commit comments

Comments
 (0)