Skip to content

Commit b74122f

Browse files
committed
add image_grid_pinpoints
1 parent 5b81972 commit b74122f

File tree

1 file changed

+26
-2
lines changed

1 file changed

+26
-2
lines changed

tools/mtmd/clip.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,6 +2038,16 @@ struct clip_model_loader {
20382038
{
    hparams.rope_theta = 10000.0f;
    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);

    // Candidate slice resolutions, stored as a flat list of pairs
    // (layout borrowed from llava-1.6).
    //
    // The candidates must be multiples of the model's input image size,
    // not of the patch size: the reference values noted on each row
    // (336, 672, 1008) only come out when the base is 336. Building the
    // grid from patch_size would yield resolutions of a few dozen pixels.
    // NOTE(review): assumes hparams.image_size has already been loaded at
    // this point, the same way patch_size was relied on before - confirm.
    const int isize = hparams.image_size;
    hparams.image_grid_pinpoints = {
        isize,   isize*2, // 336, 672
        isize*2, isize,   // 672, 336
        isize*2, isize*2, // 672, 672
        isize*3, isize,   // 1008, 336
        isize,   isize*3, // 336, 1008
    };
} break;
20422052
default:
20432053
break;
@@ -3091,15 +3101,29 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30913101
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
30923102
res_imgs->entries.push_back(std::move(img_f32));
30933103
return true;
3094-
}
3095-
else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3104+
3105+
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
30963106
clip_image_u8 resized_image;
30973107
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
30983108
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
30993109
clip_image_f32_ptr img_f32(clip_image_f32_init());
31003110
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
31013111
res_imgs->entries.push_back(std::move(img_f32));
31023112
return true;
3113+
3114+
} else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3115+
GGML_ASSERT(!params.image_grid_pinpoints.empty());
3116+
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3117+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3118+
3119+
for (size_t i = 0; i < imgs.size(); ++i) {
3120+
clip_image_f32_ptr res(clip_image_f32_init());
3121+
normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3122+
res_imgs->entries.push_back(std::move(res));
3123+
}
3124+
3125+
return true;
3126+
31033127
}
31043128

31053129
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)

0 commit comments

Comments
 (0)