-
Notifications
You must be signed in to change notification settings - Fork 13.4k
Model: Granite docling + Idefics3 preprocessing (SmolVLM) #16206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
64e10f5
428db16
c2202d2
4ef3128
0aef5e9
8819c96
e172313
64cef62
e1ba793
f5a7f4d
cb51d4e
08f3055
64fc676
899b48a
a966110
4be2ce9
72c6e67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
|
||
// vision-specific | ||
#define KEY_IMAGE_SIZE "clip.vision.image_size" | ||
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wasn't totally sure the right name for this one since it comes from |
||
#define KEY_PATCH_SIZE "clip.vision.patch_size" | ||
#define KEY_IMAGE_MEAN "clip.vision.image_mean" | ||
#define KEY_IMAGE_STD "clip.vision.image_std" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -170,7 +170,9 @@ struct clip_hparams { | |
int32_t projection_dim; | ||
int32_t n_head; | ||
int32_t n_layer; | ||
int32_t proj_scale_factor = 0; // idefics3 | ||
// idefics3 | ||
int32_t preproc_image_size = 0; | ||
int32_t proj_scale_factor = 0; | ||
|
||
float image_mean[3]; | ||
float image_std[3]; | ||
|
@@ -2250,6 +2252,7 @@ struct clip_model_loader { | |
|
||
if (is_vision) { | ||
get_u32(KEY_IMAGE_SIZE, hparams.image_size); | ||
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false); | ||
get_u32(KEY_PATCH_SIZE, hparams.patch_size); | ||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); | ||
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy | ||
|
@@ -3551,10 +3554,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str | |
// res_imgs->data[0] = *res; | ||
res_imgs->entries.push_back(std::move(img_f32)); | ||
return true; | ||
} | ||
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE | ||
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { | ||
// The refined size has two steps: | ||
// 1. Resize w/ aspect-ratio preserving such that the longer side is | ||
// the preprocessor longest size | ||
gabe-l-hart marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// 2. Resize w/out preserving aspect ratio such that both sides are | ||
// multiples of image_size (always rounding up) | ||
// | ||
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 | ||
const float scale = std::min( | ||
|
||
static_cast<float>(params.preproc_image_size) / original_size.width, | ||
static_cast<float>(params.preproc_image_size) / original_size.height); | ||
int refined_w = static_cast<int>(original_size.width * scale); | ||
int refined_h = static_cast<int>(original_size.height * scale); | ||
refined_w = static_cast<int>(params.image_size * std::ceil(static_cast<float>(refined_w) / params.image_size)); | ||
gabe-l-hart marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
refined_h = static_cast<int>(params.image_size * std::ceil(static_cast<float>(refined_h) / params.image_size)); | ||
const clip_image_size refined_size{refined_w, refined_h}; | ||
|
||
llava_uhd::slice_instructions instructions; | ||
instructions.overview_size = clip_image_size{params.image_size, params.image_size}; | ||
instructions.refined_size = refined_size; | ||
instructions.grid_size = clip_image_size{ | ||
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)), | ||
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)), | ||
}; | ||
for (int y = 0; y < refined_size.height; y += params.image_size) { | ||
for (int x = 0; x < refined_size.width; x += params.image_size) { | ||
instructions.slices.push_back(llava_uhd::slice_coordinates{ | ||
/* x */x, | ||
/* y */y, | ||
/* size */clip_image_size{ | ||
std::min(params.image_size, refined_size.width - x), | ||
std::min(params.image_size, refined_size.height - y) | ||
} | ||
}); | ||
} | ||
} | ||
auto imgs = llava_uhd::slice_image(img, instructions); | ||
|
||
// cast and normalize to f32 | ||
for (size_t i = 0; i < imgs.size(); ++i) { | ||
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); | ||
clip_image_f32_ptr res(clip_image_f32_init()); | ||
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); | ||
res_imgs->entries.push_back(std::move(res)); | ||
} | ||
|
||
res_imgs->grid_x = instructions.grid_size.width; | ||
res_imgs->grid_y = instructions.grid_size.height; | ||
return true; | ||
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE | ||
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3 | ||
|| ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3 | ||
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution | ||
) { | ||
clip_image_u8 resized_image; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Us and our long names 😞. I assume vertical alignment is worth preserving, but happy to not touch the other lines if preferred.