Skip to content

Commit bfd03fb

Browse files
committed
get_merge_kernel_size()
1 parent 000d1d9 commit bfd03fb

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

tools/mtmd/clip.cpp

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -216,10 +216,10 @@ struct clip_hparams {
216216
void set_warmup_n_tokens(int n_tokens) {
217217
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
218218
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
219-
warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side();
219+
warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size();
220220
}
221221

222-
int get_scale_factor_per_side() const {
222+
int get_merge_kernel_size() const {
223223
return static_cast<int>(std::sqrt(proj_scale_factor));
224224
}
225225
};
@@ -550,7 +550,7 @@ struct clip_graph {
550550
const int batch_size = 1;
551551
GGML_ASSERT(n_patches_x == n_patches_y);
552552
const int patches_per_image = n_patches_x;
553-
const int kernel_size = hparams.get_scale_factor_per_side();
553+
const int kernel_size = hparams.get_merge_kernel_size();
554554

555555
cur = ggml_transpose(ctx0, cur);
556556
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -602,7 +602,7 @@ struct clip_graph {
602602
}
603603

604604
ggml_cgraph * build_pixtral() {
605-
const int n_merge = hparams.get_scale_factor_per_side();
605+
const int n_merge = hparams.get_merge_kernel_size();
606606

607607
// 2D input positions
608608
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -4034,7 +4034,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40344034
clip_image_u8 canvas;
40354035
const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
40364036
original_size,
4037-
params.patch_size * params.get_scale_factor_per_side(),
4037+
params.patch_size * params.get_merge_kernel_size(),
40384038
params.image_min_pixels,
40394039
params.image_max_pixels);
40404040
canvas.nx = canvas_size.width;
@@ -4133,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41334133
clip_image_u8 resized_image;
41344134
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
41354135
original_size,
4136-
params.patch_size * params.get_scale_factor_per_side(),
4136+
params.patch_size * params.get_merge_kernel_size(),
41374137
params.image_min_pixels,
41384138
params.image_max_pixels);
41394139
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4164,7 +4164,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41644164
GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
41654165
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
41664166
original_size,
4167-
params.patch_size * params.get_scale_factor_per_side(),
4167+
params.patch_size * params.get_merge_kernel_size(),
41684168
params.image_min_pixels,
41694169
params.image_max_pixels);
41704170
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4359,7 +4359,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43594359
case PROJECTOR_TYPE_KIMIVL:
43604360
{
43614361
// dynamic size
4362-
int scale_factor = params.get_scale_factor_per_side();
4362+
int scale_factor = params.get_merge_kernel_size();
43634363
int out_patch_size = params.patch_size * scale_factor;
43644364
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
43654365
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
@@ -4369,7 +4369,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43694369
case PROJECTOR_TYPE_LIGHTONOCR:
43704370
{
43714371
// dynamic size
4372-
int n_merge = params.get_scale_factor_per_side();
4372+
int n_merge = params.get_merge_kernel_size();
43734373
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
43744374
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
43754375
if (ctx->model.token_embd_img_break) {

0 commit comments

Comments
 (0)