@@ -216,10 +216,10 @@ struct clip_hparams {
216216 void set_warmup_n_tokens (int n_tokens) {
217217 int n_tok_per_side = static_cast <int >(std::sqrt (n_tokens));
218218 GGML_ASSERT (n_tok_per_side * n_tok_per_side == n_tokens && " n_tokens must be n*n" );
219- warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side ();
219+ warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size ();
220220 }
221221
222- int get_scale_factor_per_side () const {
222+ int get_merge_kernel_size () const {
223223 return static_cast <int >(std::sqrt (proj_scale_factor));
224224 }
225225};
@@ -550,7 +550,7 @@ struct clip_graph {
550550 const int batch_size = 1 ;
551551 GGML_ASSERT (n_patches_x == n_patches_y);
552552 const int patches_per_image = n_patches_x;
553- const int kernel_size = hparams.get_scale_factor_per_side ();
553+ const int kernel_size = hparams.get_merge_kernel_size ();
554554
555555 cur = ggml_transpose (ctx0, cur);
556556 cur = ggml_cont_4d (ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -602,7 +602,7 @@ struct clip_graph {
602602 }
603603
604604 ggml_cgraph * build_pixtral () {
605- const int n_merge = hparams.get_scale_factor_per_side ();
605+ const int n_merge = hparams.get_merge_kernel_size ();
606606
607607 // 2D input positions
608608 ggml_tensor * pos_h = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_patches);
@@ -4034,7 +4034,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40344034 clip_image_u8 canvas;
40354035 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio (
40364036 original_size,
4037- params.patch_size * params.get_scale_factor_per_side (),
4037+ params.patch_size * params.get_merge_kernel_size (),
40384038 params.image_min_pixels ,
40394039 params.image_max_pixels );
40404040 canvas.nx = canvas_size.width ;
@@ -4133,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41334133 clip_image_u8 resized_image;
41344134 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41354135 original_size,
4136- params.patch_size * params.get_scale_factor_per_side (),
4136+ params.patch_size * params.get_merge_kernel_size (),
41374137 params.image_min_pixels ,
41384138 params.image_max_pixels );
41394139 img_tool::resize (*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4164,7 +4164,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41644164 GGML_ASSERT (params.image_min_pixels && params.image_max_pixels );
41654165 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41664166 original_size,
4167- params.patch_size * params.get_scale_factor_per_side (),
4167+ params.patch_size * params.get_merge_kernel_size (),
41684168 params.image_min_pixels ,
41694169 params.image_max_pixels );
41704170 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
@@ -4359,7 +4359,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43594359 case PROJECTOR_TYPE_KIMIVL:
43604360 {
43614361 // dynamic size
4362- int scale_factor = params.get_scale_factor_per_side ();
4362+ int scale_factor = params.get_merge_kernel_size ();
43634363 int out_patch_size = params.patch_size * scale_factor;
43644364 int x_patch = CLIP_ALIGN (img->nx , out_patch_size) / out_patch_size;
43654365 int y_patch = CLIP_ALIGN (img->ny , out_patch_size) / out_patch_size;
@@ -4369,7 +4369,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43694369 case PROJECTOR_TYPE_LIGHTONOCR:
43704370 {
43714371 // dynamic size
4372- int n_merge = params.get_scale_factor_per_side ();
4372+ int n_merge = params.get_merge_kernel_size ();
43734373 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1 );
43744374 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1 );
43754375 if (ctx->model .token_embd_img_break ) {
0 commit comments