llava: n_patches for clip_image_u8

mattjcly · mattjcly · commit c276a91c9b4b · 2025-04-14T10:50:00.000-04:00
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -2259,11 +2259,37 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
+static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y) {
+    const auto & params = ctx->vision_model.hparams;
+
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = x / patch_size + (int)(x % patch_size > 0);
+        int y_patch = y / patch_size + (int)(y % patch_size > 0);
+        n_patches = x_patch * y_patch;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        n_patches = 256;
+    }
+
+    return n_patches;
+}
+
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
-    clip_image_f32 img;
-    img.nx = img_w;
-    img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_patches_by_img_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2294,39 +2320,15 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
-    clip_image_f32 img;
-    img.nx = ctx->vision_model.hparams.image_size;
-    img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+    return clip_n_patches_by_img_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
 }
 
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    const auto & params = ctx->vision_model.hparams;
-
-    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-        n_patches /= 4;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
-            n_patches = 96;
-        }
-        else if (ctx->minicpmv_version == 3) {
-            n_patches = 64;
-        }
-        else if (ctx->minicpmv_version == 4) {
-            n_patches = 64;
-        }
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
-        int patch_size = params.patch_size * 2;
-        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
-        n_patches = x_patch * y_patch;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-        n_patches = 256;
-    }
+int clip_n_patches_by_img_f32(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);
+}
 
-    return n_patches;
+int clip_n_patches_by_img_u8(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
+    return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);
 }
 
 static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
@@ -58,9 +58,10 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches(const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img_f32(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_patches_by_img_u8(const struct clip_ctx * ctx, struct clip_image_u8 * img);
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
@@ -74,10 +75,10 @@ CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used
 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
 
-CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
-CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
+CLIP_API void clip_image_size_free(struct clip_image_size * img_size);
+CLIP_API void clip_image_u8_free(struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+CLIP_API void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
 // use for accessing underlay data of clip_image_f32_batch
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_n_patches_by_img_f32(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {

Original file line number	Diff line number	Diff line change
`@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli`
`313`	`313`	`image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),`
`314`	`314`	`image_embd_v[i],`
`315`	`315`	`clip_embd_nbytes_by_img(ctx_clip, nx, ny));`
`316`		`- n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);`
	`316`	`+ n_img_pos_out += clip_n_patches_by_img_f32(ctx_clip, img_res);`
`317`	`317`	`}`
`318`	`318`	`*n_img_pos = n_img_pos_out;`
`319`	`319`	`for (size_t i = 0; i < image_embd_v.size(); i++) {`