74 changes: 38 additions & 36 deletions examples/llava/clip.cpp
@@ -2256,14 +2256,40 @@ void clip_free(clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return (clip_get_n_output_tokens(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
+static int clip_img_get_n_output_tokens_by_dims(const struct clip_ctx * ctx, int x, int y) {
+    const auto & params = ctx->vision_model.hparams;
+
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = x / patch_size + (int)(x % patch_size > 0);
+        int y_patch = y / patch_size + (int)(y % patch_size > 0);
+        n_patches = x_patch * y_patch;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        n_patches = 256;
+    }
+
+    return n_patches;
+}
+
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
-    clip_image_f32 img;
-    img.nx = img_w;
-    img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_img_get_n_output_tokens_by_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2293,40 +2319,16 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
-int clip_n_patches(const struct clip_ctx * ctx) {
-    clip_image_f32 img;
-    img.nx = ctx->vision_model.hparams.image_size;
-    img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+int clip_get_n_output_tokens(const struct clip_ctx * ctx) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
 }
 
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    const auto & params = ctx->vision_model.hparams;
-
-    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-        n_patches /= 4;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
-            n_patches = 96;
-        }
-        else if (ctx->minicpmv_version == 3) {
-            n_patches = 64;
-        }
-        else if (ctx->minicpmv_version == 4) {
-            n_patches = 64;
-        }
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
-        int patch_size = params.patch_size * 2;
-        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
-        n_patches = x_patch * y_patch;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-        n_patches = 256;
-    }
+int clip_img_f32_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
+}
 
-    return n_patches;
+int clip_img_u8_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
 }
 
 static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
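For the PROJECTOR_TYPE_MERGER branch, the helper now receives the image dimensions directly and rounds partial patches up. A standalone sketch of that rounding (illustrative only; the patch size and image dimensions below are example values, not taken from a specific model):

```cpp
// Illustrative re-implementation of the MERGER branch above: tokens are counted
// over 2x2 merged patch cells, rounding partial cells up (ceil division).
#include <cstdio>

static int merger_n_tokens(int x, int y, int patch_size) {
    int cell    = patch_size * 2;                   // a merged cell covers 2x2 patches
    int x_cells = x / cell + (int)(x % cell > 0);   // ceil(x / cell)
    int y_cells = y / cell + (int)(y % cell > 0);   // ceil(y / cell)
    return x_cells * y_cells;
}

int main() {
    // example: patch_size = 14 -> cell = 28 px; a 336x448 image gives 12 * 16 = 192 tokens
    printf("%d\n", merger_n_tokens(336, 448, 14));
    return 0;
}
```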
7 changes: 4 additions & 3 deletions examples/llava/clip.h
@@ -58,9 +58,10 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
+CLIP_API int clip_get_n_output_tokens (const struct clip_ctx * ctx);
+CLIP_API int clip_img_f32_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_img_u8_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_u8 * img);
+CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
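A hypothetical caller-side sketch (not part of this change) showing how downstream code would size an embedding buffer with the renamed declarations; it assumes a clip_ctx and a preprocessed clip_image_f32 obtained elsewhere:

```cpp
// Hypothetical usage sketch, assuming `ctx` was loaded elsewhere (e.g. via clip_model_load)
// and `img` is an already-preprocessed clip_image_f32.
#include <vector>
#include "clip.h"

static std::vector<float> encode_one(clip_ctx * ctx, clip_image_f32 * img, int n_threads) {
    // one clip_n_mmproj_embd()-sized float vector per output token
    const int n_tokens = clip_img_f32_get_n_output_tokens(ctx, img);
    std::vector<float> embd((size_t) n_tokens * clip_n_mmproj_embd(ctx));
    clip_image_encode(ctx, n_threads, img, embd.data());
    return embd;
}
```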
10 changes: 5 additions & 5 deletions examples/llava/llava.cpp
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_get_n_output_tokens(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_get_n_output_tokens(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_get_n_output_tokens(ctx_clip));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_img_f32_get_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,7 +342,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
+        *n_img_pos = clip_get_n_output_tokens(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
6 changes: 3 additions & 3 deletions examples/llava/minicpmv-cli.cpp
@@ -124,19 +124,19 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 
 static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
-    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_get_n_output_tokens(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
 
     auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
-    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    slice_embed->n_image_pos = clip_get_n_output_tokens(ctx_llava->ctx_clip);
     llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
     llava_image_embed_free(slice_embed);
 }
 
 static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
     std::string system_prompt;
     int idx = 0;
-    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int num_image_embeds = embeds->n_image_pos / clip_get_n_output_tokens(ctx_llava->ctx_clip);
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (has_minicpmv_projector == 2) {
         system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
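The idx-based offset in process_eval_image_embed assumes a flat buffer of equally sized slices. A small worked example of that arithmetic (values are illustrative: the 96-token count matches the minicpmv_version == 2 resampler branch above, while the 4096-dimension projector size is a hypothetical clip_n_mmproj_embd() value):

```cpp
// Worked example of the slice offset used above: slice `idx` starts at
// idx * n_output_tokens * n_mmproj_embd floats, and clip_embd_nbytes() bytes are copied.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_tokens = 96;   // e.g. resampler output for minicpmv_version == 2
    const size_t n_embd   = 4096; // hypothetical clip_n_mmproj_embd() value
    const size_t idx      = 2;    // third slice in embeds->embed

    size_t float_offset = idx * n_tokens * n_embd;
    size_t nbytes       = n_tokens * n_embd * sizeof(float); // clip_embd_nbytes() equivalent
    printf("slice %zu: offset %zu floats, copy %zu bytes\n", idx, float_offset, nbytes);
    return 0;
}
```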
2 changes: 1 addition & 1 deletion examples/llava/mtmd.cpp
@@ -149,7 +149,7 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             }
 
             mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
-            image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+            image_tokens->nx = clip_get_n_output_tokens(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
 