Rename API: clip_n_patches* functions become clip_*get_n_output_tokens*

mattjcly · mattjcly · commit 6e79b925d959 · 2025-04-14T13:21:48.000-04:00
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -2256,10 +2256,10 @@ void clip_free(clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return (clip_get_n_output_tokens(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
-static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y) {
+static int clip_img_get_n_output_tokens_by_dims(const struct clip_ctx * ctx, int x, int y) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2289,7 +2289,7 @@ static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y)
 }
 
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
-    return clip_n_patches_by_img_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_img_get_n_output_tokens_by_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2319,16 +2319,16 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
-int clip_n_patches(const struct clip_ctx * ctx) {
-    return clip_n_patches_by_img_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
+int clip_get_n_output_tokens(const struct clip_ctx * ctx) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
 }
 
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);
+int clip_img_f32_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
 }
 
-int clip_n_patches_by_img_u8(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
-    return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);
+int clip_img_u8_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
 }
 
 static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
@@ -58,10 +58,10 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches           (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img    (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_patches_by_img_u8 (const struct clip_ctx * ctx, struct clip_image_u8 * img);
-CLIP_API int clip_n_mmproj_embd       (const struct clip_ctx * ctx);
+CLIP_API int clip_get_n_output_tokens         (const struct clip_ctx * ctx);
+CLIP_API int clip_img_f32_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_img_u8_get_n_output_tokens  (const struct clip_ctx * ctx, struct clip_image_u8 * img);
+CLIP_API int clip_n_mmproj_embd               (const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_get_n_output_tokens(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_get_n_output_tokens(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_get_n_output_tokens(ctx_clip));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_img_f32_get_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,7 +342,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
+        *n_img_pos = clip_get_n_output_tokens(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
@@ -124,19 +124,19 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 
 static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
-    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_get_n_output_tokens(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
 
     auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
-    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    slice_embed->n_image_pos = clip_get_n_output_tokens(ctx_llava->ctx_clip);
     llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
     llava_image_embed_free(slice_embed);
 }
 
 static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
     std::string system_prompt;
     int idx = 0;
-    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int num_image_embeds = embeds->n_image_pos / clip_get_n_output_tokens(ctx_llava->ctx_clip);
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (has_minicpmv_projector == 2) {
         system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
@@ -149,7 +149,7 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             }
 
             mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
-            image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+            image_tokens->nx = clip_get_n_output_tokens(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
 

Original file line number	Diff line number	Diff line change
`@@ -2256,10 +2256,10 @@ void clip_free(clip_ctx * ctx) {`
`2256`	`2256`
`2257`	`2257`	`size_t clip_embd_nbytes(const struct clip_ctx * ctx) {`
`2258`	`2258`	`int extra_tokens = ctx->has_glm_projector ? 2 : 0;`
`2259`		`- return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);`
	`2259`	`+ return (clip_get_n_output_tokens(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);`
`2260`	`2260`	`}`
`2261`	`2261`
`2262`		`-static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y) {`
	`2262`	`+static int clip_img_get_n_output_tokens_by_dims(const struct clip_ctx * ctx, int x, int y) {`
`2263`	`2263`	`const auto & params = ctx->vision_model.hparams;`
`2264`	`2264`
`2265`	`2265`	`int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);`
`@@ -2289,7 +2289,7 @@ static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y)`
`2289`	`2289`	`}`
`2290`	`2290`
`2291`	`2291`	`size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {`
`2292`		`- return clip_n_patches_by_img_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);`
	`2292`	`+ return clip_img_get_n_output_tokens_by_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);`
`2293`	`2293`	`}`
`2294`	`2294`
`2295`	`2295`	`int32_t clip_get_image_size(const struct clip_ctx * ctx) {`
`@@ -2319,16 +2319,16 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {`
`2319`	`2319`	`return ctx->vision_model.hparams.image_grid_pinpoints.size();`
`2320`	`2320`	`}`
`2321`	`2321`
`2322`		`-int clip_n_patches(const struct clip_ctx * ctx) {`
`2323`		`- return clip_n_patches_by_img_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);`
	`2322`	`+int clip_get_n_output_tokens(const struct clip_ctx * ctx) {`
	`2323`	`+ return clip_img_get_n_output_tokens_by_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);`
`2324`	`2324`	`}`
`2325`	`2325`
`2326`		`-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {`
`2327`		`- return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);`
	`2326`	`+int clip_img_f32_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {`
	`2327`	`+ return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);`
`2328`	`2328`	`}`
`2329`	`2329`
`2330`		`-int clip_n_patches_by_img_u8(const struct clip_ctx * ctx, struct clip_image_u8 * img) {`
`2331`		`- return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);`
	`2330`	`+int clip_img_u8_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_u8 * img) {`
	`2331`	`+ return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);`
`2332`	`2332`	`}`
`2333`	`2333`
`2334`	`2334`	`static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {`
Original file line number	Diff line number	Diff line change
`@@ -149,7 +149,7 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,`
`149`	`149`	`}`
`150`	`150`
`151`	`151`	`mtmd_image_tokens * image_tokens = new mtmd_image_tokens;`
`152`		`- image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image`
	`152`	`+ image_tokens->nx = clip_get_n_output_tokens(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image`
`153`	`153`	`image_tokens->ny = 1; // TODO`
`154`	`154`	`image_tokens->batch_f32 = std::move(batch_f32);`
`155`	`155`