
Commit 6e79b92

Rename API
1 parent 941efbc commit 6e79b92


5 files changed: +22 -22 lines changed
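
This commit renames the token-count getters in the llava example's CLIP wrapper: clip_n_patches becomes clip_get_n_output_tokens, clip_n_patches_by_img becomes clip_img_f32_get_n_output_tokens, clip_n_patches_by_img_u8 becomes clip_img_u8_get_n_output_tokens, and the internal helper clip_n_patches_by_img_dims becomes clip_img_get_n_output_tokens_by_dims. Only the names change; the signatures and behavior are unchanged. A minimal caller-side sketch of the renamed API (not part of this commit; it assumes a clip_ctx that was loaded elsewhere, and the helper name alloc_image_embd is hypothetical):

#include "clip.h"

#include <cstdlib>

// Allocate a buffer large enough for one image's embeddings and report how
// many embedding vectors (output tokens) the vision encoder will produce.
static float * alloc_image_embd(const struct clip_ctx * ctx_clip, int * n_tokens_out) {
    // clip_get_n_output_tokens() is the renamed clip_n_patches().
    *n_tokens_out = clip_get_n_output_tokens(ctx_clip);

    // clip_embd_nbytes() already multiplies the token count by
    // clip_n_mmproj_embd() and sizeof(float), and accounts for any extra
    // projector tokens, so it is the safest way to size the buffer.
    return (float *) malloc(clip_embd_nbytes(ctx_clip));
}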

examples/llava/clip.cpp

Lines changed: 9 additions & 9 deletions
@@ -2256,10 +2256,10 @@ void clip_free(clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return (clip_get_n_output_tokens(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
-static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y) {
+static int clip_img_get_n_output_tokens_by_dims(const struct clip_ctx * ctx, int x, int y) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2289,7 +2289,7 @@ static int clip_n_patches_by_img_dims(const struct clip_ctx * ctx, int x, int y)
 }
 
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
-    return clip_n_patches_by_img_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_img_get_n_output_tokens_by_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2319,16 +2319,16 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
-int clip_n_patches(const struct clip_ctx * ctx) {
-    return clip_n_patches_by_img_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
+int clip_get_n_output_tokens(const struct clip_ctx * ctx) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
 }
 
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);
+int clip_img_f32_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
 }
 
-int clip_n_patches_by_img_u8(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
-    return clip_n_patches_by_img_dims(ctx, img->nx, img->ny);
+int clip_img_u8_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
 }
 
 static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
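
For context on the counts that appear in the comments below: the helper above starts from n_patches = (image_size / patch_size) * (image_size / patch_size), so under the usual LLaVA-1.5 CLIP settings (image_size 336, patch_size 14, values assumed here rather than shown in this diff) that is (336 / 14)^2 = 24^2 = 576 output tokens per image, which is where the 576 in llava.cpp's "4096 x 576 x 4" and "576 x 4096" comments comes from.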

examples/llava/clip.h

Lines changed: 4 additions & 4 deletions
@@ -58,10 +58,10 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_patches_by_img_u8 (const struct clip_ctx * ctx, struct clip_image_u8 * img);
-CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
+CLIP_API int clip_get_n_output_tokens (const struct clip_ctx * ctx);
+CLIP_API int clip_img_f32_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_img_u8_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_u8 * img);
+CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
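
The per-image getters now spell out which image type they take instead of the old _by_img / _by_img_u8 suffixes. A short illustration (hypothetical variables, not from this commit; img_u8 would be a raw clip_image_u8 and img_f32 a preprocessed clip_image_f32):

    int n_from_u8  = clip_img_u8_get_n_output_tokens (ctx_clip, img_u8);   // raw 8-bit image
    int n_from_f32 = clip_img_f32_get_n_output_tokens(ctx_clip, img_f32);  // preprocessed float image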

examples/llava/llava.cpp

Lines changed: 5 additions & 5 deletions
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_get_n_output_tokens(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_get_n_output_tokens(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_get_n_output_tokens(ctx_clip));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                    image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                    image_embd_v[i],
                    clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_img_f32_get_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,7 +342,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
+        *n_img_pos = clip_get_n_output_tokens(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {

examples/llava/minicpmv-cli.cpp

Lines changed: 3 additions & 3 deletions
@@ -124,19 +124,19 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 
 static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
-    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_get_n_output_tokens(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
 
     auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
-    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    slice_embed->n_image_pos = clip_get_n_output_tokens(ctx_llava->ctx_clip);
     llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
     llava_image_embed_free(slice_embed);
 }
 
 static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
     std::string system_prompt;
     int idx = 0;
-    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int num_image_embeds = embeds->n_image_pos / clip_get_n_output_tokens(ctx_llava->ctx_clip);
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (has_minicpmv_projector == 2) {
         system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";

examples/llava/mtmd.cpp

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
         }
 
         mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
-        image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+        image_tokens->nx = clip_get_n_output_tokens(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
         image_tokens->ny = 1; // TODO
         image_tokens->batch_f32 = std::move(batch_f32);
 