74 changes: 38 additions & 36 deletions examples/llava/clip.cpp
@@ -2256,14 +2256,40 @@ void clip_free(clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return (clip_get_n_output_tokens(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
+static int clip_img_get_n_output_tokens_by_dims(const struct clip_ctx * ctx, int x, int y) {
+    const auto & params = ctx->vision_model.hparams;
+
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = x / patch_size + (int)(x % patch_size > 0);
+        int y_patch = y / patch_size + (int)(y % patch_size > 0);
+        n_patches = x_patch * y_patch;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        n_patches = 256;
+    }
+
+    return n_patches;
+}
+
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
-    clip_image_f32 img;
-    img.nx = img_w;
-    img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_img_get_n_output_tokens_by_dims(ctx, img_w, img_h) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2293,40 +2319,16 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
-int clip_n_patches(const struct clip_ctx * ctx) {
-    clip_image_f32 img;
-    img.nx = ctx->vision_model.hparams.image_size;
-    img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+int clip_get_n_output_tokens(const struct clip_ctx * ctx) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size);
 }
 
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    const auto & params = ctx->vision_model.hparams;
-
-    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-        n_patches /= 4;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
-            n_patches = 96;
-        }
-        else if (ctx->minicpmv_version == 3) {
-            n_patches = 64;
-        }
-        else if (ctx->minicpmv_version == 4) {
-            n_patches = 64;
-        }
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
-        int patch_size = params.patch_size * 2;
-        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
-        n_patches = x_patch * y_patch;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-        n_patches = 256;
-    }
+int clip_img_f32_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
+}
 
-    return n_patches;
+int clip_img_u8_get_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_u8 * img) {
+    return clip_img_get_n_output_tokens_by_dims(ctx, img->nx, img->ny);
 }
 
 static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
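For the PROJECTOR_TYPE_MERGER branch, the helper now receives the image dimensions directly and rounds partial patches up. A standalone sketch of that rounding (illustrative only; the patch size and image dimensions below are example values, not taken from a specific model):

```cpp
// Illustrative re-implementation of the MERGER branch above: tokens are counted
// over 2x2 merged patch cells, rounding partial cells up (ceil division).
#include <cstdio>

static int merger_n_tokens(int x, int y, int patch_size) {
    int cell    = patch_size * 2;                   // a merged cell covers 2x2 patches
    int x_cells = x / cell + (int)(x % cell > 0);   // ceil(x / cell)
    int y_cells = y / cell + (int)(y % cell > 0);   // ceil(y / cell)
    return x_cells * y_cells;
}

int main() {
    // example: patch_size = 14 -> cell = 28 px; a 336x448 image gives 12 * 16 = 192 tokens
    printf("%d\n", merger_n_tokens(336, 448, 14));
    return 0;
}
```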
7 changes: 4 additions & 3 deletions examples/llava/clip.h
@@ -58,9 +58,10 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
+CLIP_API int clip_get_n_output_tokens (const struct clip_ctx * ctx);
+CLIP_API int clip_img_f32_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_img_u8_get_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_u8 * img);
+CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
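A hypothetical caller-side sketch (not part of this change) showing how downstream code would size an embedding buffer with the renamed declarations; it assumes a clip_ctx and a preprocessed clip_image_f32 obtained elsewhere:

```cpp
// Hypothetical usage sketch, assuming `ctx` was loaded elsewhere (e.g. via clip_model_load)
// and `img` is an already-preprocessed clip_image_f32.
#include <vector>
#include "clip.h"

static std::vector<float> encode_one(clip_ctx * ctx, clip_image_f32 * img, int n_threads) {
    // one clip_n_mmproj_embd()-sized float vector per output token
    const int n_tokens = clip_img_f32_get_n_output_tokens(ctx, img);
    std::vector<float> embd((size_t) n_tokens * clip_n_mmproj_embd(ctx));
    clip_image_encode(ctx, n_threads, img, embd.data());
    return embd;
}
```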
10 changes: 5 additions & 5 deletions examples/llava/llava.cpp
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_get_n_output_tokens(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_get_n_output_tokens(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_get_n_output_tokens(ctx_clip));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_img_f32_get_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,7 +342,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
+        *n_img_pos = clip_get_n_output_tokens(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
6 changes: 3 additions & 3 deletions examples/llava/minicpmv-cli.cpp
@@ -124,19 +124,19 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 
 static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
-    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_get_n_output_tokens(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
 
     auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
-    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    slice_embed->n_image_pos = clip_get_n_output_tokens(ctx_llava->ctx_clip);
     llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
     llava_image_embed_free(slice_embed);
 }
 
 static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
     std::string system_prompt;
     int idx = 0;
-    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int num_image_embeds = embeds->n_image_pos / clip_get_n_output_tokens(ctx_llava->ctx_clip);
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (has_minicpmv_projector == 2) {
         system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
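The idx-based offset in process_eval_image_embed assumes a flat buffer of equally sized slices. A small worked example of that arithmetic (values are illustrative: the 96-token count matches the minicpmv_version == 2 resampler branch above, while the 4096-dimension projector size is a hypothetical clip_n_mmproj_embd() value):

```cpp
// Worked example of the slice offset used above: slice `idx` starts at
// idx * n_output_tokens * n_mmproj_embd floats, and clip_embd_nbytes() bytes are copied.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_tokens = 96;   // e.g. resampler output for minicpmv_version == 2
    const size_t n_embd   = 4096; // hypothetical clip_n_mmproj_embd() value
    const size_t idx      = 2;    // third slice in embeds->embed

    size_t float_offset = idx * n_tokens * n_embd;
    size_t nbytes       = n_tokens * n_embd * sizeof(float); // clip_embd_nbytes() equivalent
    printf("slice %zu: offset %zu floats, copy %zu bytes\n", idx, float_offset, nbytes);
    return 0;
}
```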
2 changes: 1 addition & 1 deletion examples/llava/mtmd.cpp
@@ -149,7 +149,7 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             }
 
             mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
-            image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+            image_tokens->nx = clip_get_n_output_tokens(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
 