Skip to content

Commit f538bf5

Browse files
committed
fix bug in minicpm-v code
1 parent 3d804de commit f538bf5

File tree

3 files changed

+26
-11
lines changed

3 files changed

+26
-11
lines changed

examples/llava/clip.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1308,6 +1308,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
13081308
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
13091309
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
13101310
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
1311+
LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
13111312
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
13121313
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
13131314
}

examples/llava/minicpmv-cli.cpp

Lines changed: 25 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -148,19 +148,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
148148
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
149149
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
150150
if (num_image_embeds > 1) {
151-
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
152-
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
153-
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
154-
for (size_t j = 0; j < num_image_embeds_col; ++j) {
155-
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
156-
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
157-
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
158-
if (j == num_image_embeds_col - 1) {
159-
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
151+
if (has_minicpmv_projector == 2) {
152+
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
153+
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
154+
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
155+
for (size_t j = 0; j < num_image_embeds_col; ++j) {
156+
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
157+
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
158+
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
159+
if (j == num_image_embeds_col - 1) {
160+
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
161+
}
162+
}
163+
}
164+
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
165+
}
166+
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
167+
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
168+
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
169+
for (size_t j = 0; j < num_image_embeds_col; ++j) {
170+
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
171+
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
172+
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
173+
if (j == num_image_embeds_col - 1) {
174+
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
175+
}
160176
}
161177
}
162178
}
163-
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
164179
}
165180
LOG_INF("%s: image token past: %d\n", __func__, n_past);
166181
}

examples/llava/minicpmv-convert-image-encoder-to-gguf.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -597,7 +597,6 @@ def bytes_to_unicode():
597597
fname_middle = "mmproj-"
598598
has_text_encoder = False
599599
has_minicpmv_projector = True
600-
minicpmv_version = 4
601600
elif args.vision_only:
602601
fname_middle = "vision-"
603602
has_text_encoder = False

0 commit comments

Comments
 (0)