
Commit da6b9eb

support for minicpmv
1 parent 4737bd0 commit da6b9eb

File tree

examples/llava/CMakeLists.txt
examples/llava/mtmd-cli.cpp
examples/llava/mtmd.cpp
examples/llava/tests.sh

4 files changed: +200 -43 lines changed

examples/llava/CMakeLists.txt

Lines changed: 3 additions & 9 deletions

@@ -61,15 +61,9 @@ if(TARGET BUILD_INFO)
     add_dependencies(mtmd BUILD_INFO)
 endif()

-add_executable(llama-llava-cli deprecation-warning.cpp)
-add_executable(llama-gemma3-cli deprecation-warning.cpp)
-
-set(TARGET llama-minicpmv-cli)
-add_executable(${TARGET} minicpmv-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_executable(llama-llava-cli deprecation-warning.cpp)
+add_executable(llama-gemma3-cli deprecation-warning.cpp)
+add_executable(llama-minicpmv-cli deprecation-warning.cpp)

 set(TARGET llama-qwen2vl-cli)
 add_executable(${TARGET} qwen2vl-cli.cpp)

examples/llava/mtmd-cli.cpp

Lines changed: 7 additions & 0 deletions

@@ -85,6 +85,13 @@ struct mtmd_cli_context {
         batch   = llama_batch_init(params.n_batch, 0, 1);
         n_batch = params.n_batch;

+        if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
+            LOG_ERR("Model does not have chat template.\n");
+            LOG_ERR("  For old llava models, you may need to use '--chat-template vicuna'\n");
+            LOG_ERR("  For MobileVLM models, use '--chat-template deepseek'\n");
+            exit(1);
+        }
+
         tmpls = common_chat_templates_init(model, params.chat_template);
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str());

examples/llava/mtmd.cpp

Lines changed: 179 additions & 24 deletions

@@ -12,6 +12,15 @@
 #include <limits>
 #include <vector>

+// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
+// models not having it (llava-1.6) will process embeddings without any special tokens in-between
+enum mtmd_slice_tmpl {
+    MTMD_SLICE_TMPL_NONE,
+    MTMD_SLICE_TMPL_MINICPMV_2_5,
+    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    // TODO @ngxson : add support for idefics (SmolVLM)
+};
+
 struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
@@ -21,6 +30,16 @@ struct mtmd_context {
     int n_threads;
     std::string image_marker;

+    // for minicpmv, we need special tokens in-between slices
+    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
+    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+
     // TODO @ngxson : add timings

     mtmd_context(const char * mmproj_fname,
@@ -38,11 +57,64 @@ struct mtmd_context {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
         this->text_model = text_model;
+
+        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+        if (minicpmv_version == 2) {
+            // minicpmv 2.5 format:
+            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_slices_start  = lookup_token("<slice>");
+            tok_slices_end    = lookup_token("</slice>");
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end   = tok_ov_img_end;
+            tok_row_end       = lookup_token("\n");
+
+        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+            // minicpmv 2.6 format:
+            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_sli_img_start = lookup_token("<slice>");
+            tok_sli_img_end   = lookup_token("</slice>");
+            tok_row_end       = lookup_token("\n");
+
+        } else if (minicpmv_version != 0) {
+            GGML_ASSERT(false && "unsupported minicpmv version");
+        }
     }

     ~mtmd_context() {
         clip_free(ctx_clip);
     }
+
+private:
+    llama_token lookup_token(const std::string & token_text) {
+        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+        for (int i = 0; i < n_vocab; i++) {
+            if (token_to_piece(vocab, i, true) == token_text) {
+                return i;
+            }
+        }
+        return LLAMA_TOKEN_NULL;
+    }
+
+    std::string token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
+        std::string piece;
+        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
+        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        if (n_chars < 0) {
+            piece.resize(-n_chars);
+            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+            GGML_ASSERT(check == -n_chars);
+        } else {
+            piece.resize(n_chars);
+        }
+        return piece;
+    }
 };

 struct mtmd_image_tokens_data {
@@ -122,6 +194,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

     size_t i_img = 0;

+    // utility for adding raw tokens
+    auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            {},
+        };
+        output.emplace_back(std::move(chunk));
+    };
+
+    // utility for splitting batch of multiple images into chunks of batch having single images
+    auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_patches(ctx->ctx_clip);
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {},
+                std::move(image_tokens),
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    };
+
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
@@ -144,12 +248,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 1;
             }

-            // shim layer
+            // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
             img_u8->nx = bitmaps[i_img].nx;
             img_u8->ny = bitmaps[i_img].ny;
             img_u8->buf.resize(bitmaps[i_img].data.size());
             std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
+            clip_image_size img_u8_size{img_u8->nx, img_u8->ny};

             // preprocess image
             clip_image_f32_batch batch_f32;
@@ -159,28 +264,70 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 2;
             }

-            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image
-            image_tokens->ny = 1; // TODO
-            image_tokens->batch_f32 = std::move(batch_f32);
-            image_tokens->id = bitmaps[i_img].id; // optional
-
-            LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
-            LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
-            LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
-
-            if (clip_is_glm(ctx->ctx_clip)) {
-                // glm-edge
-                image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
+            if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
+                // split batch into chunks of single images
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
+                GGML_ASSERT(chunks.size() > 0);
+
+                // add overview image
+                add_text_chunk({ctx->tok_ov_img_start});
+                output.emplace_back(std::move(chunks.front()));
+                chunks.erase(chunks.begin());
+                add_text_chunk({ctx->tok_ov_img_end});
+
+                // add slices
+                if (!chunks.empty()) {
+                    clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
+                    int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
+                    int n_row = (int)chunks.size() / n_col;
+                    GGML_ASSERT(n_row * n_col == (int)chunks.size());
+                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_slices_start});
+                    }
+                    for (int y = 0; y < n_row; y++) {
+                        for (int x = 0; x < n_col; x++) {
+                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_start});
+                            }
+                            output.emplace_back(std::move(chunks[y * n_col + x]));
+                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_end});
+                            }
+                        }
+                        if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
+                            add_text_chunk({ctx->tok_row_end});
+                        }
+                    }
+                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_slices_end});
+                    }
+                }
+
+            } else {
+                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+                image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image
+                image_tokens->ny = 1; // TODO
+                image_tokens->batch_f32 = std::move(batch_f32);
+                image_tokens->id = bitmaps[i_img].id; // optional
+
+                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
+                if (clip_is_glm(ctx->ctx_clip)) {
+                    // glm-edge
+                    image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
+                }
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                    {},
+                    std::move(image_tokens),
+                };
+                output.emplace_back(std::move(chunk));
             }

-            mtmd_input_chunk chunk{
-                MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                {},
-                std::move(image_tokens),
-            };
-            output.emplace_back(std::move(chunk));
-            i_img++;
+            i_img++; // move to next image
         }
     }

@@ -214,7 +361,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;

-    if (clip_is_llava(ctx->ctx_clip)) {
+    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
+    {
+        clip_image_size slice_size{
+            image_tokens->batch_f32.entries[0]->nx,
+            image_tokens->batch_f32.entries[0]->ny};
+        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
+    }
+
+    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {
@@ -330,7 +485,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             GGML_ASSERT(chunk.tokens_image != nullptr);
             int64_t t0 = ggml_time_ms();
             if (ctx->print_timings) {
-                LOG_INF("encoding image...\n");
+                LOG_INF("encoding image or slice...\n");
             }
             ret = mtmd_encode(ctx, chunk.tokens_image.get());
             if (ret != 0) {
@@ -339,7 +494,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+                LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }

             int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
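To make the chunk ordering produced by the new mtmd_tokenize() branch easier to follow, here is a small standalone sketch (not part of the commit). It mirrors the row/column wrapping loop above but emits plain strings instead of mtmd_input_chunk objects; the wrap_slices helper, the 2x3 grid size, and the "ov"/"s00".."s12" placeholders are invented for illustration, while the marker layout follows the minicpmv 2.5 / 2.6 format comments in the diff. In the real code the overview image and each slice are separate image chunks interleaved with single-token text chunks, not one string.

// Standalone illustration of the slice-wrapping order (assumption: a 2x3 slice grid).
#include <cstdio>
#include <string>
#include <vector>

enum slice_tmpl { MINICPMV_2_5, MINICPMV_2_6 };

static std::string wrap_slices(slice_tmpl tmpl, const std::string & overview,
                               const std::vector<std::string> & slices, int n_col) {
    // marker strings as looked up by the mtmd_context constructor; for 2.6 the
    // outer <slice>...</slice> pair is absent (LLAMA_TOKEN_NULL in the patch)
    const bool is_25 = tmpl == MINICPMV_2_5;
    const std::string ov_start  = "<image>", ov_end = "</image>";
    const std::string sli_start = is_25 ? "<image>"  : "<slice>";
    const std::string sli_end   = is_25 ? "</image>" : "</slice>";
    const std::string all_start = is_25 ? "<slice>"  : "";
    const std::string all_end   = is_25 ? "</slice>" : "";

    std::string out = ov_start + overview + ov_end; // overview image comes first
    if (!slices.empty()) {
        const int n_row = (int) slices.size() / n_col;
        out += all_start;
        for (int y = 0; y < n_row; y++) {
            for (int x = 0; x < n_col; x++) {
                out += sli_start + slices[y * n_col + x] + sli_end;
            }
            if (y != n_row - 1) {
                out += "\n"; // row separator (the "\n" token), skipped after the last row
            }
        }
        out += all_end;
    }
    return out;
}

int main() {
    std::vector<std::string> slices = {"s00", "s01", "s02", "s10", "s11", "s12"};
    printf("2.5: %s\n", wrap_slices(MINICPMV_2_5, "ov", slices, /*n_col=*/3).c_str());
    printf("2.6: %s\n", wrap_slices(MINICPMV_2_6, "ov", slices, /*n_col=*/3).c_str());
}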

examples/llava/tests.sh

Lines changed: 11 additions & 10 deletions

@@ -28,18 +28,19 @@ add_test() {
     arr_tmpl+=("$tmpl")
 }

-add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" "vicuna"
-add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
-add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
-add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna"
-add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
-add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
-add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
-add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
+add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
+add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
+add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
+add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna"
+add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
+add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
+add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
 add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"

+# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" # this model has broken chat template, not usable
+
 ###############

 cmake --build build -j --target "${arr_bin[@]}"
