Skip to content

Commit a6625fa

Browse files
committed
change name llava2 --> mtmd
1 parent 117bf73 commit a6625fa

File tree

6 files changed

+87
-87
lines changed

6 files changed

+87
-87
lines changed

examples/llava/CMakeLists.txt

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,41 +24,41 @@ if (BUILD_SHARED_LIBS)
2424
install(TARGETS llava_shared LIBRARY)
2525
endif()
2626

27-
# llava2
27+
# mtmd
2828

29-
add_library(llava2 OBJECT
30-
llava2.cpp
31-
llava2.h
29+
add_library(mtmd OBJECT
30+
mtmd.cpp
31+
mtmd.h
3232
clip.cpp
3333
clip.h
3434
clip-impl.h
3535
)
3636

37-
target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
37+
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
3838

39-
target_include_directories(llava2 PUBLIC .)
40-
target_include_directories(llava2 PUBLIC ../..)
41-
target_include_directories(llava2 PUBLIC ../../common) # for stb_image.h
39+
target_include_directories(mtmd PUBLIC .)
40+
target_include_directories(mtmd PUBLIC ../..)
41+
target_include_directories(mtmd PUBLIC ../../common) # for stb_image.h
4242

43-
target_compile_features(llava2 PRIVATE cxx_std_17)
43+
target_compile_features(mtmd PRIVATE cxx_std_17)
4444

45-
add_library(llava2_static STATIC $<TARGET_OBJECTS:llava2>)
45+
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
4646
if (BUILD_SHARED_LIBS)
47-
set_target_properties(llava2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
48-
target_compile_definitions(llava2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
49-
add_library(llava2_shared SHARED $<TARGET_OBJECTS:llava2>)
50-
target_link_libraries(llava2_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
51-
install(TARGETS llava2_shared LIBRARY)
47+
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
48+
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
49+
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
50+
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
51+
install(TARGETS mtmd_shared LIBRARY)
5252
endif()
5353

5454
if (NOT MSVC)
5555
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
56-
target_compile_options(llava2 PRIVATE -Wno-cast-qual) # stb_image.h
56+
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
5757
endif()
5858

5959
if(TARGET BUILD_INFO)
6060
add_dependencies(llava BUILD_INFO)
61-
add_dependencies(llava2 BUILD_INFO)
61+
add_dependencies(mtmd BUILD_INFO)
6262
endif()
6363

6464
set(TARGET llama-llava-cli)
@@ -86,7 +86,7 @@ set(TARGET llama-gemma3-cli)
8686
add_executable(${TARGET} gemma3-cli.cpp)
8787
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
8888
install(TARGETS ${TARGET} RUNTIME)
89-
target_link_libraries(${TARGET} PRIVATE common llava2 ${CMAKE_THREAD_LIBS_INIT})
89+
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
9090
target_compile_features(${TARGET} PRIVATE cxx_std_17)
9191

9292
set(TARGET llama-llava-clip-quantize-cli)

examples/llava/clip-impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
329329
}
330330

331331
//
332-
// API used internally with llava2
332+
// API used internally with mtmd
333333
//
334334

335335
projector_type clip_get_projector_type(const struct clip_ctx * ctx);

examples/llava/clip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2886,7 +2886,7 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
28862886
}
28872887

28882888
//
2889-
// API used internally with llava2
2889+
// API used internally with mtmd
28902890
//
28912891

28922892
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {

examples/llava/gemma3-cli.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#include "ggml.h"
77
#include "console.h"
88
#include "chat.h"
9-
#include "llava2.h"
9+
#include "mtmd.h"
1010

1111
#include <vector>
1212
#include <limits.h>
@@ -57,7 +57,7 @@ static void sigint_handler(int signo) {
5757
#endif
5858

5959
struct gemma3_context {
60-
llava2_context_ptr ctx_vision;
60+
mtmd_context_ptr ctx_vision;
6161
common_init_result llama_init;
6262

6363
llama_model * model;
@@ -86,7 +86,7 @@ struct gemma3_context {
8686

8787
void init_vision_context(common_params & params) {
8888
const char * clip_path = params.mmproj.path.c_str();
89-
ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
89+
ctx_vision = mtmd_init_from_file(clip_path, model, mtmd_context_params{
9090
/* use_gpu */ true,
9191
/* timings */ true,
9292
/* n_threads */ params.cpuparams.n_threads,
@@ -162,7 +162,7 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
162162
}
163163

164164
static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
165-
std::vector<llava2_bitmap> bitmaps;
165+
std::vector<mtmd_bitmap> bitmaps;
166166

167167
common_chat_templates_inputs tmpl_inputs;
168168
tmpl_inputs.messages = {msg};
@@ -172,30 +172,30 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
172172
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
173173

174174
for (auto & fname : images_fname) {
175-
llava2_bitmap bitmap;
176-
if (llava2_bitmap_init_from_file(fname.c_str(), bitmap)) {
175+
mtmd_bitmap bitmap;
176+
if (mtmd_bitmap_init_from_file(fname.c_str(), bitmap)) {
177177
LOG_ERR("Unable to load image %s\n", fname.c_str());
178178
return 2; // image not found
179179
}
180180
bitmaps.push_back(std::move(bitmap));
181181
}
182182

183-
std::vector<llava2_input_chunk> chunks;
184-
llava2_input_text text;
183+
std::vector<mtmd_input_chunk> chunks;
184+
mtmd_input_text text;
185185
text.text = formatted_chat.prompt;
186186
text.add_special = add_bos;
187187
text.parse_special = true;
188-
if (llava2_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
188+
if (mtmd_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
189189
LOG_ERR("Unable to tokenize prompt\n");
190190
return 1;
191191
}
192192

193-
if (llava2_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
193+
if (mtmd_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
194194
LOG_ERR("Unable to eval prompt\n");
195195
return 1;
196196
}
197197

198-
ctx.n_past += llava2_helper_get_n_tokens(chunks);
198+
ctx.n_past += mtmd_helper_get_n_tokens(chunks);
199199

200200
return 0;
201201
}

examples/llava/llava2.cpp renamed to examples/llava/mtmd.cpp

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "clip.h"
22
#include "clip-impl.h"
3-
#include "llava2.h"
3+
#include "mtmd.h"
44

55
#include "llama.h"
66

@@ -12,7 +12,7 @@
1212
#include <limits>
1313
#include <vector>
1414

15-
struct llava2_context {
15+
struct mtmd_context {
1616
struct clip_ctx * ctx_clip;
1717
const struct llama_model * text_model;
1818
std::vector<float> image_embd_v; // image embedding vector
@@ -22,9 +22,9 @@ struct llava2_context {
2222

2323
// TODO @ngxson : add timings
2424

25-
llava2_context(const char * mmproj_fname,
25+
mtmd_context(const char * mmproj_fname,
2626
const struct llama_model * text_model,
27-
const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
27+
const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
2828
clip_context_params ctx_clip_params;
2929
ctx_clip_params.use_gpu = ctx_params.use_gpu;
3030
ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -35,28 +35,28 @@ struct llava2_context {
3535
this->text_model = text_model;
3636
}
3737

38-
~llava2_context() {
38+
~mtmd_context() {
3939
clip_free(ctx_clip);
4040
}
4141
};
4242

43-
struct llava2_image_tokens_data {
43+
struct mtmd_image_tokens_data {
4444
clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
4545
};
4646

47-
llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
47+
mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
4848
const struct llama_model * text_model,
49-
const struct llava2_context_params ctx_params) {
49+
const struct mtmd_context_params ctx_params) {
5050
try {
51-
auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
51+
auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
5252
return ctx;
5353
} catch (const std::exception & e) {
5454
LOG_ERR("%s: error: %s\n", __func__, e.what());
5555
return nullptr;
5656
}
5757
}
5858

59-
int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
59+
int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
6060
clip_image_u8_ptr img_u8(clip_image_u8_init());
6161
bool ok = clip_image_load_from_file(fname, img_u8.get());
6262
if (!ok) {
@@ -70,7 +70,7 @@ int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output)
7070
}
7171

7272
// copied from common_tokenize
73-
static std::vector<llama_token> llava2_tokenize_text_internal(
73+
static std::vector<llama_token> mtmd_tokenize_text_internal(
7474
const struct llama_vocab * vocab,
7575
const std::string & text,
7676
bool add_special,
@@ -89,10 +89,10 @@ static std::vector<llama_token> llava2_tokenize_text_internal(
8989
return result;
9090
}
9191

92-
int32_t llava2_tokenize(llava2_context_ptr & ctx,
93-
std::vector<llava2_input_chunk> & output,
94-
const llava2_input_text & text,
95-
const std::vector<llava2_bitmap> & bitmaps) {
92+
int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
93+
std::vector<mtmd_input_chunk> & output,
94+
const mtmd_input_text & text,
95+
const std::vector<mtmd_bitmap> & bitmaps) {
9696
auto vocab = llama_model_get_vocab(ctx->text_model);
9797

9898
std::string prompt_modified(text.text);
@@ -115,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
115115
for (const auto & part : parts) {
116116
//printf("tokenizing part: %s\n", part.c_str());
117117
bool add_bos = &parts.front() == &part;
118-
auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
118+
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
119119
if (tokens.empty()) {
120120
continue;
121121
}
@@ -148,12 +148,12 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
148148
return 1;
149149
}
150150

151-
llava2_image_tokens image_tokens;
151+
mtmd_image_tokens image_tokens;
152152
image_tokens.nx = 0; // TODO
153153
image_tokens.ny = 0; // TODO
154154
image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
155-
image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
156-
new llava2_image_tokens_data{
155+
image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
156+
new mtmd_image_tokens_data{
157157
std::move(batch_f32),
158158
}
159159
);
@@ -170,8 +170,8 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
170170
return 0;
171171
}
172172

173-
LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
174-
const llava2_image_tokens & image_tokens) {
173+
LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
174+
const mtmd_image_tokens & image_tokens) {
175175
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
176176
ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
177177
bool ok = clip_image_batch_encode(
@@ -182,11 +182,11 @@ LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
182182
return ok ? 0 : 1;
183183
}
184184

185-
LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
185+
LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
186186
return ctx->image_embd_v.data();
187187
}
188188

189-
size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
189+
size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
190190
size_t n_tokens = 0;
191191
for (auto & chunk : chunks) {
192192
if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
@@ -235,9 +235,9 @@ struct decode_embd_batch {
235235
}
236236
};
237237

238-
int32_t llava2_helper_eval(llava2_context_ptr & ctx,
238+
int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
239239
llama_context * lctx,
240-
std::vector<llava2_input_chunk> & chunks,
240+
std::vector<mtmd_input_chunk> & chunks,
241241
llama_pos pos0,
242242
llama_seq_id seq_id,
243243
int32_t n_batch) {
@@ -274,7 +274,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
274274
if (ctx->print_timings) {
275275
LOG_INF("encoding image...\n");
276276
}
277-
ret = llava2_encode(ctx, chunk.tokens_image);
277+
ret = mtmd_encode(ctx, chunk.tokens_image);
278278
if (ret != 0) {
279279
LOG_ERR("failed to encode image\n");
280280
llama_batch_free(text_batch);
@@ -285,7 +285,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
285285
}
286286

287287
int32_t n_tokens = chunk.tokens_image.n_tokens;
288-
float * embd = llava2_get_output_embd(ctx);
288+
float * embd = mtmd_get_output_embd(ctx);
289289
decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
290290
int64_t t1 = ggml_time_ms();
291291
ret = llama_decode(lctx, batch_img.batch);

0 commit comments

Comments
 (0)