 #include "clip.h"
 #include "clip-impl.h"
-#include "llava2.h"
+#include "mtmd.h"
 
 #include "llama.h"
 
 #include <limits>
 #include <vector>
 
-struct llava2_context {
+struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
@@ -22,9 +22,9 @@ struct llava2_context {
 
     // TODO @ngxson : add timings
 
-    llava2_context(const char * mmproj_fname,
+    mtmd_context(const char * mmproj_fname,
            const struct llama_model * text_model,
-           const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+           const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
        clip_context_params ctx_clip_params;
        ctx_clip_params.use_gpu = ctx_params.use_gpu;
        ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -35,28 +35,28 @@ struct llava2_context {
        this->text_model = text_model;
     }
 
-    ~llava2_context() {
+    ~mtmd_context() {
        clip_free(ctx_clip);
     }
 };
 
-struct llava2_image_tokens_data {
+struct mtmd_image_tokens_data {
     clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
 };
 
-llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
        const struct llama_model * text_model,
-       const struct llava2_context_params ctx_params) {
+       const struct mtmd_context_params ctx_params) {
     try {
-        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
        return ctx;
     } catch (const std::exception & e) {
        LOG_ERR("%s: error: %s\n", __func__, e.what());
        return nullptr;
     }
 }
 
-int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
     clip_image_u8_ptr img_u8(clip_image_u8_init());
     bool ok = clip_image_load_from_file(fname, img_u8.get());
     if (!ok) {
@@ -70,7 +70,7 @@ int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output)
 }
 
 // copied from common_tokenize
-static std::vector<llama_token> llava2_tokenize_text_internal(
+static std::vector<llama_token> mtmd_tokenize_text_internal(
        const struct llama_vocab * vocab,
        const std::string & text,
        bool add_special,
@@ -89,10 +89,10 @@ static std::vector<llama_token> llava2_tokenize_text_internal(
     return result;
 }
 
-int32_t llava2_tokenize(llava2_context_ptr & ctx,
-        std::vector<llava2_input_chunk> & output,
-        const llava2_input_text & text,
-        const std::vector<llava2_bitmap> & bitmaps) {
+int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
+        std::vector<mtmd_input_chunk> & output,
+        const mtmd_input_text & text,
+        const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -115,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     for (const auto & part : parts) {
        // printf("tokenizing part: %s\n", part.c_str());
        bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
        if (tokens.empty()) {
            continue;
        }
@@ -148,12 +148,12 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
            return 1;
        }
 
-        llava2_image_tokens image_tokens;
+        mtmd_image_tokens image_tokens;
        image_tokens.nx = 0; // TODO
        image_tokens.ny = 0; // TODO
        image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-        image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
-            new llava2_image_tokens_data {
+        image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
+            new mtmd_image_tokens_data {
                std::move(batch_f32),
            }
        );
@@ -170,8 +170,8 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     return 0;
 }
 
-LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
-        const llava2_image_tokens & image_tokens) {
+LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
+        const mtmd_image_tokens & image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
     bool ok = clip_image_batch_encode(
@@ -182,11 +182,11 @@ LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
     return ok ? 0 : 1;
 }
 
-LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
+LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
+size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
     size_t n_tokens = 0;
     for (auto & chunk : chunks) {
        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
@@ -235,9 +235,9 @@ struct decode_embd_batch {
     }
 };
 
-int32_t llava2_helper_eval(llava2_context_ptr & ctx,
+int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
        llama_context * lctx,
-        std::vector<llava2_input_chunk> & chunks,
+        std::vector<mtmd_input_chunk> & chunks,
        llama_pos pos0,
        llama_seq_id seq_id,
        int32_t n_batch) {
@@ -274,7 +274,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
            if (ctx->print_timings) {
                LOG_INF("encoding image...\n");
            }
-            ret = llava2_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image);
            if (ret != 0) {
                LOG_ERR("failed to encode image\n");
                llama_batch_free(text_batch);
@@ -285,7 +285,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
            }
 
            int32_t n_tokens = chunk.tokens_image.n_tokens;
-            float * embd = llava2_get_output_embd(ctx);
+            float * embd = mtmd_get_output_embd(ctx);
            decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
            int64_t t1 = ggml_time_ms();
            ret = llama_decode(lctx, batch_img.batch);
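
For context on how the renamed entry points above fit together (init the projector context from the mmproj file, load a bitmap, tokenize text plus images into chunks, then evaluate the chunks against a llama_context), here is a minimal, hypothetical caller sketch. It is not part of this diff: the image marker string, the zero-initialization of mtmd_context_params, and the error handling are assumptions, and mtmd.h remains the authoritative reference for the API.

// Hypothetical usage sketch, assuming only the API surface shown in this diff.
#include "mtmd.h"
#include "llama.h"

#include <vector>

static int32_t eval_one_image(llama_context * lctx, const llama_model * model,
                              const char * mmproj_path, const char * image_path) {
    mtmd_context_params params{};            // zero-init; only fields shown in the diff are set below
    params.use_gpu       = true;
    params.print_timings = true;
    params.n_threads     = 4;
    params.image_marker  = "<__image__>";    // assumed marker string; use whatever mtmd.h defines

    // load the multimodal projector (mmproj) alongside the text model
    mtmd_context_ptr ctx = mtmd_init_from_file(mmproj_path, model, params);
    if (!ctx) {
        return 1;
    }

    // decode the image file into a bitmap
    mtmd_bitmap bitmap;
    if (mtmd_bitmap_init_from_file(image_path, bitmap) != 0) {
        return 1;
    }
    std::vector<mtmd_bitmap> bitmaps;
    bitmaps.push_back(std::move(bitmap));

    // tokenize a prompt containing the image marker into text/image chunks
    mtmd_input_text text;
    text.text          = "describe this image in detail: <__image__>";
    text.add_special   = true;
    text.parse_special = true;

    std::vector<mtmd_input_chunk> chunks;
    if (mtmd_tokenize(ctx, chunks, text, bitmaps) != 0) {
        return 1;
    }

    // encode the image chunks and decode everything starting at position 0 on sequence 0
    return mtmd_helper_eval(ctx, lctx, chunks, 0, 0, llama_n_batch(lctx));
}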