@@ -16,6 +16,7 @@ struct mtmd_context {
1616 struct clip_ctx * ctx_clip;
1717 const struct llama_model * text_model;
1818 std::vector<float > image_embd_v; // image embedding vector
19+
1920 bool print_timings;
2021 int n_threads;
2122 std::string image_marker;
@@ -24,7 +25,11 @@ struct mtmd_context {
2425
2526 mtmd_context (const char * mmproj_fname,
2627 const llama_model * text_model,
27- const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
28+ const mtmd_context_params & ctx_params) :
29+ print_timings (ctx_params.print_timings),
30+ n_threads (ctx_params.n_threads),
31+ image_marker (ctx_params.image_marker)
32+ {
2833 clip_context_params ctx_clip_params;
2934 ctx_clip_params.use_gpu = ctx_params.use_gpu ;
3035 ctx_clip_params.verbosity = ctx_params.verbosity ;
@@ -49,6 +54,7 @@ struct mtmd_image_tokens {
4954 uint32_t ny; // number of tokens in y direction
5055 uint32_t n_tokens () const { return nx * ny; }
5156 clip_image_f32_batch batch_f32; // preprocessed image patches
57+ std::string id; // optional user-defined ID, useful for KV cache tracking
5258};
5359
5460mtmd_context * mtmd_init_from_file (const char * mmproj_fname,
@@ -88,10 +94,10 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
8894 return result;
8995}
9096
91- mtmd_input_chunks * mtmd_tokenize (mtmd_context * ctx,
92- const mtmd_input_text & text ,
93- const std::vector<mtmd_bitmap> & bitmaps) {
94- mtmd_input_chunks * output = new mtmd_input_chunks;
97+ int32_t mtmd_tokenize (mtmd_context * ctx,
98+ std::vector<mtmd_input_chunk> & output ,
99+ const mtmd_input_text & text,
100+ const std::vector<mtmd_bitmap> & bitmaps) {
95101 auto vocab = llama_model_get_vocab (ctx->text_model );
96102
97103 std::string prompt_modified (text.text );
@@ -105,9 +111,9 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
105111 string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
106112 }
107113
108- std::vector<std::string> parts = string_split_str (text. text , ctx->image_marker );
109- output-> clear ();
110- output-> reserve (parts.size ());
114+ std::vector<std::string> parts = string_split_str (prompt_modified , ctx->image_marker );
115+ output. clear ();
116+ output. reserve (parts.size ());
111117
112118 size_t i_img = 0 ;
113119
@@ -123,14 +129,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
123129 std::move (tokens),
124130 {},
125131 };
126- output-> emplace_back (std::move (chunk));
132+ output. emplace_back (std::move (chunk));
127133
128134 if (&parts.back () != &part) {
129135 // add image token to middle of 2 parts
130136
131137 if (i_img >= bitmaps.size ()) {
132138 LOG_ERR (" %s: error: not enough images for %d parts\n " , __func__, (int )parts.size ());
133- return nullptr ;
139+ return 1 ;
134140 }
135141
136142 // shim layer
@@ -145,34 +151,48 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
145151 bool ok = clip_image_preprocess (ctx->ctx_clip , img_u8.get (), &batch_f32);
146152 if (!ok) {
147153 LOG_ERR (" Unable to preprocess image\n " );
148- return nullptr ;
154+ return 2 ;
149155 }
150156
151- mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
157+ mtmd_image_tokens_ptr image_tokens ( new mtmd_image_tokens) ;
152158 image_tokens->nx = clip_n_patches (ctx->ctx_clip ); // TODO @ngxson : use clip_n_patches_by_image
153159 image_tokens->ny = 1 ; // TODO
154160 image_tokens->batch_f32 = std::move (batch_f32);
161+ image_tokens->id = bitmaps[i_img].id ; // optional
155162
156163 mtmd_input_chunk chunk{
157164 MTMD_INPUT_CHUNK_TYPE_IMAGE,
158165 {},
159- image_tokens,
166+ std::move ( image_tokens) ,
160167 };
161- output-> emplace_back (std::move (chunk));
168+ output. emplace_back (std::move (chunk));
162169 i_img++;
163170 }
164171 }
165172
166- return output ;
173+ return 0 ;
167174}
168175
169- void mtmd_input_chunks_free (mtmd_input_chunks * chunks) {
170- for (auto & chunk : *chunks) {
171- if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image ) {
172- delete chunk.tokens_image ;
173- }
// Frees an mtmd_image_tokens object with `delete`.
// A null pointer is accepted and ignored (the body is guarded by a null check).
176+ void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens) {
177+ if (image_tokens) {
178+ delete image_tokens;
174179 }
175- delete chunks;
180+ }
181+
// Returns the total number of tokens of the image, as reported by
// mtmd_image_tokens::n_tokens() (nx * ny).
// image_tokens is dereferenced without a null check, so it must be non-null.
182+ size_t mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens) {
183+ return image_tokens->n_tokens ();
184+ }
185+
// Returns the nx field of the image tokens, widened to size_t.
// NOTE(review): nx is presumably the number of tokens in the x direction,
// mirroring the documented ny field - confirm against the struct definition.
// image_tokens is dereferenced without a null check, so it must be non-null.
186+ size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens) {
187+ return image_tokens->nx ;
188+ }
189+
// Returns the ny field of the image tokens (number of tokens in the y
// direction), widened to size_t.
// image_tokens is dereferenced without a null check, so it must be non-null.
190+ size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens) {
191+ return image_tokens->ny ;
192+ }
193+
// Returns a copy of the optional user-defined ID stored in the image tokens
// (the `id` field, described in the struct as useful for KV cache tracking).
// The string is returned by value, so each call copies it.
// image_tokens is dereferenced without a null check, so it must be non-null.
194+ std::string mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens) {
195+ return image_tokens->id ;
176196}
177197
178198int32_t mtmd_encode (mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
@@ -190,9 +210,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
190210 return ctx->image_embd_v .data ();
191211}
192212
193- size_t mtmd_helper_get_n_tokens (mtmd_input_chunks * chunks) {
213+ size_t mtmd_helper_get_n_tokens (mtmd_input_chunks & chunks) {
194214 size_t n_tokens = 0 ;
195- for (auto & chunk : * chunks) {
215+ for (auto & chunk : chunks) {
196216 if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
197217 n_tokens += chunk.tokens_text .size ();
198218 } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
@@ -241,16 +261,16 @@ struct decode_embd_batch {
241261
242262int32_t mtmd_helper_eval (mtmd_context * ctx,
243263 llama_context * lctx,
244- mtmd_input_chunks * chunks,
264+ mtmd_input_chunks & chunks,
245265 llama_pos pos0,
246266 llama_seq_id seq_id,
247267 int32_t n_batch) {
248268 int32_t ret;
249269 llama_pos n_past = pos0;
250270 llama_batch text_batch = llama_batch_init (n_batch, 0 , 1 );
251271
252- for (auto & chunk : * chunks) {
253- bool is_last = &chunk == &chunks-> back ();
272+ for (auto & chunk : chunks) {
273+ bool is_last = &chunk == &chunks. back ();
254274 if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
255275 // TODO @ngxson : may need to split into smaller batches
256276 text_batch.n_tokens = chunk.tokens_text .size ();
@@ -279,7 +299,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
279299 if (ctx->print_timings ) {
280300 LOG_INF (" encoding image...\n " );
281301 }
282- ret = mtmd_encode (ctx, chunk.tokens_image );
302+ ret = mtmd_encode (ctx, chunk.tokens_image . get () );
283303 if (ret != 0 ) {
284304 LOG_ERR (" failed to encode image\n " );
285305 llama_batch_free (text_batch);
@@ -289,7 +309,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
289309 LOG_INF (" image encoded in %" PRId64 " ms\n " , ggml_time_ms () - t0);
290310 }
291311
292- int32_t n_tokens = chunk.tokens_image -> n_tokens ( );
312+ int32_t n_tokens = mtmd_image_tokens_get_n_tokens ( chunk.tokens_image . get () );
293313 float * embd = mtmd_get_output_embd (ctx);
294314 decode_embd_batch batch_img (embd, n_tokens, n_past, 0 );
295315 int64_t t1 = ggml_time_ms ();
@@ -339,3 +359,15 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp
339359 std::memcpy (output.data .data (), data, output.nx * output.ny * 3 );
340360 return 0 ;
341361}
362+
// Returns true only when the projector type of ctx->ctx_clip is
// PROJECTOR_TYPE_GEMMA3, and false for every other projector type.
// NOTE(review): the name suggests callers should switch to non-causal
// decoding when this returns true - confirm against the call sites.
// ctx is dereferenced without a null check, so it must be non-null.
363+ bool mtmd_decode_use_non_causal (mtmd_context * ctx) {
364+ projector_type proj_type = clip_get_projector_type (ctx->ctx_clip );
365+ if (proj_type == PROJECTOR_TYPE_GEMMA3) {
366+ return true ;
367+ }
368+ return false ;
369+ }
370+
// Call operator of the mtmd_image_tokens_deleter functor: forwards the
// pointer to mtmd_image_tokens_free.
// NOTE(review): presumably this is the deleter type behind
// mtmd_image_tokens_ptr - confirm against the header.
371+ void mtmd_image_tokens_deleter::operator ()(mtmd_image_tokens * val) {
372+ mtmd_image_tokens_free (val);
373+ }
0 commit comments