@@ -167,7 +167,7 @@ struct mtmd_image_tokens {
167167 clip_image_f32_batch batch_f32; // preprocessed image patches
168168 std::string id; // optional user-defined ID, useful for KV cache tracking
169169
170- mtmd_image_tokens clone () const {
170+ mtmd_image_tokens clone () {
171171 return mtmd_image_tokens{
172172 nx,
173173 ny,
@@ -409,6 +409,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
409409 return 0 ;
410410}
411411
412+ static void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens) { // file-local (static) helper that destroys an mtmd_image_tokens object
413+ if (image_tokens) { // a null pointer is accepted and results in no action
414+ delete image_tokens; // NOTE(review): assumes the object was allocated with `new` - confirm at call sites
415+ }
416+ }
417+
412418int32_t mtmd_encode (mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
413419 int n_mmproj_embd = clip_n_mmproj_embd (ctx->ctx_clip );
414420 ctx->image_embd_v .resize (image_tokens->n_tokens () * n_mmproj_embd);
@@ -448,23 +454,6 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
448454 return ctx->image_embd_v .data ();
449455}
450456
451- float * mtmd_get_output_embd_copy (mtmd_context * ctx, size_t * n_embd_out) {
452- if (ctx->image_embd_v .empty ()) {
453- *n_embd_out = 0 ;
454- return NULL ;
455- }
456-
457- *n_embd_out = ctx->image_embd_v .size ();
458- float * copy = (float *) malloc (*n_embd_out * sizeof (float ));
459- if (copy == NULL ) {
460- *n_embd_out = 0 ;
461- return NULL ;
462- }
463-
464- memcpy (copy, ctx->image_embd_v .data (), ctx->image_embd_v .size () * sizeof (float ));
465- return copy;
466- }
467-
468457size_t mtmd_helper_get_n_tokens (const mtmd_input_chunks * chunks) {
469458 size_t n_tokens = 0 ;
470459 for (size_t i = 0 ; i < mtmd_input_chunks_size (chunks); i++) {
@@ -592,15 +581,26 @@ struct decode_embd_batch {
592581};
593582
594583// Helper function for decoding an image whose embeddings have already been calculated
595- int32_t mtmd_helper_decode_image (
584+ int32_t mtmd_helper_decode_image_chunk (
596585 mtmd_context * ctx,
597586 struct llama_context * lctx,
598- const mtmd_image_tokens * image_tokens ,
587+ const mtmd_input_chunk * chunk ,
599588 float * embd,
600589 llama_pos n_past,
601590 llama_seq_id seq_id,
602591 int32_t n_batch,
603592 llama_pos * new_n_past) {
593+
594+ if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
595+ LOG_ERR (" failed to decode image chunk: input chunk not of image type\n " );
596+ return -1 ;
597+ }
598+ if (!chunk->tokens_image ) {
599+ LOG_ERR (" failed to decode image chunk: image tokens are null\n " );
600+ return -1 ;
601+ }
602+ const auto image_tokens = chunk->tokens_image .get ();
603+
604604 int n_mmproj_embd = clip_n_mmproj_embd (ctx->ctx_clip );
605605 int n_pos_per_embd = mtmd_decode_use_mrope (ctx) ? 4 : 1 ;
606606
@@ -710,7 +710,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
710710 LOG_INF (" image/slice encoded in %" PRId64 " ms\n " , ggml_time_ms () - t0);
711711 }
712712 float * embd = mtmd_get_output_embd (ctx);
713- ret = mtmd_helper_decode_image (ctx, lctx, image_tokens , embd, n_past, seq_id, n_batch, new_n_past);
713+ ret = mtmd_helper_decode_image_chunk (ctx, lctx, chunk , embd, n_past, seq_id, n_batch, new_n_past);
714714 if (ret != 0 ) {
715715 LOG_ERR (" failed to decode image\n " );
716716 llama_batch_free (text_batch);
@@ -931,19 +931,6 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
931931 return image_tokens->n_tokens ();
932932}
933933
934- void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens) {
935- if (image_tokens) {
936- delete image_tokens;
937- }
938- }
939-
940- mtmd_image_tokens * mtmd_image_tokens_copy (const mtmd_image_tokens * image_tokens) {
941- if (!image_tokens) {
942- return nullptr ;
943- }
944- return new mtmd_image_tokens (image_tokens->clone ());
945- }
946-
947934// test function
948935
949936mtmd_input_chunks * mtmd_test_create_input_chunks () {
0 commit comments