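Slim down mtmd API: remove copy helpers, make mtmd_image_tokens_free file-local, decode image chunks via mtmd_helper_decode_image_chunk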

mattjcly · mattjcly · commit 816a37520f80 · 2025-05-08T12:44:23.000-04:00
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -167,7 +167,7 @@ struct mtmd_image_tokens {
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
-    mtmd_image_tokens clone() const {
+    mtmd_image_tokens clone() {
         return mtmd_image_tokens{
             nx,
             ny,
@@ -409,6 +409,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     return 0;
 }
 
+// File-local helper that destroys an mtmd_image_tokens object.
+// A null pointer is accepted: `delete` on a null pointer is a no-op in C++.
+static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+    delete image_tokens;
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -448,23 +454,6 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out) {
-    if (ctx->image_embd_v.empty()) {
-        *n_embd_out = 0;
-        return NULL;
-    }
-
-    *n_embd_out = ctx->image_embd_v.size();
-    float * copy = (float *) malloc(*n_embd_out * sizeof(float));
-    if (copy == NULL) {
-        *n_embd_out = 0;
-        return NULL;
-    }
-
-    memcpy(copy, ctx->image_embd_v.data(), ctx->image_embd_v.size() * sizeof(float));
-    return copy;
-}
-
 size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
     size_t n_tokens = 0;
     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
@@ -592,15 +581,26 @@ struct decode_embd_batch {
 };
 
 // Helper function for decoding an image whose embeddings have already been calculated
-int32_t mtmd_helper_decode_image(
+int32_t mtmd_helper_decode_image_chunk(
         mtmd_context * ctx,
         struct llama_context * lctx,
-        const mtmd_image_tokens * image_tokens,
+        const mtmd_input_chunk * chunk,
         float * embd,
         llama_pos n_past,
         llama_seq_id seq_id,
         int32_t n_batch,
         llama_pos * new_n_past) {
+
+    if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
+        return -1;
+    }
+    if (!chunk->tokens_image) {
+        LOG_ERR("failed to decode image chunk: image tokens are null\n");
+        return -1;
+    }
+    const auto image_tokens = chunk->tokens_image.get();
+
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
 
@@ -710,7 +710,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
             LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
         }
         float * embd = mtmd_get_output_embd(ctx);
-        ret = mtmd_helper_decode_image(ctx, lctx, image_tokens, embd, n_past, seq_id, n_batch, new_n_past);
+        ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
         if (ret != 0) {
             LOG_ERR("failed to decode image\n");
             llama_batch_free(text_batch);
@@ -931,19 +931,6 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
 
-void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
-    if (image_tokens) {
-        delete image_tokens;
-    }
-}
-
-mtmd_image_tokens * mtmd_image_tokens_copy(const mtmd_image_tokens * image_tokens) {
-    if (!image_tokens) {
-        return nullptr;
-    }
-    return new mtmd_image_tokens(image_tokens->clone());
-}
-
 // test function
 
 mtmd_input_chunks * mtmd_test_create_input_chunks() {
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
@@ -143,14 +143,12 @@ MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 //
 // the instance will be constructed via mtmd_tokenize()
 // it will be freed along with mtmd_input_chunk
-MTMD_API size_t              mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens);
-MTMD_API size_t              mtmd_image_tokens_get_nx       (const mtmd_image_tokens * image_tokens);
-MTMD_API size_t              mtmd_image_tokens_get_ny       (const mtmd_image_tokens * image_tokens);
-MTMD_API const char *        mtmd_image_tokens_get_id       (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens);
 // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
-MTMD_API llama_pos           mtmd_image_tokens_get_n_pos    (const mtmd_image_tokens * image_tokens);
-MTMD_API mtmd_image_tokens * mtmd_image_tokens_copy         (const mtmd_image_tokens * image_tokens);
-MTMD_API void                mtmd_image_tokens_free         (mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens);
 
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
@@ -180,9 +178,6 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
-// returns a copy of output embeddings from the last encode pass, of size n_embd_out
-MTMD_API float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out);
-
 /////////////////////////////////////////
 
 //
@@ -237,14 +232,15 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                                llama_pos * new_n_past);
 
 // helper function to decode an image whose embeddings have already been calculated
-MTMD_API int32_t mtmd_helper_decode_image(mtmd_context *ctx,
-                                          struct llama_context *lctx,
-                                          const mtmd_image_tokens *image_tokens,
-                                          float *embd,
-                                          llama_pos n_past,
-                                          llama_seq_id seq_id,
-                                          int32_t n_batch,
-                                          llama_pos *new_n_past);
+// returns 0 on success, -1 if chunk is not a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
 
 /////////////////////////////////////////
 
@@ -283,11 +279,6 @@ struct mtmd_input_chunk_deleter {
 };
 using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
 
-struct mtmd_image_tokens_deleter {
-    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
-};
-using image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
-
 struct bitmap {
     bitmap_ptr ptr;
     bitmap() : ptr(nullptr) {}