Skip to content

Commit 816a375

Browse files
committed
Slim down
1 parent 227e139 commit 816a375

File tree

2 files changed

+35
-57
lines changed

2 files changed

+35
-57
lines changed

tools/mtmd/mtmd.cpp

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ struct mtmd_image_tokens {
167167
clip_image_f32_batch batch_f32; // preprocessed image patches
168168
std::string id; // optional user-defined ID, useful for KV cache tracking
169169

170-
mtmd_image_tokens clone() const {
170+
mtmd_image_tokens clone() {
171171
return mtmd_image_tokens{
172172
nx,
173173
ny,
@@ -409,6 +409,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
409409
return 0;
410410
}
411411

412+
static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
413+
if (image_tokens) {
414+
delete image_tokens;
415+
}
416+
}
417+
412418
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
413419
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
414420
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -448,23 +454,6 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
448454
return ctx->image_embd_v.data();
449455
}
450456

451-
float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out) {
452-
if (ctx->image_embd_v.empty()) {
453-
*n_embd_out = 0;
454-
return NULL;
455-
}
456-
457-
*n_embd_out = ctx->image_embd_v.size();
458-
float * copy = (float *) malloc(*n_embd_out * sizeof(float));
459-
if (copy == NULL) {
460-
*n_embd_out = 0;
461-
return NULL;
462-
}
463-
464-
memcpy(copy, ctx->image_embd_v.data(), ctx->image_embd_v.size() * sizeof(float));
465-
return copy;
466-
}
467-
468457
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
469458
size_t n_tokens = 0;
470459
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
@@ -592,15 +581,26 @@ struct decode_embd_batch {
592581
};
593582

594583
// Helper function for decoding an image whose embeddings have already been calculated
595-
int32_t mtmd_helper_decode_image(
584+
int32_t mtmd_helper_decode_image_chunk(
596585
mtmd_context * ctx,
597586
struct llama_context * lctx,
598-
const mtmd_image_tokens * image_tokens,
587+
const mtmd_input_chunk * chunk,
599588
float * embd,
600589
llama_pos n_past,
601590
llama_seq_id seq_id,
602591
int32_t n_batch,
603592
llama_pos * new_n_past) {
593+
594+
if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
595+
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
596+
return -1;
597+
}
598+
if (!chunk->tokens_image) {
599+
LOG_ERR("failed to decode image chunk: image tokens are null\n");
600+
return -1;
601+
}
602+
const auto image_tokens = chunk->tokens_image.get();
603+
604604
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
605605
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
606606

@@ -710,7 +710,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
710710
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
711711
}
712712
float * embd = mtmd_get_output_embd(ctx);
713-
ret = mtmd_helper_decode_image(ctx, lctx, image_tokens, embd, n_past, seq_id, n_batch, new_n_past);
713+
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
714714
if (ret != 0) {
715715
LOG_ERR("failed to decode image\n");
716716
llama_batch_free(text_batch);
@@ -931,19 +931,6 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
931931
return image_tokens->n_tokens();
932932
}
933933

934-
void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
935-
if (image_tokens) {
936-
delete image_tokens;
937-
}
938-
}
939-
940-
mtmd_image_tokens * mtmd_image_tokens_copy(const mtmd_image_tokens * image_tokens) {
941-
if (!image_tokens) {
942-
return nullptr;
943-
}
944-
return new mtmd_image_tokens(image_tokens->clone());
945-
}
946-
947934
// test function
948935

949936
mtmd_input_chunks * mtmd_test_create_input_chunks() {

tools/mtmd/mtmd.h

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -143,14 +143,12 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
143143
//
144144
// the instance will be constructed via mtmd_tokenize()
145145
// it will be freed along with mtmd_input_chunk
146-
MTMD_API size_t mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens);
147-
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
148-
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
149-
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens);
146+
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
147+
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
148+
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
149+
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens);
150150
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
151-
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens);
152-
MTMD_API mtmd_image_tokens * mtmd_image_tokens_copy (const mtmd_image_tokens * image_tokens);
153-
MTMD_API void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens);
151+
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens);
154152

155153
// tokenize an input text prompt and an image
156154
// the prompt must have the input image marker (default: "<__image__>") in it
@@ -180,9 +178,6 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
180178
// get output embeddings from the last encode pass
181179
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
182180

183-
// returns a copy of output embeddings from the last encode pass, of size n_embd_out
184-
MTMD_API float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out);
185-
186181
/////////////////////////////////////////
187182

188183
//
@@ -237,14 +232,15 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
237232
llama_pos * new_n_past);
238233

239234
// helper function to decode an image whose embeddings have already been calculated
240-
MTMD_API int32_t mtmd_helper_decode_image(mtmd_context *ctx,
241-
struct llama_context *lctx,
242-
const mtmd_image_tokens *image_tokens,
243-
float *embd,
244-
llama_pos n_past,
245-
llama_seq_id seq_id,
246-
int32_t n_batch,
247-
llama_pos *new_n_past);
235+
// returns 0 on success, -1 if the chunk is not a valid image chunk, 1 on decode failure
236+
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context *ctx,
237+
struct llama_context *lctx,
238+
const mtmd_input_chunk * chunk,
239+
float *embd,
240+
llama_pos n_past,
241+
llama_seq_id seq_id,
242+
int32_t n_batch,
243+
llama_pos *new_n_past);
248244

249245
/////////////////////////////////////////
250246

@@ -283,11 +279,6 @@ struct mtmd_input_chunk_deleter {
283279
};
284280
using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
285281

286-
struct mtmd_image_tokens_deleter {
287-
void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
288-
};
289-
using image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
290-
291282
struct bitmap {
292283
bitmap_ptr ptr;
293284
bitmap() : ptr(nullptr) {}

0 commit comments

Comments
 (0)