Skip to content

Commit 89afda8

Browse files
committed
visual_model warmup (technically) works
1 parent 63a042f commit 89afda8

File tree

3 files changed

+9
-1
lines changed

3 files changed

+9
-1
lines changed

tools/mtmd/clip.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
54125412
case PROJECTOR_TYPE_VOXTRAL:
54135413
case PROJECTOR_TYPE_JANUS_PRO:
54145414
case PROJECTOR_TYPE_COGVLM:
5415+
case PROJECTOR_TYPE_DEEPSEEKOCR:
54155416
{
54165417
// do nothing
54175418
} break;
@@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
55545555
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
55555556
}
55565557

5558+
bool clip_is_deepseekocr(const struct clip_ctx * ctx) { // true iff ctx->proj_type() is PROJECTOR_TYPE_DEEPSEEKOCR
5559+
return ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR;
5560+
}
5561+
55575562
bool clip_has_vision_encoder(const struct clip_ctx * ctx) { // true iff the loaded model's modality is CLIP_MODALITY_VISION
55585563
return ctx->model.modality == CLIP_MODALITY_VISION;
55595564
}

tools/mtmd/clip.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ bool clip_is_glm(const struct clip_ctx * ctx);
105105
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
106106
bool clip_is_llava(const struct clip_ctx * ctx);
107107
bool clip_is_gemma3(const struct clip_ctx * ctx);
108+
bool clip_is_deepseekocr(const struct clip_ctx * ctx);
109+
108110

109111
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
110112

tools/mtmd/mtmd.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
810810

811811
if (clip_is_llava(ctx_clip)
812812
|| clip_is_minicpmv(ctx_clip)
813-
|| clip_is_glm(ctx_clip)) {
813+
|| clip_is_glm(ctx_clip)
814+
|| clip_is_deepseekocr(ctx_clip)) {
814815
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
815816
const auto & entries = image_tokens->batch_f32.entries;
816817
for (size_t i = 0; i < entries.size(); i++) {

0 commit comments

Comments
 (0)