visual_model warmup (technically) works: add clip_is_deepseekocr() and encode DeepSeek-OCR images one entry at a time in mtmd_encode

sfallah · sfallah · commit 89afda8da900 · 2025-11-18T10:26:32.000+01:00
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 // do nothing
             } break;
@@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
 }
 
+bool clip_is_deepseekocr(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_VISION;
 }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -105,6 +105,8 @@ bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
+bool clip_is_deepseekocr(const struct clip_ctx * ctx);
+
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -810,7 +810,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
 
     if (clip_is_llava(ctx_clip)
         || clip_is_minicpmv(ctx_clip)
-        || clip_is_glm(ctx_clip)) {
+        || clip_is_glm(ctx_clip)
+        || clip_is_deepseekocr(ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {

Original file line number	Diff line number	Diff line change
`@@ -5412,6 +5412,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima`
`5412`	`5412`	`case PROJECTOR_TYPE_VOXTRAL:`
`5413`	`5413`	`case PROJECTOR_TYPE_JANUS_PRO:`
`5414`	`5414`	`case PROJECTOR_TYPE_COGVLM:`
	`5415`	`+ case PROJECTOR_TYPE_DEEPSEEKOCR:`
`5415`	`5416`	`{`
`5416`	`5417`	`// do nothing`
`5417`	`5418`	`} break;`
`@@ -5554,6 +5555,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {`
`5554`	`5555`	`return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;`
`5555`	`5556`	`}`
`5556`	`5557`
	`5558`	`+bool clip_is_deepseekocr(const struct clip_ctx * ctx) {`
	`5559`	`+ return ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR;`
	`5560`	`+}`
	`5561`	`+`
`5557`	`5562`	`bool clip_has_vision_encoder(const struct clip_ctx * ctx) {`
`5558`	`5563`	`return ctx->model.modality == CLIP_MODALITY_VISION;`
`5559`	`5564`	`}`