Skip to content

Commit 3f71188

Browse files
committed
mtmd: correct token order
1 parent 4cfa15f commit 3f71188

File tree

4 files changed

+20
-3
lines changed

4 files changed

+20
-3
lines changed

src/llama-vocab.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2347,6 +2347,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
23472347
|| t.first == "_<EOT>"
23482348
|| t.first == "<|end_of_text|>"
23492349
|| t.first == "<end_of_utterance>" // smoldocling
2350+
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
23502351
) {
23512352
special_eog_ids.insert(t.second);
23522353
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

tools/mtmd/mtmd-cli.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,14 +222,18 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
222222

223223
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
224224
bool add_bos = ctx.chat_history.empty();
225-
auto formatted_chat = chat_add_and_format(ctx, msg);
226-
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
227225

228226
mtmd_input_text text;
229-
text.text = formatted_chat.c_str();
227+
text.text = msg.content.c_str();
230228
text.add_special = add_bos;
231229
text.parse_special = true;
232230

231+
if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
232+
auto formatted_chat = chat_add_and_format(ctx, msg);
233+
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
234+
text.text = formatted_chat.c_str();
235+
}
236+
233237
if (g_is_interrupted) return 0;
234238

235239
mtmd::input_chunks chunks(mtmd_input_chunks_init());
@@ -332,6 +336,11 @@ int main(int argc, char ** argv) {
332336
}
333337

334338
} else {
339+
if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
340+
LOG_ERR("\n DeepSeek-OCR doesn't support chat mode.");
341+
return 1;
342+
}
343+
335344
LOG("\n Running in chat mode, available commands:");
336345
if (mtmd_support_vision(ctx.ctx_vision.get())) {
337346
LOG("\n /image <path> load an image");

tools/mtmd/mtmd.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,10 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
864864
return 16000; // 16kHz
865865
}
866866

867+
// Reports whether the vision encoder attached to `ctx` is a DeepSeek-OCR model.
// A context without a vision encoder (`ctx_v` unset) is never DeepSeek-OCR.
bool mtmd_is_deepseekocr(mtmd_context * ctx) {
    if (!ctx->ctx_v) {
        return false;
    }
    return clip_is_deepseekocr(ctx->ctx_v);
}
870+
867871
//
868872
// public API functions
869873
//

tools/mtmd/mtmd.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,9 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
117117
// return -1 if audio is not supported
118118
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
119119

120+
// whether the current model is DeepSeek-OCR
// returns false if the context has no vision encoder
121+
MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx);
122+
120123
// mtmd_bitmap
121124
//
122125
// if bitmap is image:

0 commit comments

Comments
 (0)