
Commit 949d32f

correct chat template

1 parent 37b24ec

File tree

5 files changed: +32 -2 lines changed


convert_hf_to_gguf.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1894,6 +1894,8 @@ def __init__(self, *args, **kwargs):
         if self.hparams["model_type"] == "smolvlm_vision":
             self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
```

examples/llava/mtmd.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -176,13 +176,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

     std::string prompt_modified(text.text);
     std::string marker_modified(ctx->image_marker);
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+
     // a bit hacky here, but works for now
     // for some models, we need to add prefix and suffix to the image embeddings
     if (clip_is_gemma3(ctx->ctx_clip)) {
         // gemma 3
         // <start_of_image> ... (image embeddings) ... <end_of_image>
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
+    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
+        // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+        marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }

     // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
```

examples/llava/tests.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@ add_test() {
     arr_tmpl+=("$tmpl")
 }

+add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q8_0"
 add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
 add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
```

src/llama-chat.cpp

Lines changed: 21 additions & 2 deletions
```diff
@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "yandex",            LLM_CHAT_TEMPLATE_YANDEX            },
     { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
     { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
+    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -622,7 +625,23 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|header_start|>assistant<|header_end|>\n\n";
         }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
+    } else {
         // template not supported
         return -1;
     }
```

src/llama-chat.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
```
