Skip to content

Commit 0f28ebb

Browse files
committed
fix: DeepSeek token debug logs: "special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect", "'<|end▁of▁sentence|>' is not marked as EOG", "'<|EOT|>' is not marked as EOG"
1 parent 7c7f3b7 commit 0f28ebb

File tree

1 file changed

+16
-2
lines changed

1 file changed

+16
-2
lines changed

src/llama-vocab.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,7 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
17811781
|| t.first == "<end_of_turn>"
17821782
|| t.first == "<|endoftext|>"
17831783
|| t.first == "<EOT>"
1784-
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
1784+
|| t.first == "<|EOT|>"// DeepSeek-r1
17851785
) {
17861786
special_eot_id = t.second;
17871787
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1791,7 +1791,19 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
17911791
}
17921792
}
17931793
}
1794-
1794+
// find EOS token: "<|end▁of▁sentence|>", etc. (DeepSeek)
1795+
if (special_eos_id == LLAMA_TOKEN_NULL) {
1796+
if (false
1797+
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
1798+
) {
1799+
special_eos_id = t.second;
1800+
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
1801+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
1802+
__func__, t.second, t.first.c_str());
1803+
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
1804+
}
1805+
}
1806+
}
17951807
// find EOM token: "<|eom_id|>"
17961808
if (special_eom_id == LLAMA_TOKEN_NULL) {
17971809
if (false
@@ -1931,6 +1943,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
19311943
|| t.first == "<|endoftext|>"
19321944
|| t.first == "<|eom_id|>"
19331945
|| t.first == "<EOT>"
1946+
|| t.first == "<|EOT|>"// DeepSeek
1947+
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
19341948
) {
19351949
special_eog_ids.insert(t.second);
19361950
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

0 commit comments

Comments
 (0)