@@ -6509,21 +6509,21 @@ static void llm_load_vocab(
         // for now, we apply this workaround to find the EOT token based on its text
         if (vocab.special_eot_id == -1) {
             for (const auto & t : vocab.token_to_id) {
-                if (
+                if (false
                         // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
                         //       need to fix convert script
                         //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
-                        ( t.first == "<|eot_id|>" ||
-                          t.first == "<|im_end|>" ||
-                          t.first == "<|end|>" ||
-                          t.first == "<end_of_turn>" ||
-                          t.first == "<|endoftext|>"
-                        )
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                                __func__, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                     break;
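The hunk above replaces a parenthesized OR-chain with trailing `||` by a chain seeded with `false` and written with leading `||`: each candidate token text sits on its own line, so adding one (here the new `<EOT>`) is a single-line diff. A minimal standalone sketch of the idiom, with an illustrative helper name and token list that are not llama.cpp code:

#include <iostream>
#include <string>

// false-seeded OR chain: the constant seed lets every real
// alternative start with "||", keeping one candidate per line
static bool looks_like_eot(const std::string & text) {
    return false
        || text == "<|eot_id|>"
        || text == "<|im_end|>"
        || text == "<EOT>";   // appending a candidate is a one-line change
}

int main() {
    std::cout << looks_like_eot("<EOT>") << '\n';  // prints 1
}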
@@ -6546,6 +6546,44 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // maintain a list of tokens that cause end-of-generation
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        vocab.special_eog_ids.clear();
+        for (const auto & t : vocab.token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                vocab.special_eog_ids.insert(t.second);
+                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
+            }
+        }
+
+        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
     }
 
     // build special tokens cache
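The hunk above collects every end-of-generation token id into `vocab.special_eog_ids`, then force-inserts the already-detected EOS/EOT/EOM ids, warning when the tokenizer config had left one out. Downstream code thus needs a single membership test instead of separate id comparisons. A hedged sketch of that lookup with simplified stand-in types, not llama.cpp's real ones (the public API exposes a comparable check as `llama_token_is_eog`):

#include <cstdint>
#include <iostream>
#include <set>

using llama_token = std::int32_t;

// stand-in for the vocab struct; only the field used here
struct vocab_stub {
    std::set<llama_token> special_eog_ids;  // filled by the loader code above
};

// true when sampling `token` should stop generation
static bool token_is_eog(const vocab_stub & vocab, llama_token token) {
    return token != -1 && vocab.special_eog_ids.count(token) > 0;
}

int main() {
    vocab_stub vocab;
    vocab.special_eog_ids.insert(128009);              // e.g. an "<|eot_id|>"-style id
    std::cout << token_is_eog(vocab, 128009) << '\n';  // prints 1
    std::cout << token_is_eog(vocab, 42) << '\n';      // prints 0
}

Because the three trailing guards make the set a superset of EOS/EOT/EOM, a lookup like this never misses a special token that older code paths matched individually.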
@@ -6749,6 +6787,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+
+    for (const auto & id : vocab.special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
+    }
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
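With the format strings in the last hunk, the loader logs the EOM token (when present) and one `EOG token` line per entry in the set. Illustrative output for a Llama-3.1-style vocabulary (ids and token texts are examples, not taken from this diff):

llm_load_print_meta: EOM token = 128008 '<|eom_id|>'
llm_load_print_meta: EOG token = 128001 '<|end_of_text|>'
llm_load_print_meta: EOG token = 128008 '<|eom_id|>'
llm_load_print_meta: EOG token = 128009 '<|eot_id|>'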