@@ -6596,8 +6596,8 @@ static void llm_load_vocab(
65966596 ) {
65976597 vocab.special_eot_id = t.second;
65986598 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6599- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6600- __func__, t.first.c_str());
6599+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6600+ __func__, t.second, t. first.c_str());
66016601 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66026602 }
66036603 }
@@ -6610,8 +6610,8 @@ static void llm_load_vocab(
66106610 ) {
66116611 vocab.special_eom_id = t.second;
66126612 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6613- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6614- __func__, t.first.c_str());
6613+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6614+ __func__, t.second, t. first.c_str());
66156615 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66166616 }
66176617 }
@@ -6627,8 +6627,8 @@ static void llm_load_vocab(
66276627 ) {
66286628 vocab.special_fim_pre_id = t.second;
66296629 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6630- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6631- __func__, t.first.c_str());
6630+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6631+ __func__, t.second, t. first.c_str());
66326632 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66336633 }
66346634 }
@@ -6644,8 +6644,8 @@ static void llm_load_vocab(
66446644 ) {
66456645 vocab.special_fim_suf_id = t.second;
66466646 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6647- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6648- __func__, t.first.c_str());
6647+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6648+ __func__, t.second, t. first.c_str());
66496649 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66506650 }
66516651 }
@@ -6661,8 +6661,8 @@ static void llm_load_vocab(
66616661 ) {
66626662 vocab.special_fim_mid_id = t.second;
66636663 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6664- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6665- __func__, t.first.c_str());
6664+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6665+ __func__, t.second, t. first.c_str());
66666666 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66676667 }
66686668 }
@@ -6677,8 +6677,8 @@ static void llm_load_vocab(
66776677 ) {
66786678 vocab.special_fim_pad_id = t.second;
66796679 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6680- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6681- __func__, t.first.c_str());
6680+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6681+ __func__, t.second, t. first.c_str());
66826682 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66836683 }
66846684 }
@@ -6694,8 +6694,8 @@ static void llm_load_vocab(
66946694 ) {
66956695 vocab.special_fim_rep_id = t.second;
66966696 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6697- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6698- __func__, t.first.c_str());
6697+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6698+ __func__, t.second, t. first.c_str());
66996699 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
67006700 }
67016701 }
@@ -6708,8 +6708,8 @@ static void llm_load_vocab(
67086708 ) {
67096709 vocab.special_fim_sep_id = t.second;
67106710 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6711- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6712- __func__, t.first.c_str());
6711+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6712+ __func__, t.second, t. first.c_str());
67136713 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
67146714 }
67156715 }
@@ -6720,6 +6720,19 @@ static void llm_load_vocab(
67206720 // this is currently determined based on the token text, which is obviously not ideal
67216721 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
67226722 vocab.special_eog_ids.clear();
6723+
6724+ if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
6725+ vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
6726+ }
6727+
6728+ if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
6729+ vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
6730+ }
6731+
6732+ if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
6733+ vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
6734+ }
6735+
67236736 for (const auto & t : vocab.token_to_id) {
67246737 if (false
67256738 || t.first == "<|eot_id|>"
@@ -6732,13 +6745,20 @@ static void llm_load_vocab(
67326745 ) {
67336746 vocab.special_eog_ids.insert(t.second);
67346747 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6735- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6736- __func__, t.first.c_str());
6748+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6749+ __func__, t.second, t. first.c_str());
67376750 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
67386751 }
6752+ } else {
6753+ // token is control, but not marked as EOG -> print a warning
6754+ if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
6755+ LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
6756+ __func__, t.second, t.first.c_str());
6757+ }
67396758 }
67406759 }
67416760
6761+ // sanity checks
67426762 if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
67436763 vocab.special_eog_ids.insert(vocab.special_eos_id);
67446764 LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
0 commit comments