@@ -3314,7 +3314,12 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
@@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            return vocab.token_to_id.at(buf);
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
0 commit comments