Skip to content

Commit bf33e5a

Browse files
authored
add seed-coder vocab
1 parent 2c7c0f5 commit bf33e5a

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

src/llama-vocab.cpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -415,6 +415,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
415415
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
416416
};
417417
break;
418+
case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
419+
regex_exprs = {
420+
// original regex from tokenizer.json
421+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
422+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
423+
};
424+
break;
418425
default:
419426
// default regex for BPE tokenization pre-processing
420427
regex_exprs = {
@@ -1634,6 +1641,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
16341641
tokenizer_pre == "bailingmoe") {
16351642
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
16361643
clean_spaces = false;
1644+
} else if (
1645+
tokenizer_pre == "seed-coder") {
1646+
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
1647+
clean_spaces = false;
16371648
} else {
16381649
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
16391650
}

0 commit comments

Comments
 (0)