Skip to content

Commit e8981aa

Browse files
committed
Updated llama-vocab.cpp with the Velvet pre-tokenizer case
1 parent 0a8995a commit e8981aa

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

src/llama-vocab.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -392,6 +392,11 @@ struct llm_tokenizer_bpe : llm_tokenizer {
392392
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
393393
};
394394
break;
395+
case LLAMA_VOCAB_PRE_TYPE_VELVET:
396+
regex_exprs = {
397+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}|[\\p{P}\\p{S}]{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
398+
};
399+
break;
395400
default:
396401
// default regex for BPE tokenization pre-processing
397402
regex_exprs = {
@@ -1592,6 +1597,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
15921597
} else if (
15931598
tokenizer_pre == "megrez") {
15941599
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1600+
} else if (
1601+
tokenizer_pre == "velvet") {
1602+
pre_type = LLAMA_VOCAB_PRE_TYPE_VELVET;
15951603
} else {
15961604
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
15971605
}

0 commit comments

Comments
 (0)