4 files changed: +462 −16

convert_hf_to_gguf.py
@@ -3459,11 +3459,15 @@ def set_vocab(self):
             elif token_type_str == "BYTE":
                 toktypes.append(gguf.TokenType.BYTE)
             else:
-                toktypes.append(gguf.TokenType.NORMAL)
+                # Check for PLaMo-2 special tokens
+                token_str = token_data[0]
+                if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
 
-        # Use "llama" (SPM) tokenizer type which doesn't require merges
-        # PLaMo 2's tokenizer is more similar to SPM than GPT2
-        self.gguf_writer.add_tokenizer_model("llama")
+        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
+        self.gguf_writer.add_tokenizer_model("plamo2")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
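For clarity, here is the classification rule from the new branch applied in isolation. The sample token strings are hypothetical, not taken from the PLaMo-2 vocabulary; in the converter the strings come from `token_data[0]`.

```python
# Hedged illustration of the new special-token branch above: any token of
# the form "<|plamo:...|>" is marked CONTROL, everything else stays NORMAL.
for tok in ["<|plamo:op|>", "<|plamo:unused|>", "hello"]:
    kind = "CONTROL" if tok.startswith("<|plamo:") and tok.endswith("|>") else "NORMAL"
    print(f"{tok!r:20} -> {kind}")
```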

include/llama.h
@@ -71,12 +71,13 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+        LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+        LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
     };
 
     // pre-tokenization types
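The comment on the new enum value names the algorithm but the diff does not show it. Below is a minimal Python sketch of tokenization by multi-pattern matching plus dynamic programming, in the spirit of "Aho-Corasick with dynamic programming": enumerate every vocab token that matches in the input, then pick the highest-scoring segmentation. This is not the llama.cpp implementation; the vocab, scores, and single-character fallback penalty are illustrative assumptions, and the per-position scan stands in for a real Aho-Corasick automaton, which would find all matches in time linear in the input length.

```python
def tokenize(text: str, vocab: dict[str, float]) -> list[str]:
    """Return the highest-scoring segmentation of `text`.

    `vocab` maps token string -> log-probability-like score. A production
    tokenizer would enumerate matches with an Aho-Corasick automaton;
    the inner loop over `vocab` here is a stand-in kept for clarity.
    """
    n = len(text)
    NEG = float("-inf")
    best = [NEG] * (n + 1)   # best[i] = best total score for text[:i]
    back = [""] * (n + 1)    # back[i] = token ending at position i on the best path
    best[0] = 0.0
    for i in range(n):
        if best[i] == NEG:
            continue
        # Single-character fallback so every input stays tokenizable
        # (penalty of -20.0 is an arbitrary illustrative value).
        fb = best[i] + vocab.get(text[i], -20.0)
        if fb > best[i + 1]:
            best[i + 1], back[i + 1] = fb, text[i]
        # All vocab tokens starting at i.
        for tok, score in vocab.items():
            j = i + len(tok)
            if j <= n and text.startswith(tok, i) and best[i] + score > best[j]:
                best[j], back[j] = best[i] + score, tok
    # Walk the backpointers to recover the token sequence.
    out, i = [], n
    while i > 0:
        out.append(back[i])
        i -= len(back[i])
    return out[::-1]

# Prints ['hello', ' world']: the two long tokens outscore any
# segmentation built from the shorter pieces.
print(tokenize("hello world", {"hello": -1.0, "he": -2.0, "llo": -2.5,
                               " world": -1.2, "world": -1.5, " ": -3.0}))
```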