Skip to content

Commit 14df6ce

Browse files
committed
Add plamo2 tokenizer
1 parent 9741405 commit 14df6ce

File tree

4 files changed

+462
-16
lines changed

4 files changed

+462
-16
lines changed

convert_hf_to_gguf.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -3459,11 +3459,15 @@ def set_vocab(self):
             elif token_type_str == "BYTE":
                 toktypes.append(gguf.TokenType.BYTE)
             else:
-                toktypes.append(gguf.TokenType.NORMAL)
+                # Check for PLaMo-2 special tokens
+                token_str = token_data[0]
+                if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)

-        # Use "llama" (SPM) tokenizer type which doesn't require merges
-        # PLaMo 2's tokenizer is more similar to SPM than GPT2
-        self.gguf_writer.add_tokenizer_model("llama")
+        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
+        self.gguf_writer.add_tokenizer_model("plamo2")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
```

include/llama.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -71,12 +71,13 @@ extern "C" {
     typedef int32_t llama_seq_id;

     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+        LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+        LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
     };

     // pre-tokenization types
```

Comments (0)