@@ -25,6 +25,8 @@
 
 from tiktoken.load import load_tiktoken_bpe
 
+from .constants import CL100K_PAT_STR, LLAMA_SPECIAL_TOKENS
+
 logger = getLogger(__name__)
 
 
@@ -47,12 +49,6 @@ class TiktokenTokenizer:
     WARNING: The regex and special tokens are hardcoded from Llama 3+.
     """
 
-    special_tokens: Dict[str, int]
-
-    num_reserved_special_tokens = 256
-
-    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501
-
     @classmethod
     def get_instance(cls):
         global _INSTANCE
@@ -63,7 +59,12 @@ def get_instance(cls):
         )
         return _INSTANCE
 
-    def __init__(self, model_path: str):
+    def __init__(
+        self,
+        model_path: str,
+        pat_str: str = CL100K_PAT_STR,
+        special_tokens: List[str] = LLAMA_SPECIAL_TOKENS,
+    ):
         """
         Initializes the Tokenizer with a Tiktoken model.
 
@@ -74,32 +75,13 @@ def __init__(self, model_path: str):
 
         mergeable_ranks = load_tiktoken_bpe(model_path)
         num_base_tokens = len(mergeable_ranks)
-        special_tokens = [
-            "<|begin_of_text|>",
-            "<|end_of_text|>",
-            "<|reserved_special_token_0|>",
-            "<|reserved_special_token_1|>",
-            "<|finetune_right_pad_id|>",
-            "<|step_id|>",
-            "<|start_header_id|>",
-            "<|end_header_id|>",
-            "<|eom_id|>",  # end of message
-            "<|eot_id|>",  # end of turn
-            "<|python_tag|>",
-            "<|image|>",
-        ]
-        reserved_tokens = [
-            f"<|reserved_special_token_{2 + i}|>"
-            for i in range(self.num_reserved_special_tokens - len(special_tokens))
-        ]
-        special_tokens = special_tokens + reserved_tokens
 
         self.special_tokens = {
             token: num_base_tokens + i for i, token in enumerate(special_tokens)
         }
         self.model = tiktoken.Encoding(
             name=Path(model_path).name,
-            pat_str=self.pat_str,
+            pat_str=pat_str,
             mergeable_ranks=mergeable_ranks,
             special_tokens=self.special_tokens,
         )
@@ -108,15 +90,6 @@ def __init__(self, model_path: str):
         # BOS / EOS token IDs
         self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
         self.eos_id: int = self.special_tokens["<|end_of_text|>"]
-        self.eot_id: int = self.special_tokens["<|eot_id|>"]
-        self.eom_id: int = self.special_tokens["<|eom_id|>"]
-        self.python_tag_id = self.special_tokens["<|python_tag|>"]
-        self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"]
-        self.stop_tokens = [
-            self.eos_id,
-            self.special_tokens["<|eom_id|>"],
-            self.special_tokens["<|eot_id|>"],
-        ]
 
     def encode(
         self,
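
For reference, the new `.constants` module is not part of this diff. Below is a minimal sketch of what it presumably contains, assuming the literals removed above were moved over verbatim; the module layout and the `_BASE_SPECIAL_TOKENS` / `NUM_RESERVED_SPECIAL_TOKENS` names are assumptions, not taken from the PR:

```python
# constants.py -- hypothetical reconstruction from the literals removed above.
from typing import List

# The pre-tokenization regex previously hardcoded as `pat_str` on the class.
CL100K_PAT_STR = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501

# Previously `num_reserved_special_tokens = 256` on the class.
NUM_RESERVED_SPECIAL_TOKENS = 256

# The twelve named tokens previously built inline in __init__.
_BASE_SPECIAL_TOKENS = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|reserved_special_token_0|>",
    "<|reserved_special_token_1|>",
    "<|finetune_right_pad_id|>",
    "<|step_id|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|eom_id|>",  # end of message
    "<|eot_id|>",  # end of turn
    "<|python_tag|>",
    "<|image|>",
]

# Pad to 256 entries with reserved placeholders, as the old __init__ did inline.
LLAMA_SPECIAL_TOKENS: List[str] = _BASE_SPECIAL_TOKENS + [
    f"<|reserved_special_token_{2 + i}|>"
    for i in range(NUM_RESERVED_SPECIAL_TOKENS - len(_BASE_SPECIAL_TOKENS))
]
```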
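With the widened `__init__`, callers keep the old behavior by default but can now swap in a different pattern or special-token list. A usage sketch, with a placeholder model path:

```python
# Defaults reproduce the previous hardcoded Llama 3+ behavior.
tokenizer = TiktokenTokenizer("/path/to/tokenizer.model")

# Or override the pre-tokenization regex and special tokens explicitly.
custom = TiktokenTokenizer(
    "/path/to/tokenizer.model",
    pat_str=CL100K_PAT_STR,
    special_tokens=["<|begin_of_text|>", "<|end_of_text|>"],
)
```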