@@ -225,7 +225,7 @@ class FastTokenizer:
225225 def __init__ (self , file_name ):
226226 self .tok2val = [b'' ] * 65536
227227 self .tok2len = [0 ] * 65536
228- self .root = [ None ] * 256
228+ self .root = {}
229229
230230 with open (file_name , 'rt' , encoding = 'utf-8' ) as file :
231231 for line in file :
@@ -242,19 +242,13 @@ def add_token(self, token: int, value: bytes):
242242 self .tok2len [token ] = len (value )
243243
244244 pos = self .root
245-
246- for byte in value [:- 1 ]:
247- if pos [byte ] is None :
248- pos [byte ] = (None , [None ] * 256 )
249- pos = pos [byte ][1 ]
250-
251- if pos [value [- 1 ]] is None :
252- pos [value [- 1 ]] = (token , [None ] * 256 )
245+ for byte in value [:- 1 ]: pos = pos .setdefault (byte , (None , {}))[1 ]
246+ pos .setdefault (value [- 1 ], (token , {}))
253247
254248 def next_token (self , src : bytes ) -> int :
255249 last_token , last = None , self .root
256250 for i in range (0 , len (src )):
257- if current := last [ src [i ]] :
251+ if current := last . get ( src [i ]) :
258252 if token := current [0 ]: last_token = token
259253 last = current [1 ]
260254 else :
@@ -267,7 +261,7 @@ def encode_bytes(self, src: bytes) -> list[int]:
267261 last_token , last = None , self .root
268262
269263 for i in range (start , stop ):
270- if current := last [ src [i ]] :
264+ if current := last . get ( src [i ]) :
271265 if token := current [0 ]:
272266 last_token = token
273267 start = i + 1
0 commit comments