Skip to content

Commit 8ccb10a

Browse files
committed
Back to dictionary
Sacrifice a little runtime performance (~10%) for much faster loading (~50%).
1 parent 6359bfe commit 8ccb10a

File tree

1 file changed

+5
-11
lines changed

1 file changed

+5
-11
lines changed

tokenizer/rwkv_tokenizer.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ class FastTokenizer:
225225
def __init__(self, file_name):
226226
self.tok2val = [b''] * 65536
227227
self.tok2len = [0] * 65536
228-
self.root = [None] * 256
228+
self.root = {}
229229

230230
with open(file_name, 'rt', encoding = 'utf-8') as file:
231231
for line in file:
@@ -242,19 +242,13 @@ def add_token(self, token: int, value: bytes):
242242
self.tok2len[token] = len(value)
243243

244244
pos = self.root
245-
246-
for byte in value[:-1]:
247-
if pos[byte] is None:
248-
pos[byte] = (None, [None] * 256)
249-
pos = pos[byte][1]
250-
251-
if pos[value[-1]] is None:
252-
pos[value[-1]] = (token, [None] * 256)
245+
for byte in value[:-1]: pos = pos.setdefault(byte, (None, {}))[1]
246+
pos.setdefault(value[-1], (token, {}))
253247

254248
def next_token(self, src: bytes) -> int:
255249
last_token, last = None, self.root
256250
for i in range(0, len(src)):
257-
if current := last[src[i]]:
251+
if current := last.get(src[i]):
258252
if token := current[0]: last_token = token
259253
last = current[1]
260254
else:
@@ -267,7 +261,7 @@ def encode_bytes(self, src: bytes) -> list[int]:
267261
last_token, last = None, self.root
268262

269263
for i in range(start, stop):
270-
if current := last[src[i]]:
264+
if current := last.get(src[i]):
271265
if token := current[0]:
272266
last_token = token
273267
start = i + 1

0 commit comments

Comments
 (0)