Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 5c41015

Browse files
committed
feat(builder): Add support for using the TokenizersTokenizer in builder
Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 2483486 commit 5c41015

File tree

1 file changed

+35
-5
lines changed

1 file changed

+35
-5
lines changed

torchchat/cli/builder.py

Lines changed: 35 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -204,6 +204,7 @@ class TokenizerArgs:
204204
tokenizer_path: Optional[Union[Path, str]] = None
205205
is_sentencepiece: bool = False
206206
is_tiktoken: bool = False
207+
is_tokenizers: bool = False
207208
t: Optional[Any] = None
208209

209210
def __post_init__(self):
@@ -213,6 +214,7 @@ def __post_init__(self):
213214
self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
214215
self.is_tiktoken = True
215216
self.is_sentencepiece = False
217+
self.is_tokenizers = False
216218
return
217219
except:
218220
pass
@@ -223,12 +225,25 @@ def __post_init__(self):
223225
self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
224226
self.is_tiktoken = False
225227
self.is_sentencepiece = True
228+
self.is_tokenizers = False
229+
return
230+
except:
231+
pass
232+
233+
try:
234+
from tokenizer.tokenizers import TokenizersTokenizer
235+
236+
self.t = TokenizersTokenizer(str(self.tokenizer_path))
237+
self.is_tiktoken = False
238+
self.is_sentencepiece = False
239+
self.is_tokenizers = True
226240
return
227241
except:
228242
pass
229243

230244
self.is_tiktoken = False
231245
self.is_sentencepiece = False
246+
self.is_tokenizers = False
232247
self.t = None
233248
return
234249

@@ -240,16 +255,27 @@ def validate_model(
240255
if model is None:
241256
return
242257

243-
if self.is_tiktoken == self.is_sentencepiece:
258+
if len(list(filter(lambda x: x, [self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]))) != 1:
244259
raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")
245260

246261
is_tiktoken = self.is_tiktoken
247262
is_sentencepiece = self.is_sentencepiece
263+
is_tokenizers = self.is_tokenizers
248264
use_tiktoken = model.config.use_tiktoken
265+
use_tokenizers = model.config.use_tokenizers
266+
use_sentencepiece = not (use_tiktoken or use_tokenizers)
249267

250-
if not (is_tiktoken == use_tiktoken) or not (is_sentencepiece != use_tiktoken):
268+
if (
269+
(is_tiktoken and not use_tiktoken) or
270+
(is_tokenizers and not use_tokenizers) or
271+
(is_sentencepiece and not use_sentencepiece)
272+
):
251273
raise RuntimeError(
252-
f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)}) does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}) for {model_description}"
274+
"model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
275+
tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
276+
tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
277+
model_description,
278+
)
253279
)
254280

255281
return
@@ -605,5 +631,9 @@ def _initialize_model(
605631
return model
606632

607633

608-
def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
609-
return "TikToken" if tiktoken else "SentencePiece"
634+
def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str:
635+
if tiktoken:
636+
return "TikToken"
637+
if tokenizers:
638+
return "Tokenizers"
639+
return "SentencePiece"

0 commit comments

Comments (0)