Skip to content

Commit 62324f9

Browse files
committed
Merge branch 'main' into vocquant
2 parents e1a5ce5 + e0fe9d1 commit 62324f9

File tree

3 files changed

+1072
-864
lines changed

3 files changed

+1072
-864
lines changed

model2vec/tokenizer/tokenizer.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from tokenizers.pre_tokenizers import (
1111
PreTokenizer,
1212
)
13-
from transformers import PreTrainedTokenizerFast
13+
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
1414

1515
from model2vec.tokenizer.datamodels import Token
1616
from model2vec.tokenizer.model import process_tokenizer
@@ -392,4 +392,7 @@ def create_tokenizer(
392392
cleaned_vocabulary, backend_tokenizer = clean_and_create_vocabulary(tokenizer, vocabulary, token_remove_regex)
393393
new_tokenizer = replace_vocabulary(backend_tokenizer, cleaned_vocabulary, unk_token, pad_token)
394394

395-
return PreTrainedTokenizerFast(tokenizer_object=new_tokenizer)
395+
tokenizer_object = PreTrainedTokenizerFast(tokenizer_object=new_tokenizer)
396+
tokenizer_object.add_special_tokens({"pad_token": "[PAD]", "unk_token": "[UNK]"})
397+
398+
return tokenizer_object

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ dev = [
5959
"ruff",
6060
]
6161

62-
distill = ["torch", "transformers<=4.52.1", "scikit-learn"]
62+
distill = ["torch", "transformers", "scikit-learn"]
6363
onnx = ["onnx", "torch"]
6464
# train also installs inference
6565
train = ["torch", "lightning", "scikit-learn", "skops"]

0 commit comments

Comments
 (0)