-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_tokenizer.py
More file actions
29 lines (26 loc) · 830 Bytes
/
train_tokenizer.py
File metadata and controls
29 lines (26 loc) · 830 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""Train a byte-level BPE tokenizer on a Nepali text corpus and save it to disk."""
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
import torch

# Free any cached GPU memory before training; guarded so the script also runs
# on CPU-only machines. (Tokenizer training itself is CPU-bound.)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Collect every .txt file under nepali-text/ (recursively) as the training corpus.
paths = [str(x) for x in Path("nepali-text").glob("**/*.txt")]

# Initialize a byte-level BPE tokenizer; preserve casing and add no prefix space.
tokenizer = ByteLevelBPETokenizer(lowercase=False, add_prefix_space=False)

# Training hyperparameters.
vocab_size = 50000
min_frequency = 3
special_tokens = [
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
]

# Train the tokenizer. Pass the min_frequency variable instead of a hard-coded
# literal so adjusting the constant above actually takes effect.
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
    show_progress=True,
)

# Save vocab/merges plus the full tokenizer config. save_model() errors out if
# the target directory is missing, so create it first.
out_dir = Path("Robert")
out_dir.mkdir(parents=True, exist_ok=True)
tokenizer.save_model(str(out_dir))
tokenizer.save(str(out_dir / "config.json"))