Changes from all commits
61 commits
e610539
Clean code
thomasw21 Feb 3, 2022
6b1fd3e
Update vocabulary size
thomasw21 Feb 4, 2022
8f4e8ae
Remove duplicate .slurm
thomasw21 Feb 4, 2022
71a06ff
Woops
thomasw21 Feb 4, 2022
5e489be
Tokenization should have 200 sentinel tokens for potential SC adaptat…
thomasw21 Feb 5, 2022
c67f544
Woops
thomasw21 Feb 5, 2022
80d1aca
unknown has to be specified
thomasw21 Feb 5, 2022
44bd107
Remove empty strings
thomasw21 Feb 6, 2022
55e28f9
Use max length
thomasw21 Feb 6, 2022
5af250b
Somehow there's None in the dataset
thomasw21 Feb 6, 2022
0f5567b
Try optimizing throughput
thomasw21 Feb 6, 2022
5bb315c
Woops
thomasw21 Feb 6, 2022
65d9efe
Woops
thomasw21 Feb 6, 2022
41e180f
Test
thomasw21 Feb 6, 2022
12fbb25
Woops
thomasw21 Feb 6, 2022
e88e959
Damn this is slow
thomasw21 Feb 6, 2022
91f4a2f
Woops
thomasw21 Feb 6, 2022
48e0f78
Unecessary
thomasw21 Feb 6, 2022
604c65b
maybe we don't need to take in account special tokens
thomasw21 Feb 6, 2022
37e92ab
Reduce sequence length
thomasw21 Feb 7, 2022
933ea75
Oops prevent infinite loop
thomasw21 Feb 7, 2022
f1c2a44
Oops prevent infinite loop
thomasw21 Feb 7, 2022
c4bdcbb
Woops
thomasw21 Feb 7, 2022
6fd2cfa
Woops
thomasw21 Feb 7, 2022
239593b
Woops
thomasw21 Feb 7, 2022
b9177f8
Nit
thomasw21 Feb 7, 2022
5a7191d
Not need to check for whitespace anymore
thomasw21 Feb 7, 2022
767303e
Potential infinite loop
thomasw21 Feb 7, 2022
e4bd957
I need to figure out how to obtain the bytes, ie don't know how sente…
thomasw21 Feb 7, 2022
50f349d
string needs to be encoded in utf-8
thomasw21 Feb 7, 2022
a681e79
Revert "string needs to be encoded in utf-8"
thomasw21 Feb 7, 2022
3e3748d
Try lowering the amount of sequence length
thomasw21 Feb 7, 2022
b2a3fa2
Depending on the result, might need to for 3 as the average byte to e…
thomasw21 Feb 7, 2022
ee14366
Revert "Depending on the result, might need to for 3 as the average b…
thomasw21 Feb 7, 2022
c9b0939
Try training tokenizer with about a 10th of the dataset:
thomasw21 Feb 7, 2022
6af1898
WIP
thomasw21 Feb 8, 2022
3b8b71c
Make a script to test out tokenization
thomasw21 Feb 8, 2022
18e2441
Add slurm job
thomasw21 Feb 8, 2022
bc78f3d
Woops
thomasw21 Feb 8, 2022
44e7bfd
Set HF_DATASETS_OFFLINE=1
thomasw21 Feb 8, 2022
c44dec0
Integrate Roman's fix
thomasw21 Feb 11, 2022
0bdf0c2
New path for saving tokenizer
thomasw21 Feb 11, 2022
79835d8
Woops
thomasw21 Feb 11, 2022
49b178d
Try running youtokentome for parallel training
thomasw21 Feb 11, 2022
cc8fbe4
We might need nfkc instead
thomasw21 Feb 11, 2022
04cc2e0
Remove extra whitespaces option set to False
thomasw21 Feb 11, 2022
2fdef0e
Fix
thomasw21 Feb 11, 2022
1a19fca
Fix
thomasw21 Feb 11, 2022
8f82c1b
Reduce the sequence length
thomasw21 Feb 14, 2022
261bacb
More tokens
thomasw21 Feb 14, 2022
aa875c8
Add option to remove the split by number
thomasw21 Feb 14, 2022
5879426
Add requirement file
thomasw21 Feb 14, 2022
0bfae56
Double the amount to trained tokens to see
thomasw21 Feb 14, 2022
287fdc7
Install tokenizers from source
thomasw21 Feb 14, 2022
04eaa8f
Add script to train tokenizers
thomasw21 Feb 14, 2022
45a17e1
Add logs
thomasw21 Feb 14, 2022
24c1903
Woops
thomasw21 Feb 14, 2022
f109a0d
WIP: hf tokenizer fixer
thomasw21 Feb 15, 2022
1bd33ab
Try fixing space replacement
thomasw21 Feb 16, 2022
3418d49
Fix tokenizer
thomasw21 Feb 16, 2022
607b85b
Add condition to change the tokenizer
thomasw21 Feb 16, 2022
107 changes: 107 additions & 0 deletions compare_tokenizers.py
@@ -0,0 +1,107 @@
import argparse
from functools import partial
from typing import Set, List, Dict

from datasets import load_dataset
from transformers import AutoTokenizer, PreTrainedTokenizerFast


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--interactive", action="store_true")
    parser.add_argument("--dataset-name", type=str)
    parser.add_argument("--subset-name", type=str)
    parser.add_argument("--text-columns", type=lambda x: set(x.split(",")))
    parser.add_argument("--num-proc", type=int, default=1)

    args = parser.parse_args()

    if args.dataset_name is None:
        assert args.interactive
    else:
        assert args.text_columns is not None and len(args.text_columns) > 0

    return args

def check_encoding(tokenizer, text):
    print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))

# def check_spm_is_equal_hf(hf_tokenizer, spm_tokenizer, text):
#     hf_tokenized = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(text))
#     spm_tokenized = spm_tokenizer.encode(text, out_type=str)
#     print(f"Difference between my tokenizer vs multilingual one: {len(hf_tokenized)} vs {len(spm_tokenized)}")
#     print(hf_tokenized)
#     print(spm_tokenized)

def compare_to_previous_multilingual_tokenizer(hf_tokenizer, mul_tokenizer, text):
    hf_tokenized = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(text))
    mul_tokenized = mul_tokenizer.convert_ids_to_tokens(mul_tokenizer.encode(text))
    print(f"Difference between my tokenizer vs multilingual one: {len(hf_tokenized)} vs {len(mul_tokenized)}")
    print(hf_tokenized)
    print(mul_tokenized)

def interactively_test_tokenizer(hf_tokenizer, mul_tokenizer):
    while True:
        print(" ++++++ New input +++")
        text = input()
        # check encoding
        print(" ++++++ Check encoding +++++")
        check_encoding(hf_tokenizer, text)
        print(" ++++++ Compare with previous alpha tokenizer +++++")
        compare_to_previous_multilingual_tokenizer(hf_tokenizer, mul_tokenizer, text)

def batch_tokenize(batch, tokenizer_name: str, tokenizer: PreTrainedTokenizerFast, text_columns: Set[str]):
    for text_column in text_columns:
        batch[f"{tokenizer_name}_{text_column}"] = tokenizer.batch_decode(tokenizer(batch[text_column]).input_ids)

    return batch

def batch_tokenize_on_all_tokenizers(batch, tokenizers: Dict[str, PreTrainedTokenizerFast], text_columns):
    for tokenizer_name, tokenizer in tokenizers.items():
        batch = batch_tokenize(batch, tokenizer_name, tokenizer, text_columns)
    return batch

def run_on_dataset(tokenizers: Dict[str, PreTrainedTokenizerFast], dataset, text_columns, num_proc):
    dataset = dataset.map(
        partial(batch_tokenize_on_all_tokenizers, tokenizers=tokenizers, text_columns=text_columns),
        batched=True,
        num_proc=num_proc
    )
    return dataset

def compute_metrics(dataset):
    # compute number of tokens (the lower the better)
    number_of_tokens = {}
    for column_name in dataset.column_names:
        number_of_tokens[column_name] = sum([len(elt) for elt in dataset[column_name]])
    print(number_of_tokens)

def main():
    args = get_args()
    # save_tokenizer()

    # Use Samson's tokenizer (the previous multilingual alpha tokenizer)
    mul_tokenizer = AutoTokenizer.from_pretrained("bigscience/oscar_13_languages_alpha_weight")

    # Use HF tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bigscience-catalogue-lm-data/tokenizer_v0")

    if args.interactive:
        interactively_test_tokenizer(tokenizer, mul_tokenizer)
    else:
        dataset = load_dataset(args.dataset_name, args.subset_name, split="train")
        dataset = dataset.remove_columns(set(dataset.column_names) - args.text_columns)

        tokenizers = {
            "bs_tokenizer_v0": tokenizer,
            "samson_tokenizer": mul_tokenizer,
        }

        dataset = run_on_dataset(tokenizers, dataset, text_columns=args.text_columns, num_proc=args.num_proc)

        compute_metrics(dataset)

if __name__ == "__main__":
    main()
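
For a quick standalone sanity check of relative fertility (token count per text), a minimal sketch using the same two checkpoints loaded in `main()` is shown below; the sample sentence is arbitrary and access to the corresponding Hub repositories is assumed:

from transformers import AutoTokenizer

# Hypothetical spot check: tokenize one sample with both tokenizers and compare token counts.
text = "Le chat s'assoit sur le tapis."
for name in ["bigscience-catalogue-lm-data/tokenizer_v0", "bigscience/oscar_13_languages_alpha_weight"]:
    tok = AutoTokenizer.from_pretrained(name)
    ids = tok(text).input_ids
    print(name, len(ids), tok.convert_ids_to_tokens(ids))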
84 changes: 84 additions & 0 deletions fix_hf_tokenizer.py
@@ -0,0 +1,84 @@
import json
from argparse import ArgumentParser
import itertools


def get_args():
    parser = ArgumentParser()
    parser.add_argument("--tokenizer-json-path", type=str, help="Path to `tokenizer.json` file")
    return parser.parse_args()

def _remove_replace(data):
    normalizer = data["normalizer"]
    if normalizer["type"] == "Sequence":
        normalizers = normalizer["normalizers"]
        assert len(normalizers) == 2

        updated_normalizers = [elt for elt in normalizers if elt["type"] != "Replace"]

        assert len(updated_normalizers) == 1

        data["normalizer"] = updated_normalizers[0]
        normalizer = data["normalizer"]

    assert normalizer["type"] == "Precompiled"
    return data

def _add_empty_strings(data):
    # Adding spaces to vocabulary
    num_max_spaces = 20
    space_char = "▁"

    if space_char * 2 not in data["model"]["vocab"]:
        offset_idx = len(data["model"]["vocab"]) - 2
        for idx in range(num_max_spaces, 1, -1):
            print(idx + offset_idx, " : ", space_char * idx, " : ", len(space_char * idx))
            data["model"]["vocab"][space_char * idx] = idx + offset_idx

        lines_to_append = []
        for tup in itertools.product([space_char * idx for idx in range(1, num_max_spaces - 1)], repeat=2):
            merge_rule = " ".join(tup)
            if len(merge_rule) < num_max_spaces + 1:
                lines_to_append.append(merge_rule)
        lines_to_append = sorted(lines_to_append, key=lambda x: len(x))

        data["model"]["merges"].extend(lines_to_append)

    # Fixing the whole tokenizer.
    data["normalizer"] = {
        "type": "Sequence",
        "normalizers": [
            data["normalizer"],
            {"type": "Replace", "pattern": {"Regex": "\n"}, "content": "\n "},
            # ^ matches beginning of string as well as beginning of lines in multiline mode.
            {"type": "Replace", "pattern": {"Regex": "^ "}, "content": ""},  # add_prefix_space
            {"type": "Replace", "pattern": {"Regex": "^"}, "content": " "},
            {"type": "Replace", "pattern": {"Regex": "\n "}, "content": "\n"},
            # ^ matches beginning of string as well as beginning of lines in multiline mode.
            {"type": "Replace", "pattern": {"String": " "}, "content": "▁"},
        ]}

    data["pre_tokenizer"] = None
    data["decoder"] = {
        "type": "Metaspace",
        "replacement": "▁",
        "add_prefix_space": True
    }
    return data


def main():
    args = get_args()

    with open(args.tokenizer_json_path, "r") as fi:
        data = json.load(fi)

    data = _remove_replace(data)
    data = _add_empty_strings(data)

    with open(args.tokenizer_json_path, "w") as fo:
        json.dump(data, fo, indent=2)


if __name__ == "__main__":
    main()
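
A minimal sketch for eyeballing the result of this script, assuming a `tokenizer.json` that has already been rewritten by it (the path below is hypothetical) and the `tokenizers` build installed via `setup_gcp.sh`: load the patched file and check that leading spaces, runs of spaces and newlines survive an encode/decode round trip.

from tokenizers import Tokenizer

# Hypothetical path to a tokenizer.json already processed by fix_hf_tokenizer.py.
tok = Tokenizer.from_file("tokenizer.json")

for text in ["Hello world", "  two leading spaces", "line one\nline two"]:
    enc = tok.encode(text)
    print(enc.tokens)
    # With the Metaspace decoder and add_prefix_space=True, decoding should give the original text back.
    print(repr(tok.decode(enc.ids)))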
67 changes: 67 additions & 0 deletions get_problematic_samples.py
@@ -0,0 +1,67 @@
import logging
from functools import partial
from pathlib import Path

from datasets import load_dataset
from datasets.utils.logging import set_verbosity_info
import argparse, os

from train_convert_tokenizer_simple import preprocess_text

set_verbosity_info()
logger = logging.getLogger(__name__)

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_name", "-d", type=str, required=True)
    parser.add_argument("--pathological_samples_path", "-o", type=Path, required=True)
    parser.add_argument("--load_batch_size", type=int, default=1)
    parser.add_argument("--max_sequence_length", type=int, required=True)
    parser.add_argument("--input_sentence_size", type=int, required=True)
    parser.add_argument("--num_proc", type=int, required=True)

    return parser.parse_args()

def get_not_utf_8_compatible(batch, sequence_length: int):
    batch_results = preprocess_text(batch, sequence_length=sequence_length)

    return {
        "utf-8-not-compatible": [text.encode() for row_results in batch_results for text in row_results if text]
    }


def main():
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    args = get_args()
    logger.info(
        f"** The job is run with the following arguments: **\n{args}\n **** "
    )

    dataset = load_dataset(args.data_name, data_files="**.jsonl.gz", split="train")

    logger.info(f"Dataset length: {len(dataset)}")

    # Try to find all samples that are not castable to utf-8
    # FIXME: we use an approximation of byte length vs byte sequence
    sequence_length = args.input_sentence_size // 2
    dataset = dataset.map(
        partial(get_not_utf_8_compatible, sequence_length=sequence_length),
        batched=True,
        num_proc=args.num_proc,
        remove_columns=dataset.column_names
    )

    logger.info(f"Invalid text: {dataset}")
    dataset.to_json(
        args.pathological_samples_path,
        num_proc=args.num_proc
    )


if __name__ == "__main__":
    main()
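
Independent of `preprocess_text` (whose exact behaviour lives in `train_convert_tokenizer_simple.py` and is only assumed here), the "not castable to utf-8" notion boils down to a plain Python check; a minimal sketch:

def is_utf8_encodable(text: str) -> bool:
    # Lone surrogates, which do occur in scraped data, cannot be encoded to UTF-8.
    try:
        text.encode("utf-8")
        return True
    except UnicodeEncodeError:
        return False

print(is_utf8_encodable("café"))       # True
print(is_utf8_encodable("bad\ud800"))  # False: unpaired surrogate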
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
sentencepiece
datasets
transformers
34 changes: 34 additions & 0 deletions scripts/train_no_split_numbers.sh
@@ -0,0 +1,34 @@
set -x -e

TOKENIZATION_REPO=~/code/tokenization

cd $TOKENIZATION_REPO

TOKENIZER_NAME=tokenizer_alpha_weight_fixed_NFKC_no_split_numbers

DATASET_PATH=~/tokenization_dataset/alpha # TODO: define where the concatenated dataset lives
SAVE_TOKENIZER_PATH=~/tokenizer/$TOKENIZER_NAME
LOGS_PATH=~/logs

mkdir -p $SAVE_TOKENIZER_PATH

export HF_DATASETS_OFFLINE=1

# Tokenization vocabulary size:
# - ceil(150_000 / (8 * 128)) * 8 * 128
# - special tokens:
# - 200 sentinel tokens
# ceil(150_000 / (8 * 128)) * 8 * 128 - 200 = 150328
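# Quick sanity check of the arithmetic above (assumes a python3 interpreter on PATH):
#   python3 -c 'import math; print(math.ceil(150_000 / (8 * 128)) * 8 * 128 - 200)'  # prints 150328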

# --max_sequence_length 65536
# --input_sentence_size 12000000
python train_convert_tokenizer_simple.py \
--vocab_size 150328 \
--data_name ${DATASET_PATH} \
--output_folder ${SAVE_TOKENIZER_PATH} \
--load_batch_size 1000 \
--input_sentence_size 24000000 \
--max_sequence_length 8192 \
--num_threads 48 \
--normalizer nfkc \
2>&1 | tee $LOGS_PATH/$TOKENIZER_NAME.txt
35 changes: 35 additions & 0 deletions scripts/train_split_digits.sh
@@ -0,0 +1,35 @@
set -x -e

TOKENIZATION_REPO=~/code/tokenization

cd $TOKENIZATION_REPO

TOKENIZER_NAME=tokenizer_alpha_weight_fixed_NFKC_split_digits

DATASET_PATH=~/tokenization_dataset/alpha # TODO: define where the concatenated dataset lives
SAVE_TOKENIZER_PATH=~/tokenizer/$TOKENIZER_NAME
LOGS_PATH=~/logs

mkdir -p $SAVE_TOKENIZER_PATH

export HF_DATASETS_OFFLINE=1

# Tokenization vocabulary size:
# - ceil(150_000 / (8 * 128)) * 8 * 128
# - special tokens:
# - 200 sentinel tokens
# ceil(150_000 / (8 * 128)) * 8 * 128 - 200 = 150328

# --max_sequence_length 65536
# --input_sentence_size 12000000
python train_convert_tokenizer_simple.py \
--vocab_size 150328 \
--data_name ${DATASET_PATH} \
--output_folder ${SAVE_TOKENIZER_PATH} \
--load_batch_size 1000 \
--input_sentence_size 24000000 \
--max_sequence_length 8192 \
--num_threads 48 \
--normalizer nfkc \
--split_digits \
2>&1 | tee $LOGS_PATH/$TOKENIZER_NAME.txt
40 changes: 40 additions & 0 deletions setup_gcp.sh
@@ -0,0 +1,40 @@
sudo apt-get update
sudo apt-get install git -y
sudo apt-get install wget -y
sudo apt-get install tmux -y

# Install Rust
sudo apt install build-essential -y
sudo apt-get install pkg-config -y
sudo apt-get install libssl-dev -y
curl https://sh.rustup.rs -sSf | sh -s -- -y
export PATH="$HOME/.cargo/bin:$PATH"

# Setup conda
cd ~
wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh
sh Miniconda3-py38_4.10.3-Linux-x86_64.sh
# You should enter a bunch of things manually

# Clone tokenizers
mkdir ~/code
cd ~/code
git clone https://github.com/huggingface/tokenizers.git
cd tokenizers
git checkout bigscience_fork
cd bindings/python
pip install setuptools_rust
pip install -e .

# Setup tokenization repo
mkdir -p ~/code
cd ~/code
git clone https://github.com/bigscience-workshop/tokenization.git
cd tokenization
git checkout thomas/train
pip install -r requirements.txt

# install datasets locally
mkdir -p ~/tokenization_dataset
cd ~/tokenization_dataset
gsutil -m cp -r gs://bigscience-backups/dataset/tokenization_dataset/* .