diff --git a/compare_tokenizers.py b/compare_tokenizers.py
new file mode 100644
index 0000000..deb0b2b
--- /dev/null
+++ b/compare_tokenizers.py
@@ -0,0 +1,107 @@
+import argparse
+from functools import partial
+from typing import Set, Dict
+
+from datasets import load_dataset
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--interactive", action="store_true")
+    parser.add_argument("--dataset-name", type=str)
+    parser.add_argument("--subset-name", type=str)
+    parser.add_argument("--text-columns", type=lambda x: set(x.split(",")))
+    parser.add_argument("--num-proc", type=int, default=1)
+
+    args = parser.parse_args()
+
+    if args.dataset_name is None:
+        assert args.interactive
+    else:
+        assert args.text_columns is not None and len(args.text_columns) > 0
+
+    return args
+
+def check_encoding(tokenizer, text):
+    print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
+
+# def check_spm_is_equal_hf(hf_tokenizer, spm_tokenizer, text):
+#     hf_tokenized = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(text))
+#     spm_tokenized = spm_tokenizer.encode(text, out_type=str)
+#     print(f"Difference between my tokenizer vs the multilingual one: {len(hf_tokenized)} vs {len(spm_tokenized)}")
+#     print(hf_tokenized)
+#     print(spm_tokenized)
+
+def compare_to_previous_multilingual_tokenizer(hf_tokenizer, mul_tokenizer, text):
+    hf_tokenized = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(text))
+    mul_tokenized = mul_tokenizer.convert_ids_to_tokens(mul_tokenizer.encode(text))
+    print(f"Difference between my tokenizer vs the multilingual one: {len(hf_tokenized)} vs {len(mul_tokenized)}")
+    print(hf_tokenized)
+    print(mul_tokenized)
+
+def interactively_test_tokenizer(hf_tokenizer, mul_tokenizer):
+    while True:
+        print(" ++++++ New input +++")
+        text = input()
+        print(" ++++++ Check encoding +++++")
+        check_encoding(hf_tokenizer, text)
+        print(" ++++++ Compare with previous alpha tokenizer +++++")
+        compare_to_previous_multilingual_tokenizer(hf_tokenizer, mul_tokenizer, text)
+
+def batch_tokenize(batch, tokenizer_name: str, tokenizer: PreTrainedTokenizerFast, text_columns: Set[str]):
+    for text_column in text_columns:
+        # Store the token strings so that `compute_metrics` counts tokens, not characters.
+        batch[f"{tokenizer_name}_{text_column}"] = [
+            tokenizer.convert_ids_to_tokens(ids) for ids in tokenizer(batch[text_column]).input_ids
+        ]
+
+    return batch
+
+def batch_tokenize_on_all_tokenizers(batch, tokenizers: Dict[str, PreTrainedTokenizerFast], text_columns):
+    for tokenizer_name, tokenizer in tokenizers.items():
+        batch = batch_tokenize(batch, tokenizer_name, tokenizer, text_columns)
+    return batch
+
+def run_on_dataset(tokenizers: Dict[str, PreTrainedTokenizerFast], dataset, text_columns, num_proc):
+    dataset = dataset.map(
+        partial(batch_tokenize_on_all_tokenizers, tokenizers=tokenizers, text_columns=text_columns),
+        batched=True,
+        num_proc=num_proc
+    )
+    return dataset
+
+def compute_metrics(dataset):
+    # Compute the number of tokens per column (the lower the better).
+    number_of_tokens = {}
+    for column_name in dataset.column_names:
+        number_of_tokens[column_name] = sum([len(elt) for elt in dataset[column_name]])
+    print(number_of_tokens)
+
+def main():
+    args = get_args()
+
+    # Samson's multilingual alpha-weighted tokenizer, used as the baseline.
+    mul_tokenizer = AutoTokenizer.from_pretrained("bigscience/oscar_13_languages_alpha_weight")
+
+    # The HF tokenizer trained in this repository.
+    tokenizer = AutoTokenizer.from_pretrained("bigscience-catalogue-lm-data/tokenizer_v0")
+
+    if args.interactive:
+        interactively_test_tokenizer(tokenizer, mul_tokenizer)
+    else:
+        dataset = load_dataset(args.dataset_name, args.subset_name, split="train")
+        dataset = dataset.remove_columns(set(dataset.column_names) - args.text_columns)
+
+        tokenizers = {
+            "bs_tokenizer_v0": tokenizer,
+            "samson_tokenizer": mul_tokenizer,
+        }
+
+        dataset = run_on_dataset(tokenizers, dataset, text_columns=args.text_columns, num_proc=args.num_proc)
+
+        compute_metrics(dataset)
+
+if __name__ == "__main__":
+    main()
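Note: the totals printed by `compute_metrics` are raw token counts per column. When comparing two tokenizers it is often easier to read them as a fertility-style ratio (tokens per whitespace-separated word). The helper below is an illustrative sketch, not part of the script; the column names follow the `{tokenizer_name}_{text_column}` pattern produced by `batch_tokenize`.

# Hypothetical helper: tokens per whitespace-separated word for one text column.
def fertility(dataset, token_column: str, text_column: str) -> float:
    num_tokens = sum(len(tokens) for tokens in dataset[token_column])
    num_words = sum(len(text.split()) for text in dataset[text_column])
    return num_tokens / max(num_words, 1)

# e.g. on the XNLI premises tokenized by both tokenizers:
# fertility(dataset, "bs_tokenizer_v0_premise", "premise")
# fertility(dataset, "samson_tokenizer_premise", "premise")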
diff --git a/fix_hf_tokenizer.py b/fix_hf_tokenizer.py
new file mode 100644
index 0000000..2385446
--- /dev/null
+++ b/fix_hf_tokenizer.py
@@ -0,0 +1,84 @@
+import json
+from argparse import ArgumentParser
+import itertools
+
+
+def get_args():
+    parser = ArgumentParser()
+    parser.add_argument("--tokenizer-json-path", type=str, help="Path to the `tokenizer.json` file")
+    return parser.parse_args()
+
+def _remove_replace(data):
+    normalizer = data["normalizer"]
+    if normalizer["type"] == "Sequence":
+        normalizers = normalizer["normalizers"]
+        assert len(normalizers) == 2
+
+        updated_normalizers = [elt for elt in normalizers if elt["type"] != "Replace"]
+
+        assert len(updated_normalizers) == 1
+
+        data["normalizer"] = updated_normalizers[0]
+        normalizer = data["normalizer"]
+
+    assert normalizer["type"] == "Precompiled"
+    return data
+
+def _add_empty_strings(data):
+    # Add runs of whitespace (2 to 20 "▁" characters) to the vocabulary.
+    num_max_spaces = 20
+    space_char = "▁"
+
+    if space_char * 2 not in data["model"]["vocab"]:
+        offset_idx = len(data["model"]["vocab"]) - 2
+        for idx in range(num_max_spaces, 1, -1):
+            print(idx + offset_idx, " : ", space_char * idx, " : ", len(space_char * idx))
+            data["model"]["vocab"][space_char * idx] = idx + offset_idx
+
+        lines_to_append = []
+        for tup in itertools.product([space_char * idx for idx in range(1, num_max_spaces - 1)], repeat=2):
+            merge_rule = " ".join(tup)
+            if len(merge_rule) < num_max_spaces + 1:
+                lines_to_append.append(merge_rule)
+        lines_to_append = sorted(lines_to_append, key=lambda x: len(x))
+
+        data["model"]["merges"].extend(lines_to_append)
+
+    # Fix the normalizer so that it emulates `add_prefix_space` and the Metaspace pre-tokenizer.
+    data["normalizer"] = {
+        "type": "Sequence",
+        "normalizers": [
+            data["normalizer"],
+            {"type": "Replace", "pattern": {"Regex": "\n"}, "content": "\n "},
+            # ^ matches the beginning of the string as well as the beginning of lines in multiline mode.
+            {"type": "Replace", "pattern": {"Regex": "^ "}, "content": ""},  # add_prefix_space
+            {"type": "Replace", "pattern": {"Regex": "^"}, "content": " "},
+            {"type": "Replace", "pattern": {"Regex": "\n "}, "content": "\n"},
+            # ^ matches the beginning of the string as well as the beginning of lines in multiline mode.
+            {"type": "Replace", "pattern": {"String": " "}, "content": "▁"},
+        ]}
+
+    data["pre_tokenizer"] = None
+    data["decoder"] = {
+        "type": "Metaspace",
+        "replacement": "▁",
+        "add_prefix_space": True
+    }
+    return data
+
+
+def main():
+    args = get_args()
+
+    with open(args.tokenizer_json_path, "r") as fi:
+        data = json.load(fi)
+
+    data = _remove_replace(data)
+    data = _add_empty_strings(data)
+
+    with open(args.tokenizer_json_path, "w") as fo:
+        json.dump(data, fo, indent=2)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
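Note: after patching `tokenizer.json`, the whitespace handling is easy to sanity-check with the `tokenizers` library. A minimal sketch (the file path is a placeholder): runs of spaces should now map to the dedicated "▁"-run tokens, and the Metaspace decoder should approximately round-trip the spacing.

# Quick sanity check after running fix_hf_tokenizer.py (path is a placeholder).
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
text = "def f():\n    return 1"
enc = tok.encode(text)
print(enc.tokens)           # runs of spaces should appear as multi-"▁" tokens
print(tok.decode(enc.ids))  # the Metaspace decoder should roughly reproduce the original spacing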
+ {"type": "Replace", "pattern": {"String": " "}, "content": "▁"}, + ]} + + data["pre_tokenizer"] = None + data["decoder"] = { + "type": "Metaspace", + "replacement": "▁", + "add_prefix_space": True + } + return data + + +def main(): + args = get_args() + + with open(args.tokenizer_json_path, "r") as fi: + data = json.load(fi) + + data = _remove_replace(data) + data = _add_empty_strings(data) + + with open(args.tokenizer_json_path, "w") as fo: + json.dump(data, fo, indent=2) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/get_problematic_samples.py b/get_problematic_samples.py new file mode 100644 index 0000000..87f52c5 --- /dev/null +++ b/get_problematic_samples.py @@ -0,0 +1,67 @@ +import logging +from functools import partial +from pathlib import Path + +from datasets import load_dataset +from datasets.utils.logging import set_verbosity_info +import argparse, os + +from .train_convert_tokenizer_simple import preprocess_text + +set_verbosity_info() +logger = logging.getLogger(__name__) + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_name", "-d", type=str, required=True) + parser.add_argument("--pathological_samples_path", "-o", type=Path, required=True) + parser.add_argument("--load_batch_size", type=int, default=1) + parser.add_argument("--max_sequence_length", type=int, required=True) + parser.add_argument("--input_sentence_size", type=int, required=True) + parser.add_argument("--num_proc", type=int, required=True) + + return parser.parse_args() + +def get_not_utf_8_compatible(batch, sequence_length: int): + batch_results = preprocess_text(batch, sequence_length=sequence_length) + + return { + "utf-8-not-compatible": [text.encode() for row_results in batch_results for text in row_results if text] + } + + +def main(): + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + args = get_args() + logger.info( + f"** The job is runned with the following arguments: **\n{args}\n **** " + ) + + dataset = load_dataset(args.data_name, data_files="**.jsonl.gz", split="train") + + logger.info(f"Dataset length: {len(dataset)}") + + # Try to find all that are not castable to utf-8 + # FIXME: we use an approximation of byte length vs byte sequence + sequence_length = args.input_sentence_size // 2 + dataset = dataset.map( + partial(get_not_utf_8_compatible, sequence_length=sequence_length), + batched=True, + num_proc=args.pathological_samples_path, + remove_columns=dataset.column_names + ) + + logger.info(f"Invalid text: {dataset}") + dataset.to_json( + args.save_path, + num_proc=args.num_proc + ) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..602dce7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +sentencepiece +datasets +transformers \ No newline at end of file diff --git a/scripts/train_no_split_numbers.sh b/scripts/train_no_split_numbers.sh new file mode 100644 index 0000000..97c3224 --- /dev/null +++ b/scripts/train_no_split_numbers.sh @@ -0,0 +1,34 @@ +set -x -e + +TOKENIZATION_REPO=~/code/tokenization + +cd $TOKENIZATION_REPO + +TOKENIZER_NAME=tokenizer_alpha_weight_fixed_NFKC_no_split_numbers + +DATASET_PATH=~/tokenization_dataset/alpha # TODO: define where is concatenated dataset +SAVE_TOKENIZER_PATH=~/tokenizer/tokenizer_alpha_weight_fixed_NFKC_no_split_numbers +LOGS_PATH=~/logs + +mkdir -p $SAVE_TOKENIZER_PATH + +export 
diff --git a/scripts/train_split_digits.sh b/scripts/train_split_digits.sh
new file mode 100644
index 0000000..b47c286
--- /dev/null
+++ b/scripts/train_split_digits.sh
@@ -0,0 +1,35 @@
+set -x -e
+
+TOKENIZATION_REPO=~/code/tokenization
+
+cd $TOKENIZATION_REPO
+
+TOKENIZER_NAME=tokenizer_alpha_weight_fixed_NFKC_split_digits
+
+DATASET_PATH=~/tokenization_dataset/alpha  # TODO: define where the concatenated dataset lives
+SAVE_TOKENIZER_PATH=~/tokenizer/$TOKENIZER_NAME
+LOGS_PATH=~/logs
+
+mkdir -p $SAVE_TOKENIZER_PATH
+mkdir -p $LOGS_PATH
+
+export HF_DATASETS_OFFLINE=1
+
+# Tokenizer vocabulary size:
+# - round 150_000 up to a multiple of 8 * 128: ceil(150_000 / (8 * 128)) * 8 * 128 = 150_528
+# - reserve 200 sentinel (special) tokens
+# => 150_528 - 200 = 150_328
+
+# --max_sequence_length 65536
+# --input_sentence_size 12000000
+python train_convert_tokenizer_simple.py \
+    --vocab_size 150328 \
+    --data_name ${DATASET_PATH} \
+    --output_folder ${SAVE_TOKENIZER_PATH} \
+    --load_batch_size 1000 \
+    --input_sentence_size 24000000 \
+    --max_sequence_length 8192 \
+    --num_threads 48 \
+    --normalizer nfkc \
+    --split_digits \
+    2>&1 | tee $LOGS_PATH/$TOKENIZER_NAME.txt
\ No newline at end of file
diff --git a/setup_gcp.sh b/setup_gcp.sh
new file mode 100644
index 0000000..28ee918
--- /dev/null
+++ b/setup_gcp.sh
@@ -0,0 +1,40 @@
+sudo apt-get update
+sudo apt-get install git -y
+sudo apt-get install wget -y
+sudo apt-get install tmux -y
+
+# Install Rust
+sudo apt-get install build-essential -y
+sudo apt-get install pkg-config -y
+sudo apt-get install libssl-dev -y
+curl https://sh.rustup.rs -sSf | sh -s -- -y
+export PATH="$HOME/.cargo/bin:$PATH"
+
+# Setup conda
+cd ~
+wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh
+sh Miniconda3-py38_4.10.3-Linux-x86_64.sh
+# The installer prompts for a few inputs manually.
+
+# Clone the tokenizers fork
+mkdir -p ~/code
+cd ~/code
+git clone https://github.com/huggingface/tokenizers.git
+cd tokenizers
+git checkout bigscience_fork
+cd bindings/python
+pip install setuptools_rust
+pip install -e .
+
+# Setup the tokenization repo
+mkdir -p ~/code
+cd ~/code
+git clone https://github.com/bigscience-workshop/tokenization.git
+cd tokenization
+git checkout thomas/train
+pip install -r requirements.txt
+
+# Install the dataset locally
+mkdir -p ~/tokenization_dataset
+cd ~/tokenization_dataset
+gsutil -m cp -r gs://bigscience-backups/dataset/tokenization_dataset/* .
diff --git a/slurm/compare_tokenizer.slurm b/slurm/compare_tokenizer.slurm
new file mode 100644
index 0000000..9951a3e
--- /dev/null
+++ b/slurm/compare_tokenizer.slurm
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --job-name=compare_tokenizers
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1          # crucial - only one task per node!
+#SBATCH --cpus-per-task=40           # number of cores per task
+#SBATCH --hint=nomultithread         # we get physical cores, not logical ones
+#SBATCH --partition=cpu_p1
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=logs/compare_tokenizers/%x-%j.out  # output file name
+#SBATCH --account=six@cpu

set -x -e
+
+source $six_ALL_CCFRWORK/start-prod
+conda activate thomas_data_tooling
+
+TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+
+pushd $TOKENIZATION_REPO
+
+export HF_DATASETS_OFFLINE=1
+
+python compare_tokenizers.py \
+    --dataset-name xnli \
+    --subset-name en \
+    --text-columns hypothesis,premise \
+    --num-proc 40
\ No newline at end of file
diff --git a/slurm/get_problematic_samples.slurm b/slurm/get_problematic_samples.slurm
new file mode 100644
index 0000000..5908f6d
--- /dev/null
+++ b/slurm/get_problematic_samples.slurm
@@ -0,0 +1,42 @@
+#!/bin/bash
+#SBATCH --job-name=get_problematic_samples
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1          # crucial - only one task per node!
+#SBATCH --cpus-per-task=40           # number of cores per task
+#SBATCH --hint=nomultithread         # we get physical cores, not logical ones
+#SBATCH --partition=cpu_p1
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=logs/get_problematic_samples/%x-%j.out  # output file name
+#SBATCH --account=six@cpu
+
+set -x -e
+
+source $six_ALL_CCFRWORK/start-prod
+conda activate thomas_data_tooling
+
+TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+
+pushd $TOKENIZATION_REPO
+
+DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset  # TODO: define where the concatenated dataset lives
+PATHOLOGICAL_SAMPLES_PATH=$six_ALL_CCFRSCRATCH/tokenizer/pathological_samples.jsonl  # TODO: adjust the output location
+
+export HF_DATASETS_OFFLINE=1
+export HF_DATASETS_CACHE=$SCRATCH/to_delete
+
+python get_problematic_samples.py \
+    --data_name ${DATASET_PATH} \
+    --pathological_samples_path ${PATHOLOGICAL_SAMPLES_PATH} \
+    --load_batch_size 1000 \
+    --input_sentence_size 12000000 \
+    --max_sequence_length 65536 \
+    --num_proc 40
diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
new file mode 100644
index 0000000..bb9d59a
--- /dev/null
+++ b/slurm/train_tokenizer.slurm
@@ -0,0 +1,48 @@
+#!/bin/bash
+#SBATCH --job-name=train_tokenizer
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1          # crucial - only one task per node!
+#SBATCH --cpus-per-task=40           # number of cores per task
+#SBATCH --hint=nomultithread         # we get physical cores, not logical ones
+#SBATCH --partition=cpu_p1
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=logs/train_tokenizer/%x-%j.out  # output file name
+#SBATCH --account=six@cpu
+
+set -x -e
+
+source $six_ALL_CCFRWORK/start-prod
+conda activate thomas_data_tooling
+
+TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+
+pushd $TOKENIZATION_REPO
+
+DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset  # TODO: define where the concatenated dataset lives
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_equal_weight_fixed_NFKC
+
+mkdir -p $SAVE_TOKENIZER_PATH
+
+export HF_DATASETS_OFFLINE=1
+export HF_DATASETS_CACHE=$SCRATCH/to_delete
+
+# Tokenizer vocabulary size:
+# - round 150_000 up to a multiple of 8 * 128: ceil(150_000 / (8 * 128)) * 8 * 128 = 150_528
+# - reserve 200 sentinel (special) tokens
+# => 150_528 - 200 = 150_328
+
+# --max_sequence_length 65536
+# --input_sentence_size 12000000
+python train_convert_tokenizer_simple.py \
+    --vocab_size 150328 \
+    --data_name ${DATASET_PATH} \
+    --output_folder ${SAVE_TOKENIZER_PATH} \
+    --load_batch_size 1000 \
+    --input_sentence_size 24000000 \
+    --max_sequence_length 8192 \
+    --num_threads 80 \
+    --normalizer nfkc
+
diff --git a/slurm/train_yttm_tokenizer.slurm b/slurm/train_yttm_tokenizer.slurm
new file mode 100644
index 0000000..a888e49
--- /dev/null
+++ b/slurm/train_yttm_tokenizer.slurm
@@ -0,0 +1,41 @@
+#!/bin/bash
+#SBATCH --job-name=train_yttm_tokenizer
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1          # crucial - only one task per node!
+#SBATCH --cpus-per-task=40           # number of cores per task
+#SBATCH --hint=nomultithread         # we get physical cores, not logical ones
+#SBATCH --partition=cpu_p1
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=logs/train_tokenizer/%x-%j.out  # output file name
+#SBATCH --account=six@cpu
+
+set -x -e
+
+source $six_ALL_CCFRWORK/start-prod
+conda activate thomas_data_tooling
+
+TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+
+pushd $TOKENIZATION_REPO
+
+TXT_DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset/data.txt  # TODO: define where the concatenated dataset lives
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_equal_weight_fixed
+
+mkdir -p $SAVE_TOKENIZER_PATH
+
+export HF_DATASETS_OFFLINE=1
+export HF_DATASETS_CACHE=$SCRATCH/to_delete
+
+# Tokenizer vocabulary size:
+# - round 150_000 up to a multiple of 8 * 128: ceil(150_000 / (8 * 128)) * 8 * 128 = 150_528
+# - reserve 200 sentinel (special) tokens
+# => 150_528 - 200 = 150_328
+
+python train_youtokentome_tokenizer.py \
+    --vocab-size 150328 \
+    --txt-data $TXT_DATASET_PATH \
+    --output-folder $SAVE_TOKENIZER_PATH \
+    --num-proc 80
diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 51343d5..90427f7 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -1,40 +1,181 @@
+import logging
+import math
+from pathlib import Path
+from typing import List
+
 import sentencepiece as spm
-from datasets import load_dataset
+from datasets import load_dataset, utils
+from datasets.utils.logging import set_verbosity_info
 from transformers.convert_slow_tokenizer import SpmConverter
-import argparse, os
+from transformers import PreTrainedTokenizerFast
+import argparse
+
+set_verbosity_info()
+logger = logging.getLogger(__name__)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vocab_size", "-v", type=int, required=True)
+    parser.add_argument("--data_name", "-d", type=str, required=True)
+    parser.add_argument("--output_folder", "-o", type=Path, required=True)
+    parser.add_argument("--num_threads", "-th", type=int, required=True)
+    parser.add_argument("--load_batch_size", type=int, default=1)
+    parser.add_argument("--max_sequence_length", type=int, required=True)
+    parser.add_argument("--input_sentence_size", type=int, required=True)
+    parser.add_argument("--normalizer", type=str, required=True)
+    parser.add_argument("--remove-extra-whitespaces", action="store_true")
+    parser.add_argument("--split_by_number", action="store_true")
+    parser.add_argument("--split_digits", action="store_true")
+
+    return parser.parse_args()
+
+def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
+    slices = [(start, min(len(dataset), start + batch_size)) for start in range(0, len(dataset), batch_size)]
+    for start, end in utils.tqdm(
+        slices,
+        total=len(slices),
+        unit="ba",
+        disable=bool(utils.logging.get_verbosity() == utils.logging.NOTSET),
+        desc="Loading dataset to sentencepiece",
+    ):
+        # Load the rows by batch.
+        batch = dataset[start: end]
+        batch_results = preprocess_text(batch, sequence_length_in_byte)
+        for row_results in batch_results:
+            for text in row_results:
+                yield text
+
+def preprocess_text(batch, sequence_length_in_byte: int) -> List[List[str]]:
+    batch_results = []
+    for text in batch["text"]:
+        row_results = []
+
+        # Skip None values.
+        if not text:
+            continue
+        text = text.strip()
-parser = argparse.ArgumentParser()
-parser.add_argument("--vocab_size", "-v", type=int, required=False, default=150000)
-parser.add_argument("--data_name", "-d", type=str, required=True)
-parser.add_argument("--output_folder", "-o", type=str, required=False, default='./')
-parser.add_argument("--num_threads", "-th", type=int, required=False, default=90)
+        if len(text) == 0:
+            continue
+
+        # Compute an average of the number of bytes needed to encode one character of this sequence.
+        # Needed since it varies a lot depending on the language.
+        avg_bytes_per_character = math.ceil(len(text.encode('utf8')) / len(text))
-tokenizer_path = os.path.join(args.output_folder, "tokenizer")
+        sequence_length = sequence_length_in_byte // avg_bytes_per_character
-def dataset_iterator(self, dataset):
-    for i in range(len(dataset)):
-        yield dataset[i]["text"] # assume relevant data is stored in 'text' field (datasets convention)
+
+        # Shard the text into substrings of fewer than sequence_length characters, cutting at whitespace when possible.
+        start = 0
+        end = min(sequence_length, len(text))
+        while end - start != 0:
+            if end - start < sequence_length:
+                # Short sequence: everything fits in one chunk.
+                row_results.append(text[start: end])
+                start = end
+            else:
+                candidates = text[start:end]
+                matches = candidates.rsplit(" ", 1)
+                if matches[0] == "":
+                    # If the whitespace is the first and only occurrence in the sequence, we just feed everything.
+                    substring = candidates
+                else:
+                    substring = matches[0]
+                start += len(substring)
+                end = min(start + sequence_length, len(text))
+                row_results.append(substring)
+
+        batch_results.append(row_results)
+    return batch_results
 
 class SPMTokenizer:
     def __init__(self, vocab_file):
         self.vocab_file = vocab_file
 
+# def reduce_max_text_length_on_shard(index: int, num_shards: int, dataset: Dataset, batch_size: int):
+#     shard = dataset.shard(num_shards=num_shards, index=index)
+#     return max([len(text) for text in dataset_iterator(shard, batch_size)])
+
+def main():
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    args = get_args()
+    logger.info(
+        f"** The job is run with the following arguments: **\n{args}\n **** "
+    )
+    tokenizer_path = args.output_folder / "tokenizer"
+
+    dataset = load_dataset(args.data_name, data_files="**.jsonl", split="train")
+
+    logger.info(f"Dataset length: {len(dataset)}")
+
+    # max_length = 0
+    # for text in dataset_iterator(dataset, args.load_batch_size):
+    #     length = len(text)
+    #     if max_length < length:
+    #         max_length = length
+
+    # # Parallel version
+    # with Pool(args.num_threads) as pool:
+    #     max_per_shard = pool.map(
+    #         partial(
+    #             reduce_max_text_length_on_shard,
+    #             num_shards=args.num_threads,
+    #             dataset=dataset,
+    #             batch_size=args.load_batch_size,
+    #         ),
+    #         range(args.num_threads)
+    #     )
+    #     max_length = max(max_per_shard)
+    # logger.info(f"Max length: {max_length}")
+
+    spm.SentencePieceTrainer.train(
+        sentence_iterator=dataset_iterator(
+            dataset,
+            args.load_batch_size,
+            sequence_length_in_byte=args.max_sequence_length
+        ),
+        input_sentence_size=args.input_sentence_size,
+        shuffle_input_sentence=True,
+        model_prefix=str(tokenizer_path.absolute()),
+        vocab_size=args.vocab_size,
+        model_type="bpe",
+        max_sentence_length=args.max_sequence_length,
+        num_threads=args.num_threads,
+        unk_id=0,
+        bos_id=1,
+        eos_id=2,
+        pad_id=3,
+        byte_fallback=True,
+        train_extremely_large_corpus=True,
+        normalization_rule_name=args.normalizer,
+        remove_extra_whitespaces=args.remove_extra_whitespaces,
+        split_by_number=args.split_by_number,
+        split_digits=args.split_digits,
+    )
-dataset = load_dataset(args.data_name)
+
+    spm_model_path = args.output_folder / "tokenizer.model"
+    original_tokenizer = SPMTokenizer(str(spm_model_path.absolute()))
+    converter = SpmConverter(original_tokenizer)
+    hf_tokenizer = converter.converted()
+    tokenizer_json = args.output_folder / "tokenizer.json"
+    hf_tokenizer.save(str(tokenizer_json.absolute()))
-spm.SentencePieceTrainer.train(sentence_iterator=dataset_iterator(dataset),
-                               model_prefix=tokenizer_path,
-                               vocab_size=args.vocab_size,
-                               model_type="bpe",
-                               max_sentence_length=4096,
-                               num_threads=args.num_threads,
-                               byte_fallback=True,
-                               train_extremely_large_corpus=True)
+
+    # WIP: also export a `transformers`-ready tokenizer.
+    tokenizer = PreTrainedTokenizerFast(
+        tokenizer_file=str(tokenizer_json.absolute()),
+        unk_token="<unk>",
+        eos_token="</s>",
+        bos_token="<s>",
+        pad_token="<pad>",
+    )
+    tokenizer.save_pretrained(
+        args.output_folder / "tokenizer_hf"
+    )
-original_tokenizer = SPMTokenizer(tokenizer_path + ".model")
-converter = SpmConverter(original_tokenizer)
-hf_tokenizer = converter.converted()
-hf_tokenizer.save(tokenizer_path + ".json")
+
+if __name__ == "__main__":
+    main()
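Note: `preprocess_text` above cuts each document into chunks that fit a byte budget, preferring whitespace boundaries. The quickest way to see the behaviour is a toy batch; a minimal sketch, assuming the refactored script is importable from the working directory:

# Toy batch for the chunking logic above (the inputs are illustrative).
from train_convert_tokenizer_simple import preprocess_text

batch = {"text": [None, "a short line", "a much longer line that should be cut at a whitespace boundary " * 4]}
# With a 64-byte budget and ASCII text, avg_bytes_per_character == 1, so every chunk is at most 64 characters.
for chunks in preprocess_text(batch, sequence_length_in_byte=64):
    print([len(chunk) for chunk in chunks])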
diff --git a/train_youtokentome_tokenizer.py b/train_youtokentome_tokenizer.py
new file mode 100644
index 0000000..98e03c2
--- /dev/null
+++ b/train_youtokentome_tokenizer.py
@@ -0,0 +1,46 @@
+import argparse
+import logging
+from pathlib import Path
+
+import youtokentome as yttm
+from datasets.utils.logging import set_verbosity_info
+
+set_verbosity_info()
+logger = logging.getLogger(__name__)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vocab-size", type=int, required=True)
+    parser.add_argument("--txt-data", type=str, required=True)
+    parser.add_argument("--output-folder", type=Path, required=True)
+    parser.add_argument("--num-proc", type=int, required=True)
+
+    return parser.parse_args()
+
+def main():
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    args = get_args()
+    logger.info(
+        f"** The job is run with the following arguments: **\n{args}\n **** "
+    )
+    tokenizer_path = args.output_folder / "tokenizer_yttm"
+
+    yttm.BPE.train(
+        data=args.txt_data,
+        model=str(tokenizer_path.absolute()),
+        vocab_size=args.vocab_size,
+        coverage=0.9995,
+        n_threads=args.num_proc,
+        pad_id=0,
+        unk_id=1,
+        bos_id=2,
+        eos_id=3
+    )
+
+if __name__ == "__main__":
+    main()
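Note: once `train_convert_tokenizer_simple.py` finishes, the `tokenizer_hf` folder written at the end can be loaded back with `transformers` for a quick smoke test; a minimal sketch, with the folder path as a placeholder:

# Smoke test for the exported tokenizer (the path below is a placeholder).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/tokenizer_hf")
ids = tok("Hello world\n  indented line")["input_ids"]
print(tok.convert_ids_to_tokens(ids))
print(tok.decode(ids))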