From ba433eac5ef5cbf45833900b3883e3392c845854 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Thu, 10 Feb 2022 18:27:47 +0100
Subject: [PATCH 1/5] small changes

---
 slurm/train_tokenizer.slurm       | 17 ++++++++++-------
 train_convert_tokenizer_simple.py |  5 ++++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index 2e52fc0..58ee9bd 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,10 +2,10 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=40 # number of cores per tasks
+#SBATCH --cpus-per-task=1 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --time 2:00:00 # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out # output file name
 #SBATCH --account=six@cpu

@@ -14,14 +14,14 @@ set -x -e
 source $six_ALL_CCFRWORK/start-prod
 conda activate thomas_data_tooling # Debug deepspeed temporarily

-TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+TOKENIZATION_REPO=$WORK/tokenization

 pushd $TOKENIZATION_REPO

 echo "Sharding and compressing seed id ${SEED_ID}"

 DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_roman

 mkdir -p $SAVE_TOKENIZER_PATH

@@ -35,11 +35,14 @@ export HF_DATASETS_CACHE=$SCRATCH/to_delete

 # ceil(150_000 / (8 * 128)) * 8 * 128 - 200 = 150328
 python train_convert_tokenizer_simple.py \
+    --input_sentence_size 0 \
     --vocab_size 150328 \
     --data_name ${DATASET_PATH} \
     --output_folder ${SAVE_TOKENIZER_PATH} \
     --load_batch_size 1000 \
-    --input_sentence_size 12000000 \
-    --max_sequence_length 65536 \
-    --num_threads 80
+    --max_sequence_length 8192 \
+    --num_threads 1
+
+
+    # --input_sentence_size 12000000 \

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 1178f02..29a0d44 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -129,7 +129,7 @@ def main():
             sequence_length_in_byte=args.max_sequence_length
         ),
         input_sentence_size=args.input_sentence_size,
-        shuffle_input_sentence=True,
+        shuffle_input_sentence=args.input_sentence_size > 0,
         model_prefix=str(tokenizer_path.absolute()),
         vocab_size=args.vocab_size,
         model_type="bpe",
@@ -143,6 +143,7 @@ def main():
         train_extremely_large_corpus=True
     )

+    logger.info("Done training the tokenizer. Starting tokenizer conversion")
     spm_model_path = tokenizer_path / f"tokenizer.model"
     original_tokenizer = SPMTokenizer(str(spm_model_path.absolute()))
     converter = SpmConverter(original_tokenizer)
@@ -162,5 +163,7 @@ def main():
         tokenizer_path / f"tokenizer_hf"
     )

+    logger.info("Done converting and saving the tokenizer.")
+
 if __name__ == "__main__":
     main()
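Note on [PATCH 1/5] (not part of the patch): with --input_sentence_size 0, SentencePiece trains on every sentence the iterator yields instead of drawing a random subsample, which is presumably why shuffle_input_sentence is now tied to args.input_sentence_size > 0 -- shuffling only matters when a subsample is taken. The vocab-size comment in the slurm script also checks out; a standalone sanity check in plain Python (variable names here are illustrative, not from the repo):

    import math

    multiple = 8 * 128                                      # 1024
    vocab = math.ceil(150_000 / multiple) * multiple - 200  # round up, reserve 200 slots
    assert vocab == 150_328                                 # matches --vocab_size 150328
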
From a69da530e178f734d86df718f42f5e8d160c48cf Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Fri, 11 Feb 2022 10:23:53 +0100
Subject: [PATCH 2/5] fixed bugs in preprocess text

---
 slurm/train_tokenizer.slurm       | 8 ++++----
 train_convert_tokenizer_simple.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index 58ee9bd..b78354b 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,10 +2,10 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=1 # number of cores per tasks
+#SBATCH --cpus-per-task=40 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 2:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out # output file name
 #SBATCH --account=six@cpu

@@ -20,8 +20,8 @@ pushd $TOKENIZATION_REPO

 echo "Sharding and compressing seed id ${SEED_ID}"

-DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_roman
+DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset_v2 # TODO: define where is concatenated dataset
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_ratios_v2

 mkdir -p $SAVE_TOKENIZER_PATH

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 29a0d44..4d4eec3 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -58,8 +58,8 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
         start = 0
         end = sequence_length
         while end - start != 0:
-            if end - start <= sequence_length:
-                # Sort sequence: we fit everything in size one line
+            if end - start < sequence_length or len(text) < sequence_length:
+                # Short sequence: we fit everything in size one line
                 row_results.append(text[start: end])
                 start = end
             else:
@@ -71,8 +71,8 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
                 else:
                     substring = matches[0]

-                start = len(substring)
-                end = start + min(sequence_length, len(text))
+                start += len(substring)
+                end = min(start + sequence_length, len(text))
                 row_results.append(substring)

         batch_results.append(row_results)

From 63523d3e1ba379bbc0302366da5d43642f6f9fc0 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Fri, 11 Feb 2022 10:44:14 +0100
Subject: [PATCH 3/5] dynamically compute average number of bytes per character

---
 train_convert_tokenizer_simple.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 4d4eec3..9076750 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -1,6 +1,7 @@
 import logging
 from pathlib import Path
 from typing import List
+import math

 import sentencepiece as spm
 from datasets import load_dataset, utils
@@ -25,8 +26,6 @@ def get_args():
     return parser.parse_args()


 def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
-    # FIXME: we use an approximation of byte length vs byte sequence
-    sequence_length = sequence_length_in_byte // 2
     slices = [(start, min(len(dataset), start + batch_size)) for start in range(0, len(dataset), batch_size)]
     for start, end in utils.tqdm(
@@ -38,12 +37,12 @@ def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
     ):
         # Load things by batch.
         batch = dataset[start: end]
-        batch_results = preprocess_text(batch, sequence_length)
+        batch_results = preprocess_text(batch, sequence_length_in_byte)
         for row_results in batch_results:
             for text in row_results:
                 yield text

-def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
+def preprocess_text(batch, sequence_length_in_byte: int) -> List[List[str]]:
     batch_results = []
     for text in batch["text"]:
         row_results = []
@@ -54,6 +53,12 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:

         text = text.strip()

+        # Compute an average of the number of bytes needed to encode a character for that sequence
+        # Needed since it will vary a lot depending on the language.
+        avg_bytes_per_character = math.ceil(len(text.encode('utf8')) / len(text))
+
+        sequence_length = sequence_length_in_byte // avg_bytes_per_character
+
         # shard text to be into substrings of size < sequence length
         start = 0
         end = sequence_length
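Note on [PATCH 2/5] and [PATCH 3/5] (not part of the patches): --max_sequence_length is a budget in bytes (it feeds sequence_length_in_byte), so the old global "// 2" guess is replaced here by a per-document estimate of how many UTF-8 bytes one character costs, which turns the byte budget into a character budget. A standalone illustration with toy strings (not from the dataset):

    import math

    def char_budget(text: str, sequence_length_in_byte: int) -> int:
        # ceil of the average UTF-8 bytes per character, as in the patch
        avg_bytes_per_character = math.ceil(len(text.encode("utf8")) / len(text))
        return sequence_length_in_byte // avg_bytes_per_character

    print(char_budget("hello world", 4096))   # 4096 -> ASCII is 1 byte per character
    print(char_budget("你好世界", 4096))       # 1365 -> CJK is ~3 bytes per character
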
From 8a58b5e8c37b4c5ef924cc795b3f15f165937747 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Fri, 11 Feb 2022 11:39:48 +0100
Subject: [PATCH 4/5] avoid empty sentences

---
 train_convert_tokenizer_simple.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 9076750..d5fe233 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -53,6 +53,9 @@ def preprocess_text(batch, sequence_length_in_byte: int) -> List[List[str]]:

         text = text.strip()

+        if len(text) == 0:
+            continue
+
         # Compute an average of the number of bytes needed to encode a character for that sequence
         # Needed since it will vary a lot depending on the language.
         avg_bytes_per_character = math.ceil(len(text.encode('utf8')) / len(text))
@@ -104,7 +107,7 @@ def main():
     )
     tokenizer_path = args.output_folder / "tokenizer"

-    dataset = load_dataset(args.data_name, data_files="**.jsonl.gz", split="train")
+    dataset = load_dataset(args.data_name, data_files="**.jsonl", split="train")
     logger.info(f"Dataset length: {len(dataset)}")

     # max_length = 0
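Note on [PATCH 4/5] (not part of the patch): the guard matters because after .strip() an all-whitespace document becomes an empty string, and the bytes-per-character average added in the previous patch would then divide by len(text) == 0. A minimal standalone reproduction (toy inputs):

    import math

    def avg_bytes_per_character(text: str) -> int:
        return math.ceil(len(text.encode("utf8")) / len(text))

    for raw in ["  some text  ", "   "]:
        text = raw.strip()
        if len(text) == 0:
            continue                            # the added guard: skip empty documents
        print(avg_bytes_per_character(text))    # only prints for the non-empty input
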
From 42715a139700c50ae08bbb4d762cd18139115802 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Tue, 15 Feb 2022 06:07:27 +0100
Subject: [PATCH 5/5] small changes to arguments

---
 slurm/train_tokenizer.slurm       | 19 +++++++++----------
 train_convert_tokenizer_simple.py | 10 +++++++---
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index b78354b..192937f 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,11 +2,12 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=40 # number of cores per tasks
+#SBATCH --cpus-per-task=40 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --time 12:00:00 # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out # output file name
+# #SBATCH --qos=qos_cpu-t4
 #SBATCH --account=six@cpu

 set -x -e
@@ -20,8 +21,8 @@ pushd $TOKENIZATION_REPO

 echo "Sharding and compressing seed id ${SEED_ID}"

-DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset_v2 # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_ratios_v2
+DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_equal_nfkc_24M_sentences

 mkdir -p $SAVE_TOKENIZER_PATH

@@ -35,14 +36,12 @@ export HF_DATASETS_CACHE=$SCRATCH/to_delete

 # ceil(150_000 / (8 * 128)) * 8 * 128 - 200 = 150328
 python train_convert_tokenizer_simple.py \
-    --input_sentence_size 0 \
     --vocab_size 150328 \
     --data_name ${DATASET_PATH} \
     --output_folder ${SAVE_TOKENIZER_PATH} \
     --load_batch_size 1000 \
-    --max_sequence_length 8192 \
-    --num_threads 1
-
-
-    # --input_sentence_size 12000000 \
+    --max_sequence_length 4096 \
+    --num_threads 1 \
+    --input_sentence_size 24_000_000 \
+    --normalizer nfkc

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index d5fe233..4e6869b 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -22,6 +22,8 @@ def get_args():
     parser.add_argument("--load_batch_size", type=int, default=1)
     parser.add_argument("--max_sequence_length", type=int, required=True)
     parser.add_argument("--input_sentence_size", type=int, required=True)
+    parser.add_argument("--normalizer", type=str, default="nmt_nfkc")
+    parser.add_argument("--remove-extra-whitespaces", action="store_true")
     return parser.parse_args()


@@ -148,15 +150,17 @@ def main():
         eos_id=2,
         pad_id=3,
         byte_fallback=True,
-        train_extremely_large_corpus=True
+        train_extremely_large_corpus=True,
+        normalization_rule_name=args.normalizer,
+        remove_extra_whitespaces=args.remove_extra_whitespaces
     )

     logger.info("Done training the tokenizer. Starting tokenizer conversion")
-    spm_model_path = tokenizer_path / f"tokenizer.model"
+    spm_model_path = tokenizer_path.with_suffix(".model")
     original_tokenizer = SPMTokenizer(str(spm_model_path.absolute()))
     converter = SpmConverter(original_tokenizer)
     hf_tokenizer = converter.converted()
-    tokenizer_json = tokenizer_path / f"tokenizer.json"
+    tokenizer_json = tokenizer_path.with_suffix(".json")
     hf_tokenizer.save(str(tokenizer_json.absolute()))

     # WIP:
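Note on [PATCH 5/5] (not part of the patch): --normalizer is passed straight through to SentencePiece's normalization_rule_name, where "nfkc" applies plain NFKC normalization without the NMT-specific rules of the default "nmt_nfkc". Once the converted tokenizer.json has been written, it can be loaded directly with the tokenizers library; a hedged usage sketch with a placeholder path (the real file sits next to the SentencePiece model prefix):

    from tokenizers import Tokenizer

    # "tokenizer.json" is a placeholder path, not the one used on the cluster
    tok = Tokenizer.from_file("tokenizer.json")
    print(tok.encode("Bonjour le monde").tokens)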