diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index 2e52fc0..192937f 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,11 +2,12 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=40          # number of cores per tasks
+#SBATCH --cpus-per-task=40           # number of cores per tasks
 #SBATCH --hint=nomultithread         # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --time 12:00:00              # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out           # output file name
+# #SBATCH --qos=qos_cpu-t4
 #SBATCH --account=six@cpu
 
 set -x -e
@@ -14,14 +15,14 @@ set -x -e
 source $six_ALL_CCFRWORK/start-prod
 conda activate thomas_data_tooling # Debug deepspeed temporarily
 
-TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+TOKENIZATION_REPO=$WORK/tokenization
 pushd $TOKENIZATION_REPO
 
 echo "Sharding and compressing seed id ${SEED_ID}"
 
 
 DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_equal_nfkc_24M_sentences
 
 mkdir -p $SAVE_TOKENIZER_PATH
 
@@ -39,7 +40,8 @@ python train_convert_tokenizer_simple.py \
     --data_name ${DATASET_PATH} \
     --output_folder ${SAVE_TOKENIZER_PATH} \
     --load_batch_size 1000 \
-    --input_sentence_size 12000000 \
-    --max_sequence_length 65536 \
-    --num_threads 80
+    --max_sequence_length 4096 \
+    --num_threads 1 \
+    --input_sentence_size 24_000_000 \
+    --normalizer nfkc
 
diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 1178f02..4e6869b 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -1,6 +1,7 @@
 import logging
 from pathlib import Path
 from typing import List
+import math
 
 import sentencepiece as spm
 from datasets import load_dataset, utils
@@ -21,12 +22,12 @@ def get_args():
     parser.add_argument("--load_batch_size", type=int, default=1)
     parser.add_argument("--max_sequence_length", type=int, required=True)
     parser.add_argument("--input_sentence_size", type=int, required=True)
+    parser.add_argument("--normalizer", type=str, default="nmt_nfkc")
+    parser.add_argument("--remove-extra-whitespaces", action="store_true")
     return parser.parse_args()
 
 
 def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
-    # FIXME: we use an approximation of byte length vs byte sequence
-    sequence_length = sequence_length_in_byte // 2
     slices = [(start, min(len(dataset), start + batch_size)) for start in range(0, len(dataset), batch_size)]
 
     for start, end in utils.tqdm(
@@ -38,12 +39,12 @@ def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
     ):
         # Load things by batch.
         batch = dataset[start: end]
-        batch_results = preprocess_text(batch, sequence_length)
+        batch_results = preprocess_text(batch, sequence_length_in_byte)
         for row_results in batch_results:
             for text in row_results:
                 yield text
 
-def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
+def preprocess_text(batch, sequence_length_in_byte: int) -> List[List[str]]:
     batch_results = []
     for text in batch["text"]:
         row_results = []
@@ -54,12 +55,21 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
 
 
         text = text.strip()
+        if len(text) == 0:
+            continue
+
+        # Compute an average of the number of bytes needed to encode a character for that sequence
+        # Needed since it will vary a lot depending on the language.
+        avg_bytes_per_character = math.ceil(len(text.encode('utf8')) / len(text))
+
+        sequence_length = sequence_length_in_byte // avg_bytes_per_character
+
         # shard text to be into substrings of size < sequence length
         start = 0
         end = sequence_length
         while end - start != 0:
-            if end - start <= sequence_length:
-                # Sort sequence: we fit everything in size one line
+            if end - start < sequence_length or len(text) < sequence_length:
+                # Short sequence: we fit everything in size one line
                 row_results.append(text[start: end])
                 start = end
             else:
@@ -71,8 +81,8 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
                 else:
                     substring = matches[0]
 
-                start = len(substring)
-                end = start + min(sequence_length, len(text))
+                start += len(substring)
+                end = min(start + sequence_length, len(text))
                 row_results.append(substring)
 
         batch_results.append(row_results)
@@ -99,7 +109,7 @@ def main():
     )
 
     tokenizer_path = args.output_folder / "tokenizer"
-    dataset = load_dataset(args.data_name, data_files="**.jsonl.gz", split="train")
+    dataset = load_dataset(args.data_name, data_files="**.jsonl", split="train")
     logger.info(f"Dataset length: {len(dataset)}")
 
     # max_length = 0
@@ -129,7 +139,7 @@ def main():
            sequence_length_in_byte=args.max_sequence_length
         ),
         input_sentence_size=args.input_sentence_size,
-        shuffle_input_sentence=True,
+        shuffle_input_sentence=args.input_sentence_size > 0,
         model_prefix=str(tokenizer_path.absolute()),
         vocab_size=args.vocab_size,
         model_type="bpe",
@@ -140,14 +150,17 @@ def main():
         eos_id=2,
         pad_id=3,
         byte_fallback=True,
-        train_extremely_large_corpus=True
+        train_extremely_large_corpus=True,
+        normalization_rule_name=args.normalizer,
+        remove_extra_whitespaces=args.remove_extra_whitespaces
     )
 
-    spm_model_path = tokenizer_path / f"tokenizer.model"
+    logger.info("Done training the tokenizer. Starting tokenizer conversion")
+    spm_model_path = tokenizer_path.with_suffix(".model")
     original_tokenizer = SPMTokenizer(str(spm_model_path.absolute()))
     converter = SpmConverter(original_tokenizer)
     hf_tokenizer = converter.converted()
-    tokenizer_json = tokenizer_path / f"tokenizer.json"
+    tokenizer_json = tokenizer_path.with_suffix(".json")
     hf_tokenizer.save(str(tokenizer_json.absolute()))
 
     # WIP:
@@ -162,5 +175,7 @@ def main():
         tokenizer_path / f"tokenizer_hf"
     )
 
+    logger.info("Done converting and saving the tokenizer.")
+
 if __name__ == "__main__":
     main()
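
Not part of the patch itself: the preprocess_text change above converts a byte budget (--max_sequence_length, in bytes) into a per-text character budget before slicing, since UTF-8 characters take 1 to 4 bytes depending on the language. Below is a minimal standalone sketch of that heuristic; the helper name chunk_by_byte_budget is hypothetical and used only for illustration.

import math
from typing import List


def chunk_by_byte_budget(text: str, max_bytes: int) -> List[str]:
    """Split text into character slices whose UTF-8 size stays roughly under max_bytes."""
    text = text.strip()
    if not text:
        return []
    # UTF-8 needs 1 byte per character for ASCII but up to 4 for CJK or emoji,
    # so derive an approximate character budget from the byte budget per text.
    avg_bytes_per_char = math.ceil(len(text.encode("utf8")) / len(text))
    char_budget = max_bytes // avg_bytes_per_char
    return [text[i:i + char_budget] for i in range(0, len(text), char_budget)]


# A Japanese string (3 bytes per character) gets a smaller character budget
# than an ASCII string of the same character length.
print(len(chunk_by_byte_budget("a" * 10_000, max_bytes=4096)))   # 3 chunks of up to 4096 chars
print(len(chunk_by_byte_budget("あ" * 10_000, max_bytes=4096)))  # 8 chunks of up to 1365 chars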