From ba433eac5ef5cbf45833900b3883e3392c845854 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Thu, 10 Feb 2022 18:27:47 +0100
Subject: [PATCH 1/5] small changes

---
 slurm/train_tokenizer.slurm       | 17 ++++++++++-------
 train_convert_tokenizer_simple.py |  5 ++++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index 2e52fc0..58ee9bd 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,10 +2,10 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=40 # number of cores per tasks
+#SBATCH --cpus-per-task=1 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --time 2:00:00 # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out # output file name
 #SBATCH --account=six@cpu

@@ -14,14 +14,14 @@ set -x -e
 source $six_ALL_CCFRWORK/start-prod
 conda activate thomas_data_tooling # Debug deepspeed temporarily

-TOKENIZATION_REPO=$WORK/code/big_science/tokenization
+TOKENIZATION_REPO=$WORK/tokenization

 pushd $TOKENIZATION_REPO

 echo "Sharding and compressing seed id ${SEED_ID}"

 DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_roman

 mkdir -p $SAVE_TOKENIZER_PATH

@@ -35,11 +35,14 @@ export HF_DATASETS_CACHE=$SCRATCH/to_delete

 # ceil(150_000 / (8 * 128)) * 8 * 128 - 200 = 150328
 python train_convert_tokenizer_simple.py \
+    --input_sentence_size 0 \
     --vocab_size 150328 \
     --data_name ${DATASET_PATH} \
     --output_folder ${SAVE_TOKENIZER_PATH} \
     --load_batch_size 1000 \
-    --input_sentence_size 12000000 \
-    --max_sequence_length 65536 \
-    --num_threads 80
+    --max_sequence_length 8192 \
+    --num_threads 1
+
+
+    # --input_sentence_size 12000000 \

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 1178f02..29a0d44 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -129,7 +129,7 @@ def main():
             sequence_length_in_byte=args.max_sequence_length
         ),
         input_sentence_size=args.input_sentence_size,
-        shuffle_input_sentence=True,
+        shuffle_input_sentence=args.input_sentence_size > 0,
         model_prefix=str(tokenizer_path.absolute()),
         vocab_size=args.vocab_size,
         model_type="bpe",
@@ -143,6 +143,7 @@ def main():
         train_extremely_large_corpus=True
     )

+    logger.info("Done training the tokenizer. Starting tokenizer conversion")
     spm_model_path = tokenizer_path / f"tokenizer.model"
     original_tokenizer = SPMTokenizer(str(spm_model_path.absolute()))
     converter = SpmConverter(original_tokenizer)
@@ -162,5 +163,7 @@ def main():
         tokenizer_path / f"tokenizer_hf"
     )

+    logger.info("Done converting and saving the tokenizer.")
+
 if __name__ == "__main__":
     main()
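Note on [PATCH 1/5] (not part of the patch): with --input_sentence_size 0, SentencePiece trains on every sentence the iterator yields instead of drawing a random subsample, which is presumably why shuffle_input_sentence is now tied to args.input_sentence_size > 0 -- shuffling only matters when a subsample is taken. The vocab-size comment in the slurm script also checks out; a standalone sanity check in plain Python (variable names here are illustrative, not from the repo):

    import math

    multiple = 8 * 128                                      # 1024
    vocab = math.ceil(150_000 / multiple) * multiple - 200  # round up, reserve 200 slots
    assert vocab == 150_328                                 # matches --vocab_size 150328
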
From a69da530e178f734d86df718f42f5e8d160c48cf Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Fri, 11 Feb 2022 10:23:53 +0100
Subject: [PATCH 2/5] fixed bugs in preprocess text

---
 slurm/train_tokenizer.slurm       | 8 ++++----
 train_convert_tokenizer_simple.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index 58ee9bd..b78354b 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,10 +2,10 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=1 # number of cores per tasks
+#SBATCH --cpus-per-task=40 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 2:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out # output file name
 #SBATCH --account=six@cpu

@@ -20,8 +20,8 @@ pushd $TOKENIZATION_REPO

 echo "Sharding and compressing seed id ${SEED_ID}"

-DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_roman
+DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset_v2 # TODO: define where is concatenated dataset
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_ratios_v2

 mkdir -p $SAVE_TOKENIZER_PATH

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 29a0d44..4d4eec3 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -58,8 +58,8 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
         start = 0
         end = sequence_length
         while end - start != 0:
-            if end - start <= sequence_length:
-                # Sort sequence: we fit everything in size one line
+            if end - start < sequence_length or len(text) < sequence_length:
+                # Short sequence: we fit everything in size one line
                 row_results.append(text[start: end])
                 start = end
             else:
@@ -71,8 +71,8 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
                 else:
                     substring = matches[0]

-                start = len(substring)
-                end = start + min(sequence_length, len(text))
+                start += len(substring)
+                end = min(start + sequence_length, len(text))
                 row_results.append(substring)

         batch_results.append(row_results)

From 63523d3e1ba379bbc0302366da5d43642f6f9fc0 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Fri, 11 Feb 2022 10:44:14 +0100
Subject: [PATCH 3/5] dynamically compute average number of bytes per character

---
 train_convert_tokenizer_simple.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 4d4eec3..9076750 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -1,6 +1,7 @@
 import logging
 from pathlib import Path
 from typing import List
+import math

 import sentencepiece as spm
 from datasets import load_dataset, utils
@@ -25,8 +26,6 @@ def get_args():
     return parser.parse_args()


 def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
-    # FIXME: we use an approximation of byte length vs byte sequence
-    sequence_length = sequence_length_in_byte // 2
     slices = [(start, min(len(dataset), start + batch_size)) for start in range(0, len(dataset), batch_size)]
     for start, end in utils.tqdm(
@@ -38,12 +37,12 @@ def dataset_iterator(dataset, batch_size: int, sequence_length_in_byte: int):
     ):
         # Load things by batch.
         batch = dataset[start: end]
-        batch_results = preprocess_text(batch, sequence_length)
+        batch_results = preprocess_text(batch, sequence_length_in_byte)
         for row_results in batch_results:
             for text in row_results:
                 yield text

-def preprocess_text(batch, sequence_length: int) -> List[List[str]]:
+def preprocess_text(batch, sequence_length_in_byte: int) -> List[List[str]]:
     batch_results = []
     for text in batch["text"]:
         row_results = []
@@ -54,6 +53,12 @@ def preprocess_text(batch, sequence_length: int) -> List[List[str]]:

         text = text.strip()

+        # Compute an average of the number of bytes needed to encode a character for that sequence
+        # Needed since it will vary a lot depending on the language.
+        avg_bytes_per_character = math.ceil(len(text.encode('utf8')) / len(text))
+
+        sequence_length = sequence_length_in_byte // avg_bytes_per_character
+
         # shard text to be into substrings of size < sequence length
         start = 0
         end = sequence_length
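Note on [PATCH 2/5] and [PATCH 3/5] (not part of the patches): --max_sequence_length is a budget in bytes (it feeds sequence_length_in_byte), so the old global "// 2" guess is replaced here by a per-document estimate of how many UTF-8 bytes one character costs, which turns the byte budget into a character budget. A standalone illustration with toy strings (not from the dataset):

    import math

    def char_budget(text: str, sequence_length_in_byte: int) -> int:
        # ceil of the average UTF-8 bytes per character, as in the patch
        avg_bytes_per_character = math.ceil(len(text.encode("utf8")) / len(text))
        return sequence_length_in_byte // avg_bytes_per_character

    print(char_budget("hello world", 4096))   # 4096 -> ASCII is 1 byte per character
    print(char_budget("你好世界", 4096))       # 1365 -> CJK is ~3 bytes per character
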
From 8a58b5e8c37b4c5ef924cc795b3f15f165937747 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Fri, 11 Feb 2022 11:39:48 +0100
Subject: [PATCH 4/5] avoid empty sentences

---
 train_convert_tokenizer_simple.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index 9076750..d5fe233 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -53,6 +53,9 @@ def preprocess_text(batch, sequence_length_in_byte: int) -> List[List[str]]:

         text = text.strip()

+        if len(text) == 0:
+            continue
+
         # Compute an average of the number of bytes needed to encode a character for that sequence
         # Needed since it will vary a lot depending on the language.
         avg_bytes_per_character = math.ceil(len(text.encode('utf8')) / len(text))
@@ -104,7 +107,7 @@ def main():
     )
     tokenizer_path = args.output_folder / "tokenizer"

-    dataset = load_dataset(args.data_name, data_files="**.jsonl.gz", split="train")
+    dataset = load_dataset(args.data_name, data_files="**.jsonl", split="train")
     logger.info(f"Dataset length: {len(dataset)}")

     # max_length = 0
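Note on [PATCH 4/5] (not part of the patch): the guard matters because after .strip() an all-whitespace document becomes an empty string, and the bytes-per-character average added in the previous patch would then divide by len(text) == 0. A minimal standalone reproduction (toy inputs):

    import math

    def avg_bytes_per_character(text: str) -> int:
        return math.ceil(len(text.encode("utf8")) / len(text))

    for raw in ["  some text  ", "   "]:
        text = raw.strip()
        if len(text) == 0:
            continue                            # the added guard: skip empty documents
        print(avg_bytes_per_character(text))    # only prints for the non-empty input
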
From 42715a139700c50ae08bbb4d762cd18139115802 Mon Sep 17 00:00:00 2001
From: RomanCast
Date: Tue, 15 Feb 2022 06:07:27 +0100
Subject: [PATCH 5/5] small changes to arguments

---
 slurm/train_tokenizer.slurm       | 19 +++++++++----------
 train_convert_tokenizer_simple.py | 10 +++++++---
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/slurm/train_tokenizer.slurm b/slurm/train_tokenizer.slurm
index b78354b..192937f 100644
--- a/slurm/train_tokenizer.slurm
+++ b/slurm/train_tokenizer.slurm
@@ -2,11 +2,12 @@
 #SBATCH --job-name=train_tokenizer
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
-#SBATCH --cpus-per-task=40 # number of cores per tasks
+#SBATCH --cpus-per-task=40 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
 #SBATCH --partition=cpu_p1
-#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --time 12:00:00 # maximum execution time (HH:MM:SS)
 #SBATCH --output=logs/train_tokenizer/%x-%j.out # output file name
+# #SBATCH --qos=qos_cpu-t4
 #SBATCH --account=six@cpu

 set -x -e
@@ -20,8 +21,8 @@ pushd $TOKENIZATION_REPO

 echo "Sharding and compressing seed id ${SEED_ID}"

-DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset_v2 # TODO: define where is concatenated dataset
-SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_ratios_v2
+DATASET_PATH=$six_ALL_CCFRSCRATCH/tokenizer/dataset/tokenization_dataset # TODO: define where is concatenated dataset
+SAVE_TOKENIZER_PATH=$six_ALL_CCFRSCRATCH/tokenizer/tokenizer_equal_nfkc_24M_sentences

 mkdir -p $SAVE_TOKENIZER_PATH

@@ -35,14 +36,12 @@ export HF_DATASETS_CACHE=$SCRATCH/to_delete

 # ceil(150_000 / (8 * 128)) * 8 * 128 - 200 = 150328
 python train_convert_tokenizer_simple.py \
-    --input_sentence_size 0 \
     --vocab_size 150328 \
     --data_name ${DATASET_PATH} \
     --output_folder ${SAVE_TOKENIZER_PATH} \
     --load_batch_size 1000 \
-    --max_sequence_length 8192 \
-    --num_threads 1
-
-
-    # --input_sentence_size 12000000 \
+    --max_sequence_length 4096 \
+    --num_threads 1 \
+    --input_sentence_size 24_000_000 \
+    --normalizer nfkc

diff --git a/train_convert_tokenizer_simple.py b/train_convert_tokenizer_simple.py
index d5fe233..4e6869b 100644
--- a/train_convert_tokenizer_simple.py
+++ b/train_convert_tokenizer_simple.py
@@ -22,6 +22,8 @@ def get_args():
     parser.add_argument("--load_batch_size", type=int, default=1)
     parser.add_argument("--max_sequence_length", type=int, required=True)
     parser.add_argument("--input_sentence_size", type=int, required=True)
+    parser.add_argument("--normalizer", type=str, default="nmt_nfkc")
+    parser.add_argument("--remove-extra-whitespaces", action="store_true")
     return parser.parse_args()


@@ -148,15 +150,17 @@ def main():
         eos_id=2,
         pad_id=3,
         byte_fallback=True,
-        train_extremely_large_corpus=True
+        train_extremely_large_corpus=True,
+        normalization_rule_name=args.normalizer,
+        remove_extra_whitespaces=args.remove_extra_whitespaces
     )

     logger.info("Done training the tokenizer. Starting tokenizer conversion")
-    spm_model_path = tokenizer_path / f"tokenizer.model"
+    spm_model_path = tokenizer_path.with_suffix(".model")
     original_tokenizer = SPMTokenizer(str(spm_model_path.absolute()))
     converter = SpmConverter(original_tokenizer)
     hf_tokenizer = converter.converted()
-    tokenizer_json = tokenizer_path / f"tokenizer.json"
+    tokenizer_json = tokenizer_path.with_suffix(".json")
     hf_tokenizer.save(str(tokenizer_json.absolute()))

     # WIP:
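Note on [PATCH 5/5] (not part of the patch): --normalizer is passed straight through to SentencePiece's normalization_rule_name, where "nfkc" applies plain NFKC normalization without the NMT-specific rules of the default "nmt_nfkc". Once the converted tokenizer.json has been written, it can be loaded directly with the tokenizers library; a hedged usage sketch with a placeholder path (the real file sits next to the SentencePiece model prefix):

    from tokenizers import Tokenizer

    # "tokenizer.json" is a placeholder path, not the one used on the cluster
    tok = Tokenizer.from_file("tokenizer.json")
    print(tok.encode("Bonjour le monde").tokens)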