Commit 1165f04

Fix BPE tokenization for weird whitespace characters (Closes #199) (#208)
* Add new tokenizer unit test (#199)
* Perform `NFKC` normalization for sentencepiece models w/ precompiled charmap
* Fix JSDoc indentation
* Add problematic string to unit tests
* Use consistent BPE split token
* Add second problematic string
1 parent 86e68bf commit 1165f04

2 files changed: +17 -9 lines changed


src/tokenizers.js

Lines changed: 14 additions & 9 deletions
@@ -475,6 +475,8 @@ class BPE extends TokenizerModel {
     constructor(config) {
         super(config);
 
+        this.BPE_SPLIT_TOKEN = ' ';
+
         this.tokens_to_ids = config.vocab;
 
         this.unk_token_id = this.tokens_to_ids.get(config.unk_token);
@@ -486,7 +488,7 @@ class BPE extends TokenizerModel {
         }
 
         this.bpe_ranks = Object.fromEntries(config.merges.map((x, i) => [x, i]));
-        this.merges = config.merges.map(x => x.split(/\s+/))
+        this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
 
         this.end_of_word_suffix = config.end_of_word_suffix;
 
@@ -511,7 +513,7 @@ class BPE extends TokenizerModel {
         let prev_char = word[0];
         for (let i = 1; i < word.length; ++i) {
             let char = word[i];
-            pairs.add(`${prev_char} ${char}`);
+            pairs.add(prev_char + this.BPE_SPLIT_TOKEN + char);
             prev_char = char;
         }
         return Array.from(pairs);
@@ -548,7 +550,7 @@ class BPE extends TokenizerModel {
             if (!(bigram in this.bpe_ranks)) {
                 break;
             }
-            let [first, second] = bigram.split(/\s+/g)
+            let [first, second] = bigram.split(this.BPE_SPLIT_TOKEN);
             let new_word = [];
             let i = 0;
             let j = -1;
@@ -579,7 +581,7 @@ class BPE extends TokenizerModel {
                 pairs = this.get_pairs(word);
             }
         }
-        let final_word = word.join(" ");
+        let final_word = word.join(this.BPE_SPLIT_TOKEN);
         this.cache[token] = final_word;
         return final_word;
     }
@@ -593,7 +595,7 @@ class BPE extends TokenizerModel {
         let outputTokens = [];
 
         for (let token of tokens) {
-            let bpe_token_list = this.bpe(token).split(' ');
+            let bpe_token_list = this.bpe(token).split(this.BPE_SPLIT_TOKEN);
 
             for (let t of bpe_token_list) {
                 if (this.tokens_to_ids.has(t)) {
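The substance of the fix: in JavaScript regular expressions, `\s` matches Unicode whitespace generally, including U+00A0 (no-break space), not just the single U+0020 that joins a BPE merge pair. Any token that itself contains such a character was therefore mangled when pairs were split with `/\s+/`. A minimal sketch of the failure mode (the merge pair below is illustrative, not taken from a real vocabulary):

```js
// Illustrative merge pair: both tokens contain U+00A0 (no-break space),
// joined -- as entries in the merges list are -- by one regular space.
const pair = '\u00A0 \u00A0\u00A0';

// Old behaviour: /\s+/ also matches U+00A0, so the whole string is
// treated as one run of separators and both tokens are lost.
console.log(pair.split(/\s+/)); // [ '', '' ]

// New behaviour: splitting on the literal BPE_SPLIT_TOKEN (' ')
// recovers both tokens intact.
console.log(pair.split(' '));   // [ '\u00A0', '\u00A0\u00A0' ]
```

Using the same constant everywhere also guarantees that the string `bpe()` builds with `join(this.BPE_SPLIT_TOKEN)` can be re-split losslessly in `encode()`.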
@@ -801,10 +803,10 @@ class NormalizerSequence extends Normalizer {
         this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x));
     }
     /**
-    * Apply a sequence of Normalizers to the input text.
-    * @param {string} text The text to normalize.
-    * @returns {string} The normalized text.
-    */
+     * Apply a sequence of Normalizers to the input text.
+     * @param {string} text The text to normalize.
+     * @returns {string} The normalized text.
+     */
     normalize(text) {
         return this.normalizers.reduce((t, normalizer) => {
             return normalizer.normalize(t);
@@ -1758,6 +1760,9 @@ class Precompiled extends Normalizer {
      */
     normalize(text) {
         // TODO use this.charsmap
+        // For now, we just apply NFKC normalization
+        // https://github.com/huggingface/tokenizers/blob/291b2e23ae81cf94738835852213ce120152d121/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L34
+        text = text.normalize('NFKC');
         return text;
     }
 }
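The `Precompiled` normalizer still ignores its precompiled charmap (hence the TODO) and falls back to the built-in `String.prototype.normalize('NFKC')`. A quick sketch of what that stopgap does to the characters from the problematic strings in #199:

```js
// "you" + U+2026 (horizontal ellipsis) + two U+00A0 (no-break space)
const text = 'you\u2026\u00A0\u00A0';

// NFKC compatibility-folds U+2026 to three full stops ('...')
// and U+00A0 to a regular space (U+0020).
console.log(text.normalize('NFKC'));        // 'you...  '
console.log(text.normalize('NFKC').length); // 8
```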

tests/generate_tests.py

Lines changed: 3 additions & 0 deletions
@@ -38,6 +38,9 @@
         "The company was founded in 2016.",
         "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
         "I bought an apple for $1.00 at the store.",
+        "you… ",
+        "\u0079\u006F\u0075\u2026\u00A0\u00A0",
+        "\u0079\u006F\u0075\u2026\u00A0\u00A0\u0079\u006F\u0075\u2026\u00A0\u00A0",
     ],
     "custom": {
         "tiiuae/falcon-7b": [
