Commit 5216fb4

Fix ByteLevel pretokenizer
* Re-enable other Whisper tests
* Fix `ByteLevel` pretokenizer: only add a prefix space to the first word, when the option is enabled.
1 parent ad7e875 commit 5216fb4

3 files changed: +20 −18 lines

scripts/supported_models.py

Lines changed: 5 additions & 8 deletions
@@ -439,14 +439,11 @@
         'openai/whisper-medium.en',
         'openai/whisper-large',
         'openai/whisper-large-v2',
-
-        # TODO: add these models
-        # https://github.com/huggingface/transformers/issues/26043
-        # 'NbAiLab/nb-whisper-tiny-beta',
-        # 'NbAiLab/nb-whisper-base-beta',
-        # 'NbAiLab/nb-whisper-small-beta',
-        # 'NbAiLab/nb-whisper-medium-beta',
-        # 'NbAiLab/nb-whisper-large-beta',
+        'NbAiLab/nb-whisper-tiny-beta',
+        'NbAiLab/nb-whisper-base-beta',
+        'NbAiLab/nb-whisper-small-beta',
+        'NbAiLab/nb-whisper-medium-beta',
+        'NbAiLab/nb-whisper-large-beta',
     ],
     'xlm': [
         'xlm-clm-ende-1024',

src/tokenizers.js

Lines changed: 9 additions & 10 deletions
@@ -1229,19 +1229,18 @@ class ByteLevelPreTokenizer extends PreTokenizer {
      * @returns {string[]} An array of tokens.
      */
     pre_tokenize_text(text) {
+        // Add a leading space if the option is enabled
+        if (this.add_prefix_space && !text.startsWith(' ')) {
+            text = ' ' + text;
+        }
+
         // Split on whitespace and punctuation
         let tokens = this.use_regex ? (text.match(this.pattern) || []) : [text];
 
-        return tokens.map(token => {
-            if (this.add_prefix_space && !token.startsWith(' ')) {
-                token = ' ' + token;
-            }
-
-            // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            token = Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('');
-
-            return token;
-        });
+        // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+        return tokens.map(
+            token => Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('')
+        );
     }
 }
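
Why this matters: before the fix, the prefix space was added per token inside the `map`, so every token that did not already begin with a space (e.g. punctuation split off by the regex) received one. Adding the space once, to the input text before splitting, affects only the first word. A minimal standalone sketch of the difference (hypothetical code, not the library's API; `pattern` approximates the GPT-2 byte-level split regex):

// Approximation of the GPT-2 byte-level split regex (assumed for illustration).
const pattern = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu;

// Old (buggy) behavior: a prefix space was added to *every* token.
function preTokenizeOld(text) {
    return (text.match(pattern) || []).map(
        token => token.startsWith(' ') ? token : ' ' + token
    );
}

// Fixed behavior: the prefix space is added once, to the text, before splitting.
function preTokenizeNew(text) {
    if (!text.startsWith(' ')) text = ' ' + text;
    return text.match(pattern) || [];
}

console.log(preTokenizeOld('Hello, world')); // [ ' Hello', ' ,', ' world' ]  <- spurious space before ','
console.log(preTokenizeNew('Hello, world')); // [ ' Hello', ',', ' world' ]

The actual implementation additionally maps each token's bytes to unicode strings via `byte_encoder` (so ' ' becomes 'Ġ'), which this sketch omits.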

tests/generate_tests.py

Lines changed: 6 additions & 0 deletions
@@ -128,6 +128,12 @@ def generate_tokenizer_tests():
             # means the model does not use a tokenizer (e.g., vision models)
             continue
 
+        try:
+            # Disable dropout, if the model allows it
+            tokenizer.backend_tokenizer.model.dropout = 0
+        except AttributeError:
+            pass
+
         tokenizer_results = []
 
         shared_texts = TOKENIZER_TEST_DATA["shared"]
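
BPE dropout randomly skips merges during tokenization, so a model with nonzero dropout can produce different token sequences for the same input on every run; the generated test fixtures must be deterministic, hence dropout is forced to 0 where the backend model exposes it. A toy sketch of the effect (illustrative only, not the tokenizers library's implementation):

// Toy BPE with dropout: each applicable merge is skipped with probability
// `dropout`, so the same input may tokenize differently across runs
// unless dropout is 0.
function bpeTokenize(word, merges, dropout = 0) {
    let symbols = Array.from(word);
    for (const [a, b] of merges) {
        for (let i = 0; i < symbols.length - 1; ) {
            if (symbols[i] === a && symbols[i + 1] === b && Math.random() >= dropout) {
                symbols.splice(i, 2, a + b); // apply the merge
            } else {
                ++i;
            }
        }
    }
    return symbols;
}

const merges = [['h', 'e'], ['he', 'l'], ['l', 'o']];
console.log(bpeTokenize('hello', merges, 0));   // always [ 'hel', 'lo' ]
console.log(bpeTokenize('hello', merges, 0.5)); // varies from run to run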
