Commit 5216fb4

Fix ByteLevel pretokenizer
* Re-enable other Whisper tests
* Fix `ByteLevel` pretokenizer: only add a prefix space to the first word, when the option is enabled.
1 parent ad7e875 commit 5216fb4

3 files changed: +20 −18 lines

scripts/supported_models.py

Lines changed: 5 additions & 8 deletions
@@ -439,14 +439,11 @@
         'openai/whisper-medium.en',
         'openai/whisper-large',
         'openai/whisper-large-v2',
-
-        # TODO: add these models
-        # https://github.com/huggingface/transformers/issues/26043
-        # 'NbAiLab/nb-whisper-tiny-beta',
-        # 'NbAiLab/nb-whisper-base-beta',
-        # 'NbAiLab/nb-whisper-small-beta',
-        # 'NbAiLab/nb-whisper-medium-beta',
-        # 'NbAiLab/nb-whisper-large-beta',
+        'NbAiLab/nb-whisper-tiny-beta',
+        'NbAiLab/nb-whisper-base-beta',
+        'NbAiLab/nb-whisper-small-beta',
+        'NbAiLab/nb-whisper-medium-beta',
+        'NbAiLab/nb-whisper-large-beta',
     ],
     'xlm': [
         'xlm-clm-ende-1024',

src/tokenizers.js

Lines changed: 9 additions & 10 deletions
@@ -1229,19 +1229,18 @@ class ByteLevelPreTokenizer extends PreTokenizer {
      * @returns {string[]} An array of tokens.
      */
     pre_tokenize_text(text) {
+        // Add a leading space if the option is enabled
+        if (this.add_prefix_space && !text.startsWith(' ')) {
+            text = ' ' + text;
+        }
+
         // Split on whitespace and punctuation
         let tokens = this.use_regex ? (text.match(this.pattern) || []) : [text];
 
-        return tokens.map(token => {
-            if (this.add_prefix_space && !token.startsWith(' ')) {
-                token = ' ' + token;
-            }
-
-            // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            token = Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('');
-
-            return token;
-        });
+        // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+        return tokens.map(
+            token => Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('')
+        );
     }
 }
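
Why this matters: before the fix, the prefix space was added per token inside the `map`, so every token that did not already begin with a space (e.g. punctuation split off by the regex) received one. Adding the space once, to the input text before splitting, affects only the first word. A minimal standalone sketch of the difference (hypothetical code, not the library's API; `pattern` approximates the GPT-2 byte-level split regex):

// Approximation of the GPT-2 byte-level split regex (assumed for illustration).
const pattern = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu;

// Old (buggy) behavior: a prefix space was added to *every* token.
function preTokenizeOld(text) {
    return (text.match(pattern) || []).map(
        token => token.startsWith(' ') ? token : ' ' + token
    );
}

// Fixed behavior: the prefix space is added once, to the text, before splitting.
function preTokenizeNew(text) {
    if (!text.startsWith(' ')) text = ' ' + text;
    return text.match(pattern) || [];
}

console.log(preTokenizeOld('Hello, world')); // [ ' Hello', ' ,', ' world' ]  <- spurious space before ','
console.log(preTokenizeNew('Hello, world')); // [ ' Hello', ',', ' world' ]

The actual implementation additionally maps each token's bytes to unicode strings via `byte_encoder` (so ' ' becomes 'Ġ'), which this sketch omits.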

tests/generate_tests.py

Lines changed: 6 additions & 0 deletions
@@ -128,6 +128,12 @@ def generate_tokenizer_tests():
             # means the model does not use a tokenizer (e.g., vision models)
             continue
 
+        try:
+            # Disable dropout, if the model allows it
+            tokenizer.backend_tokenizer.model.dropout = 0
+        except AttributeError:
+            pass
+
         tokenizer_results = []
 
         shared_texts = TOKENIZER_TEST_DATA["shared"]
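
BPE dropout randomly skips merges during tokenization, so a model with nonzero dropout can produce different token sequences for the same input on every run; the generated test fixtures must be deterministic, hence dropout is forced to 0 where the backend model exposes it. A toy sketch of the effect (illustrative only, not the tokenizers library's implementation):

// Toy BPE with dropout: each applicable merge is skipped with probability
// `dropout`, so the same input may tokenize differently across runs
// unless dropout is 0.
function bpeTokenize(word, merges, dropout = 0) {
    let symbols = Array.from(word);
    for (const [a, b] of merges) {
        for (let i = 0; i < symbols.length - 1; ) {
            if (symbols[i] === a && symbols[i + 1] === b && Math.random() >= dropout) {
                symbols.splice(i, 2, a + b); // apply the merge
            } else {
                ++i;
            }
        }
    }
    return symbols;
}

const merges = [['h', 'e'], ['he', 'l'], ['l', 'o']];
console.log(bpeTokenize('hello', merges, 0));   // always [ 'hel', 'lo' ]
console.log(bpeTokenize('hello', merges, 0.5)); // varies from run to run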
