Commit 1165f04

Fix BPE tokenization for weird whitespace characters (Closes #199) (#208)
* Add new tokenizer unit test (#199)
* Perform `NFKC` normalization for sentencepiece models w/ precompiled charmap
* Fix JSDoc indentation
* Add problematic string to unit tests
* Use consistent BPE split token
* Add second problematic string
1 parent 86e68bf commit 1165f04

2 files changed: +17 -9 lines changed


src/tokenizers.js

Lines changed: 14 additions & 9 deletions
@@ -475,6 +475,8 @@ class BPE extends TokenizerModel {
     constructor(config) {
         super(config);
 
+        this.BPE_SPLIT_TOKEN = ' ';
+
         this.tokens_to_ids = config.vocab;
 
         this.unk_token_id = this.tokens_to_ids.get(config.unk_token);
@@ -486,7 +488,7 @@ class BPE extends TokenizerModel {
         }
 
         this.bpe_ranks = Object.fromEntries(config.merges.map((x, i) => [x, i]));
-        this.merges = config.merges.map(x => x.split(/\s+/))
+        this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
 
         this.end_of_word_suffix = config.end_of_word_suffix;
 
@@ -511,7 +513,7 @@ class BPE extends TokenizerModel {
         let prev_char = word[0];
         for (let i = 1; i < word.length; ++i) {
             let char = word[i];
-            pairs.add(`${prev_char} ${char}`);
+            pairs.add(prev_char + this.BPE_SPLIT_TOKEN + char);
             prev_char = char;
         }
         return Array.from(pairs);
@@ -548,7 +550,7 @@ class BPE extends TokenizerModel {
             if (!(bigram in this.bpe_ranks)) {
                 break;
             }
-            let [first, second] = bigram.split(/\s+/g)
+            let [first, second] = bigram.split(this.BPE_SPLIT_TOKEN);
             let new_word = [];
             let i = 0;
             let j = -1;
@@ -579,7 +581,7 @@ class BPE extends TokenizerModel {
                 pairs = this.get_pairs(word);
             }
         }
-        let final_word = word.join(" ");
+        let final_word = word.join(this.BPE_SPLIT_TOKEN);
         this.cache[token] = final_word;
         return final_word;
     }
@@ -593,7 +595,7 @@ class BPE extends TokenizerModel {
         let outputTokens = [];
 
         for (let token of tokens) {
-            let bpe_token_list = this.bpe(token).split(' ');
+            let bpe_token_list = this.bpe(token).split(this.BPE_SPLIT_TOKEN);
 
             for (let t of bpe_token_list) {
                 if (this.tokens_to_ids.has(t)) {
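The substance of the fix: in JavaScript regular expressions, `\s` matches Unicode whitespace generally, including U+00A0 (no-break space), not just the single U+0020 that joins a BPE merge pair. Any token that itself contains such a character was therefore mangled when pairs were split with `/\s+/`. A minimal sketch of the failure mode (the merge pair below is illustrative, not taken from a real vocabulary):

```js
// Illustrative merge pair: both tokens contain U+00A0 (no-break space),
// joined -- as entries in the merges list are -- by one regular space.
const pair = '\u00A0 \u00A0\u00A0';

// Old behaviour: /\s+/ also matches U+00A0, so the whole string is
// treated as one run of separators and both tokens are lost.
console.log(pair.split(/\s+/)); // [ '', '' ]

// New behaviour: splitting on the literal BPE_SPLIT_TOKEN (' ')
// recovers both tokens intact.
console.log(pair.split(' '));   // [ '\u00A0', '\u00A0\u00A0' ]
```

Using the same constant everywhere also guarantees that the string `bpe()` builds with `join(this.BPE_SPLIT_TOKEN)` can be re-split losslessly in `encode()`.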
@@ -801,10 +803,10 @@ class NormalizerSequence extends Normalizer {
         this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x));
     }
     /**
-    * Apply a sequence of Normalizers to the input text.
-    * @param {string} text The text to normalize.
-    * @returns {string} The normalized text.
-    */
+     * Apply a sequence of Normalizers to the input text.
+     * @param {string} text The text to normalize.
+     * @returns {string} The normalized text.
+     */
     normalize(text) {
         return this.normalizers.reduce((t, normalizer) => {
             return normalizer.normalize(t);
@@ -1758,6 +1760,9 @@ class Precompiled extends Normalizer {
      */
     normalize(text) {
         // TODO use this.charsmap
+        // For now, we just apply NFKC normalization
+        // https://github.com/huggingface/tokenizers/blob/291b2e23ae81cf94738835852213ce120152d121/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L34
+        text = text.normalize('NFKC');
         return text;
     }
 }
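The `Precompiled` normalizer still ignores its precompiled charmap (hence the TODO) and falls back to the built-in `String.prototype.normalize('NFKC')`. A quick sketch of what that stopgap does to the characters from the problematic strings in #199:

```js
// "you" + U+2026 (horizontal ellipsis) + two U+00A0 (no-break space)
const text = 'you\u2026\u00A0\u00A0';

// NFKC compatibility-folds U+2026 to three full stops ('...')
// and U+00A0 to a regular space (U+0020).
console.log(text.normalize('NFKC'));        // 'you...  '
console.log(text.normalize('NFKC').length); // 8
```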

tests/generate_tests.py

Lines changed: 3 additions & 0 deletions
@@ -38,6 +38,9 @@
         "The company was founded in 2016.",
         "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
         "I bought an apple for $1.00 at the store.",
+        "you… ",
+        "\u0079\u006F\u0075\u2026\u00A0\u00A0",
+        "\u0079\u006F\u0075\u2026\u00A0\u00A0\u0079\u006F\u0075\u2026\u00A0\u00A0",
     ],
     "custom": {
         "tiiuae/falcon-7b": [
