@@ -475,6 +475,8 @@ class BPE extends TokenizerModel {
     constructor(config) {
         super(config);
 
+        this.BPE_SPLIT_TOKEN = ' ';
+
         this.tokens_to_ids = config.vocab;
 
         this.unk_token_id = this.tokens_to_ids.get(config.unk_token);
@@ -486,7 +488,7 @@ class BPE extends TokenizerModel {
         }
 
         this.bpe_ranks = Object.fromEntries(config.merges.map((x, i) => [x, i]));
-        this.merges = config.merges.map(x => x.split(/\s+/))
+        this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
 
         this.end_of_word_suffix = config.end_of_word_suffix;
 
@@ -511,7 +513,7 @@ class BPE extends TokenizerModel {
         let prev_char = word[0];
         for (let i = 1; i < word.length; ++i) {
             let char = word[i];
-            pairs.add(`${prev_char} ${char}`);
+            pairs.add(prev_char + this.BPE_SPLIT_TOKEN + char);
             prev_char = char;
         }
         return Array.from(pairs);
@@ -548,7 +550,7 @@ class BPE extends TokenizerModel {
             if (!(bigram in this.bpe_ranks)) {
                 break;
             }
-            let [first, second] = bigram.split(/\s+/g)
+            let [first, second] = bigram.split(this.BPE_SPLIT_TOKEN);
             let new_word = [];
             let i = 0;
             let j = -1;
@@ -579,7 +581,7 @@ class BPE extends TokenizerModel {
                 pairs = this.get_pairs(word);
             }
         }
-        let final_word = word.join(" ");
+        let final_word = word.join(this.BPE_SPLIT_TOKEN);
        this.cache[token] = final_word;
        return final_word;
    }
@@ -593,7 +595,7 @@ class BPE extends TokenizerModel {
         let outputTokens = [];
 
         for (let token of tokens) {
-            let bpe_token_list = this.bpe(token).split(' ');
+            let bpe_token_list = this.bpe(token).split(this.BPE_SPLIT_TOKEN);
 
             for (let t of bpe_token_list) {
                 if (this.tokens_to_ids.has(t)) {
@@ -801,10 +803,10 @@ class NormalizerSequence extends Normalizer {
         this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x));
     }
     /**
-    * Apply a sequence of Normalizers to the input text.
-    * @param {string} text The text to normalize.
-    * @returns {string} The normalized text.
-    */
+     * Apply a sequence of Normalizers to the input text.
+     * @param {string} text The text to normalize.
+     * @returns {string} The normalized text.
+     */
     normalize(text) {
         return this.normalizers.reduce((t, normalizer) => {
             return normalizer.normalize(t);
@@ -1758,6 +1760,9 @@ class Precompiled extends Normalizer {
      */
     normalize(text) {
         // TODO use this.charsmap
+        // For now, we just apply NFKC normalization
+        // https://github.com/huggingface/tokenizers/blob/291b2e23ae81cf94738835852213ce120152d121/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L34
+        text = text.normalize('NFKC');
         return text;
     }
 }
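The Precompiled hunk falls back to Unicode NFKC via the built-in String.prototype.normalize instead of consulting this.charsmap (the TODO is left in place). A quick illustration of what that call does, independent of the library:

// Not from the patch: the effect of 'NFKC' on a couple of compatibility characters.
console.log('ｈｅｌｌｏ'.normalize('NFKC')); // 'hello'  (fullwidth Latin letters -> ASCII)
console.log('ﬁle'.normalize('NFKC'));       // 'file'   (the ﬁ ligature is expanded)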