File tree Expand file tree Collapse file tree 1 file changed +9
-1
lines changed Expand file tree Collapse file tree 1 file changed +9
-1
lines changed Original file line number Diff line number Diff line change @@ -630,10 +630,12 @@ class BPE extends TokenizerModel {
630630 * Create a BPE instance.
631631 * @param {Object } config The configuration object for BPE.
632632 * @param {Object } config.vocab A mapping of tokens to ids.
633+ * @param {string[] } config.merges An array of BPE merges as strings.
633634 * @param {string } config.unk_token The unknown token used for out of vocabulary words.
634635 * @param {string } config.end_of_word_suffix The suffix to place at the end of each word.
635636 * @param {string } [config.continuing_subword_suffix] The suffix to insert between words.
636- * @param {Array } config.merges An array of BPE merges as strings.
637+ * @param {boolean } [config.byte_fallback=false] Whether to use the SentencePiece (spm) byte-fallback trick (defaults to false).
638+ * @param {boolean } [config.ignore_merges=false] Whether or not to match tokens with the vocab before using merges.
637639 */
638640 constructor ( config ) {
639641 super ( config ) ;
@@ -665,6 +667,8 @@ class BPE extends TokenizerModel {
665667 this . text_encoder = new TextEncoder ( ) ;
666668 }
667669
670+ this . ignore_merges = this . config . ignore_merges ?? false ;
671+
668672 /** @type {Map<string, string[]> } */
669673 this . cache = new Map ( ) ;
670674 }
@@ -826,6 +830,10 @@ class BPE extends TokenizerModel {
826830 const outputTokens = [ ] ;
827831
828832 for ( const token of tokens ) {
833+ if ( this . ignore_merges && this . tokens_to_ids . has ( token ) ) {
834+ outputTokens . push ( token ) ;
835+ continue ;
836+ }
829837 const bpe_token_list = this . bpe ( token ) ;
830838
831839 for ( const t of bpe_token_list ) {
You can’t perform that action at this time.
0 commit comments