File tree Expand file tree Collapse file tree 1 file changed +9
-1
lines changed Expand file tree Collapse file tree 1 file changed +9
-1
lines changed Original file line number Diff line number Diff line change @@ -630,10 +630,12 @@ class BPE extends TokenizerModel {
630
630
* Create a BPE instance.
631
631
* @param {Object } config The configuration object for BPE.
632
632
* @param {Object } config.vocab A mapping of tokens to ids.
633
+ * @param {string[] } config.merges An array of BPE merges as strings.
633
634
* @param {string } config.unk_token The unknown token used for out of vocabulary words.
634
635
* @param {string } config.end_of_word_suffix The suffix to place at the end of each word.
635
636
* @param {string } [config.continuing_subword_suffix] The suffix to insert between words.
636
- * @param {Array } config.merges An array of BPE merges as strings.
637
+ * @param {boolean } [config.byte_fallback=false] Whether to use the SentencePiece (spm) byte-fallback trick. Defaults to `false`.
638
+ * @param {boolean } [config.ignore_merges=false] Whether to emit a token unchanged, skipping the BPE merges, when it is already present in the vocabulary. Defaults to `false`.
637
639
*/
638
640
constructor ( config ) {
639
641
super ( config ) ;
@@ -665,6 +667,8 @@ class BPE extends TokenizerModel {
665
667
this . text_encoder = new TextEncoder ( ) ;
666
668
}
667
669
670
+ this . ignore_merges = this . config . ignore_merges ?? false ;
671
+
668
672
/** @type {Map<string, string[]> } */
669
673
this . cache = new Map ( ) ;
670
674
}
@@ -826,6 +830,10 @@ class BPE extends TokenizerModel {
826
830
const outputTokens = [ ] ;
827
831
828
832
for ( const token of tokens ) {
833
+ if ( this . ignore_merges && this . tokens_to_ids . has ( token ) ) {
834
+ outputTokens . push ( token ) ;
835
+ continue ;
836
+ }
829
837
const bpe_token_list = this . bpe ( token ) ;
830
838
831
839
for ( const t of bpe_token_list ) {
You can’t perform that action at this time.
0 commit comments