Skip to content

Commit 6d5901e

Browse files
authored
Add ignore_merges option to BPE tokenizers (#716)
1 parent 6427431 commit 6d5901e

File tree

1 file changed

+9
-1
lines changed

1 file changed

+9
-1
lines changed

src/tokenizers.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,10 +630,12 @@ class BPE extends TokenizerModel {
630630
* Create a BPE instance.
631631
* @param {Object} config The configuration object for BPE.
632632
* @param {Object} config.vocab A mapping of tokens to ids.
633+
* @param {string[]} config.merges An array of BPE merges as strings.
633634
* @param {string} config.unk_token The unknown token used for out of vocabulary words.
634635
* @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
635636
* @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
636-
* @param {Array} config.merges An array of BPE merges as strings.
637+
* @param {boolean} [config.byte_fallback=false] Whether to use the SentencePiece (spm) byte-fallback trick. Defaults to `false`.
638+
* @param {boolean} [config.ignore_merges=false] Whether or not to match tokens with the vocab before using merges.
637639
*/
638640
constructor(config) {
639641
super(config);
@@ -665,6 +667,8 @@ class BPE extends TokenizerModel {
665667
this.text_encoder = new TextEncoder();
666668
}
667669

670+
this.ignore_merges = this.config.ignore_merges ?? false;
671+
668672
/** @type {Map<string, string[]>} */
669673
this.cache = new Map();
670674
}
@@ -826,6 +830,10 @@ class BPE extends TokenizerModel {
826830
const outputTokens = [];
827831

828832
for (const token of tokens) {
833+
if (this.ignore_merges && this.tokens_to_ids.has(token)) {
834+
outputTokens.push(token);
835+
continue;
836+
}
829837
const bpe_token_list = this.bpe(token);
830838

831839
for (const t of bpe_token_list) {

0 commit comments

Comments
 (0)