Skip to content

Commit 7c1bde9

Browse files
authored
Add support for Split pretokenizer w/ behavior=removed & invert=false
1 parent 7b1ce3c commit 7c1bde9

File tree

2 files changed

+12
-0
lines changed

2 files changed

+12
-0
lines changed

src/tokenizers.js

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1518,6 +1518,8 @@ class SplitPreTokenizer extends PreTokenizer {
15181518

15191519
if (this.config.invert) {
15201520
return text.match(this.pattern) || [];
1521+
} else if (this.config.behavior?.toLowerCase() === 'removed') {
1522+
return text.split(this.pattern).filter(x => x);
15211523
} else {
15221524
return regexSplit(text, this.pattern);
15231525
}

tests/models/roberta/tokenization.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,4 +691,14 @@ export const TEST_CONFIG = {
691691
decoded: "<s> \tH\u00e4LLo!how \n Are yoU? </s>",
692692
},
693693
},
694+
695+
// Split pre-tokenizer with behavior="Removed" and invert=false
696+
"onnx-community/camembertv2-base": {
697+
SIMPLE: {
698+
text: BASE_TEST_STRINGS.SIMPLE,
699+
tokens: ['How', 'are', 'you', 'doi', '##ng', '?'],
700+
ids: [1, 14473, 9556, 10577, 6471, 9274, 38, 2],
701+
decoded: "[CLS] How are you doing? [SEP]",
702+
}
703+
},
694704
};

0 commit comments

Comments
 (0)