Skip to content

Commit 57a9ea1

Browse files
author
Max Hniebergall
committed
add test
1 parent 4bdf9a4 commit 57a9ea1

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,20 @@ public void testTokenize() throws IOException {
9494
}
9595
}
9696

97+
/**
 * Tokenizes a string made of the two code points U+009F and U+008F followed by the letter "z",
 * and checks the token strings produced for it.
 *
 * The tokenizer is built from TEST_CASE_VOCAB and TEST_CASE_SCORES and is closed by the
 * try-with-resources statement when the test finishes.
 *
 * NOTE(review): the meaning of the individual DebertaV2Tokenization constructor arguments is not
 * visible from this test; confirm against that class before changing them.
 */
public void testTokenizeWithHiddenControlCharacters() throws IOException {
98+
try (
99+
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(
100+
TEST_CASE_VOCAB,
101+
TEST_CASE_SCORES,
102+
new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1)
103+
).build()
104+
) {
105+
// The input starts with U+009F and U+008F (non-printing C1 control code points) ahead of "z";
// only the first tokenization result is inspected.
TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0);
106+
// Expected token strings are "▁" and "z": neither control code point appears as a token of its own.
assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z"));
107+
108+
}
109+
}
110+
97111
public void testSurrogatePair() throws IOException {
98112
try (
99113
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(

0 commit comments

Comments
 (0)