# Patch overview (commentary only; everything from the first "diff --git" header onward is unchanged).
#
# This patch touches three files:
#
# 1. docs/changelog/117189.yaml (new file)
#    Adds a changelog entry for PR 117189: type "bug", area "Machine Learning", no linked issues.
#
# 2. x-pack/.../tokenizers/PrecompiledCharMapNormalizer.java
#    In normalize(CharSequence), inside the `charDelta < 0` branch (commented "normalised form is
#    shorter"), the cumulative diff passed to addOffCorrectMap changes from `lastDiff + charDelta`
#    to `lastDiff - charDelta`. Because charDelta is negative in that branch, the recorded value is
#    now lastDiff plus the magnitude of charDelta, where it was previously lastDiff minus that magnitude.
#    NOTE(review): presumably the cumulative diff must grow when normalisation removes characters so
#    that output offsets map back to later input offsets -- confirm against addOffCorrectMap's contract.
#
# 3. x-pack/.../tokenizers/DebertaV2TokenizerTests.java
#    Adds testTokenizeWithHiddenControlCharacters, which builds a DebertaV2Tokenizer from
#    TEST_CASE_VOCAB / TEST_CASE_SCORES, tokenizes the input "\u009F\u008Fz" (two C1 control
#    characters followed by 'z') and asserts that the first result's token strings are "▁" and "z".
#
# NOTE(review): in this copy the patch's line breaks are collapsed onto two physical lines, and the
# TEST_CASE_SCORES line shows no leading '+' marker; presumably both are artifacts of how the patch
# was stored -- restore the original line structure before trying to apply it.
diff --git a/docs/changelog/117189.yaml b/docs/changelog/117189.yaml new file mode 100644 index 0000000000000..e89c2d81506d9 --- /dev/null +++ b/docs/changelog/117189.yaml @@ -0,0 +1,5 @@ +pr: 117189 +summary: Fix deberta tokenizer bug caused by bug in normalizer +area: Machine Learning +type: bug +issues: [] diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java index bbe5bea691c35..5dd7dbbffaa61 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java @@ -194,7 +194,7 @@ Reader normalize(CharSequence str) { if (charDelta < 0) { // normalised form is shorter int lastDiff = getLastCumulativeDiff(); - addOffCorrectMap(normalizedCharPos, lastDiff + charDelta); + addOffCorrectMap(normalizedCharPos, lastDiff - charDelta); } else if (charDelta > 0) { // inserted chars, add the offset in the output stream int lastDiff = getLastCumulativeDiff(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java index bbe509da67452..a8461de8630ae 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java @@ -94,6 +94,20 @@ public void testTokenize() throws IOException { } } + public void testTokenizeWithHiddenControlCharacters() throws IOException { + try ( + DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder( + TEST_CASE_VOCAB, 
TEST_CASE_SCORES, + new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1) + ).build() + ) { + TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0); + assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z")); + + } + } + public void testSurrogatePair() throws IOException { try ( DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(