Skip to content

Commit e1b2b0c

Browse files
maxhniebergall and alexey-ivanov-es
authored and committed
[ML] Fix deberta tokenizer bug caused by bug in normalizer (elastic#117189)
* Fix deberta tokenizer bug caused by bug in normalizer which caused offsets to be negative * Update docs/changelog/117189.yaml
1 parent 1b8afc5 commit e1b2b0c

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

docs/changelog/117189.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 117189
2+
summary: Fix deberta tokenizer bug caused by bug in normalizer
3+
area: Machine Learning
4+
type: bug
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ Reader normalize(CharSequence str) {
194194
if (charDelta < 0) {
195195
// normalised form is shorter
196196
int lastDiff = getLastCumulativeDiff();
197-
addOffCorrectMap(normalizedCharPos, lastDiff + charDelta);
197+
addOffCorrectMap(normalizedCharPos, lastDiff - charDelta);
198198
} else if (charDelta > 0) {
199199
// inserted chars, add the offset in the output stream
200200
int lastDiff = getLastCumulativeDiff();

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,20 @@ public void testTokenize() throws IOException {
9494
}
9595
}
9696

97+
public void testTokenizeWithHiddenControlCharacters() throws IOException {
98+
try (
99+
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(
100+
TEST_CASE_VOCAB,
101+
TEST_CASE_SCORES,
102+
new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1)
103+
).build()
104+
) {
105+
TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0);
106+
assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z"));
107+
108+
}
109+
}
110+
97111
public void testSurrogatePair() throws IOException {
98112
try (
99113
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(

0 commit comments

Comments (0)