Fix DeBERTa tokenizer bug caused by a normalizer bug that made offsets negative

Max Hniebergall · Max Hniebergall · commit b2a09ed86e3d · 2024-11-20T14:39:02.000-05:00
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
@@ -194,7 +194,7 @@ Reader normalize(CharSequence str) {
                     if (charDelta < 0) {
                         // normalised form is shorter
                         int lastDiff = getLastCumulativeDiff();
-                        addOffCorrectMap(normalizedCharPos, lastDiff + charDelta);
+                        addOffCorrectMap(normalizedCharPos, lastDiff - charDelta);
                     } else if (charDelta > 0) {
                         // inserted chars, add the offset in the output stream
                         int lastDiff = getLastCumulativeDiff();