Skip to content

Commit e1b2b0c

Browse files
maxhniebergall and alexey-ivanov-es
authored and committed
[ML] Fix deberta tokenizer bug caused by bug in normalizer (elastic#117189)
* Fix deberta tokenizer bug caused by bug in normalizer which caused offsets to be negative * Update docs/changelog/117189.yaml
1 parent 1b8afc5 commit e1b2b0c

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

docs/changelog/117189.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 117189
2+
summary: Fix deberta tokenizer bug caused by bug in normalizer
3+
area: Machine Learning
4+
type: bug
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ Reader normalize(CharSequence str) {
194194
if (charDelta < 0) {
195195
// normalised form is shorter
196196
int lastDiff = getLastCumulativeDiff();
197-
addOffCorrectMap(normalizedCharPos, lastDiff + charDelta);
197+
addOffCorrectMap(normalizedCharPos, lastDiff - charDelta);
198198
} else if (charDelta > 0) {
199199
// inserted chars, add the offset in the output stream
200200
int lastDiff = getLastCumulativeDiff();

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,20 @@ public void testTokenize() throws IOException {
9494
}
9595
}
9696

97+
public void testTokenizeWithHiddenControlCharacters() throws IOException {
98+
try (
99+
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(
100+
TEST_CASE_VOCAB,
101+
TEST_CASE_SCORES,
102+
new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1)
103+
).build()
104+
) {
105+
TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0);
106+
assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z"));
107+
108+
}
109+
}
110+
97111
public void testSurrogatePair() throws IOException {
98112
try (
99113
DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(

0 commit comments

Comments (0)