Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
5851594
Reset the last optimised value when stringEnd gets reset too.
gmarouli Sep 22, 2025
8552375
Update docs/changelog/135184.yaml
gmarouli Sep 22, 2025
add871b
Revert "Reset the last optimised value when stringEnd gets reset too."
gmarouli Sep 23, 2025
c9844d1
Use qualified import
gmarouli Sep 23, 2025
0048c84
Add unit test that captures the issue.
gmarouli Sep 23, 2025
57d4c8f
[CI] Update transport version definitions
Sep 23, 2025
3402bc6
Reset the last value properly
gmarouli Sep 23, 2025
4b71f8c
[CI] Update transport version definitions
Sep 23, 2025
222a251
Add comments
gmarouli Sep 23, 2025
3be8275
Update docs/changelog/135184.yaml
gmarouli Sep 24, 2025
34ddd56
Merge branch 'main' into bug-fix-utf8-optimiser
gmarouli Sep 24, 2025
482c33e
Add getText() also in ESCborParserTests
gmarouli Sep 24, 2025
964ef6d
Merge branch 'main' into bug-fix-utf8-optimiser
gmarouli Sep 25, 2025
9a39f67
Add randomised test that compares an optimised and a baseline parseAr…
gmarouli Sep 25, 2025
1a00f09
Revert adding to importing org.hamcrest.Matchers
gmarouli Sep 25, 2025
5caf1f6
Revert adding to importing org.hamcrest.Matchers and remove not neede…
gmarouli Sep 25, 2025
f3e75d1
Delete docs/changelog/135184.yaml
gmarouli Sep 25, 2025
8f5f178
Use `mapOrdered` to simplify the assertion
gmarouli Sep 29, 2025
3cf40d3
Add yaml test
gmarouli Sep 29, 2025
23f6a11
Make `text()` also call `optimisedText()` some times
gmarouli Sep 29, 2025
850c4ac
Merge branch 'main' into bug-fix-utf8-optimiser
gmarouli Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/135184.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 135184
summary: "Bug fix, last optimised value in `ESUTF8StreamJsonParser` kept old value"
area: Infra/Core
type: bug
issues:
- 135256
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
import java.util.ArrayList;
import java.util.List;

/**
* Provides the method getValueAsText that is a best-effort optimization for UTF8 fields. If the
* {@link UTF8StreamJsonParser} has already parsed the text, then the caller should fall back to getText.
* This is sufficient because when we call getText, jackson stores the parsed UTF-16 value for future calls.
* Once we've already got the parsed UTF-16 value, the optimization isn't necessary.
*/
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser implements OptimizedTextCapable {
protected int stringEnd = -1;
protected int stringLength;
Expand Down Expand Up @@ -53,6 +59,7 @@ public ESUTF8StreamJsonParser(
*/
@Override
public Text getValueAsText() throws IOException {
// _tokenIncomplete is true when UTF8StreamJsonParser has already processed this value.
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
if (lastOptimisedValue != null) {
return new Text(new XContentString.UTF8Bytes(lastOptimisedValue), stringLength);
Expand Down Expand Up @@ -148,33 +155,33 @@ protected Text _finishAndReturnText() throws IOException {

@Override
public JsonToken nextToken() throws IOException {
maybeResetCurrentTokenState();
stringEnd = -1;
resetCurrentTokenState();
return super.nextToken();
}

@Override
public boolean nextFieldName(SerializableString str) throws IOException {
maybeResetCurrentTokenState();
stringEnd = -1;
resetCurrentTokenState();
return super.nextFieldName(str);
}

@Override
public String nextFieldName() throws IOException {
maybeResetCurrentTokenState();
stringEnd = -1;
resetCurrentTokenState();
return super.nextFieldName();
}

/**
* Resets the current token state before moving to the next.
* Resets the current token state before moving to the next. It resets the _inputPtr and the
* _tokenIncomplete only if {@link UTF8StreamJsonParser#getText()} or {@link UTF8StreamJsonParser#getValueAsString()}
* hasn't run yet.
*/
private void maybeResetCurrentTokenState() {
private void resetCurrentTokenState() {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
_inputPtr = stringEnd;
_tokenIncomplete = false;
lastOptimisedValue = null;
}
lastOptimisedValue = null;
stringEnd = -1;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.exc.InputCoercionException;
import com.fasterxml.jackson.core.exc.StreamConstraintsException;
import com.fasterxml.jackson.core.filter.FilteringParserDelegate;
import com.fasterxml.jackson.core.io.JsonEOFException;

import org.elasticsearch.core.IOUtils;
Expand All @@ -26,6 +27,7 @@
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentString;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xcontent.provider.OptimizedTextCapable;
import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl;
import org.elasticsearch.xcontent.support.AbstractXContentParser;

Expand Down Expand Up @@ -143,7 +145,19 @@ public String text() throws IOException {

@Override
public XContentString optimizedText() throws IOException {
// TODO: enable utf-8 parsing optimization once verified it is completely safe
if (currentToken().isValue() == false) {
throwOnNoText();
}
var parser = this.parser;
if (parser instanceof FilteringParserDelegate delegate) {
parser = delegate.delegate();
}
if (parser instanceof OptimizedTextCapable optimizedTextCapableParser) {
var bytesRef = optimizedTextCapableParser.getValueAsText();
if (bytesRef != null) {
return bytesRef;
}
}
return new Text(text());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
import java.nio.charset.StandardCharsets;
import java.util.Locale;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;

public class ESUTF8StreamJsonParserTests extends ESTestCase {

private void testParseJson(String input, CheckedConsumer<ESUTF8StreamJsonParser, IOException> test) throws IOException {
Expand All @@ -34,35 +38,35 @@ private void testParseJson(String input, CheckedConsumer<ESUTF8StreamJsonParser,
}

private void assertTextRef(XContentString.UTF8Bytes textRef, String expectedValue) {
assertThat(textRef, Matchers.equalTo(new XContentString.UTF8Bytes(expectedValue.getBytes(StandardCharsets.UTF_8))));
assertThat(textRef, equalTo(new XContentString.UTF8Bytes(expectedValue.getBytes(StandardCharsets.UTF_8))));
}

public void testGetValueAsText() throws IOException {
testParseJson("{\"foo\": \"bar\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("foo"));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());

var bytes = text.bytes();
assertThat(bytes.offset(), Matchers.equalTo(9));
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12));
assertThat(bytes.offset(), equalTo(9));
assertThat(bytes.offset() + bytes.length(), equalTo(12));
assertTextRef(bytes, "bar");

assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
assertThat(parser.getValueAsString(), equalTo("bar"));
assertThat(parser.getValueAsText(), Matchers.nullValue());

assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
assertThat(parser.nextToken(), equalTo(JsonToken.END_OBJECT));
});

testParseJson("{\"foo\": [\"bar\\\"baz\\\"\", \"foobar\"]}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("foo"));

assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.START_ARRAY));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextValue(), equalTo(JsonToken.START_ARRAY));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

var firstText = parser.getValueAsText();
assertThat(firstText, Matchers.notNullValue());
Expand All @@ -73,89 +77,110 @@ public void testGetValueAsText() throws IOException {
assertTextRef(firstText.bytes(), "bar\"baz\"");

// Ensure values lastOptimisedValue is reset
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));
var secondTest = parser.getValueAsText();
assertThat(secondTest, Matchers.notNullValue());
assertTextRef(secondTest.bytes(), "foobar");
secondTest = parser.getValueAsText();
assertThat(secondTest, Matchers.notNullValue());
assertTextRef(secondTest.bytes(), "foobar");
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.END_ARRAY));
assertThat(parser.nextValue(), equalTo(JsonToken.END_ARRAY));
});

testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("foo"));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
assertThat(parser.getValueAsString(), equalTo("bår"));
});

testParseJson("{\"foo\": \"\uD83D\uDE0A\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("foo"));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
var bytes = text.bytes();
assertTextRef(bytes, "\uD83D\uDE0A");
assertThat(text.stringLength(), Matchers.equalTo(2));
assertThat(text.stringLength(), equalTo(2));
});

testParseJson("{\"foo\": \"bår\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("foo"));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());

var bytes = text.bytes();
assertThat(bytes.offset(), Matchers.equalTo(9));
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13));
assertThat(bytes.offset(), equalTo(9));
assertThat(bytes.offset() + bytes.length(), equalTo(13));
assertTextRef(bytes, "bår");

assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
assertThat(parser.getValueAsString(), equalTo("bår"));

assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
assertThat(parser.nextToken(), equalTo(JsonToken.END_OBJECT));
});

testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.START_ARRAY));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("foo"));
assertThat(parser.nextValue(), equalTo(JsonToken.START_ARRAY));

assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));
{
var textRef = parser.getValueAsText().bytes();
assertThat(textRef, Matchers.notNullValue());
assertThat(textRef.offset(), Matchers.equalTo(10));
assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(15));
assertThat(textRef.offset(), equalTo(10));
assertThat(textRef.offset() + textRef.length(), equalTo(15));
assertTextRef(textRef, "lorem");
}

assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));
{
var textRef = parser.getValueAsText().bytes();
assertThat(textRef, Matchers.notNullValue());
assertThat(textRef.offset(), Matchers.equalTo(19));
assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(24));
assertThat(textRef.offset(), equalTo(19));
assertThat(textRef.offset() + textRef.length(), equalTo(24));
assertTextRef(textRef, "ipsum");
}

assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));
{
var textRef = parser.getValueAsText().bytes();
assertThat(textRef, Matchers.notNullValue());
assertThat(textRef.offset(), Matchers.equalTo(28));
assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(33));
assertThat(textRef.offset(), equalTo(28));
assertThat(textRef.offset() + textRef.length(), equalTo(33));
assertTextRef(textRef, "dolor");
}

assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_ARRAY));
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
assertThat(parser.nextToken(), equalTo(JsonToken.END_ARRAY));
assertThat(parser.nextToken(), equalTo(JsonToken.END_OBJECT));
});
}

public void testOptimisedValueAndText() throws IOException {
testParseJson("{\"my-funky-field\": \"14`\\\\%+\"}", parser -> {
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), equalTo("my-funky-field"));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());

assertTextRef(text.bytes(), "14`\\%+");
// Retrieve the value for a second time to ensure the last value is available
assertThat(parser.getText(), equalTo("14`\\%+"));
// After retrieving with getText() we do not use the optimised value even if it's there.
assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.lastOptimisedValue, notNullValue());

assertThat(parser.nextToken(), equalTo(JsonToken.END_OBJECT));
assertThat(parser.lastOptimisedValue, nullValue());
});
}

Expand Down Expand Up @@ -238,7 +263,7 @@ private TestInput buildRandomInput(int length) {
return new TestInput(input.toString(), result.toString(), doesSupportOptimized);
}

public void testGetValueRandomized() throws IOException {
public void testGetValueAndTextRandomized() throws IOException {
StringBuilder inputBuilder = new StringBuilder();
inputBuilder.append('{');

Expand All @@ -262,27 +287,32 @@ public void testGetValueRandomized() throws IOException {

inputBuilder.append('}');
testParseJson(inputBuilder.toString(), parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT));
for (int i = 0; i < numKeys; i++) {
assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i]));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.nextFieldName(), equalTo(keys[i]));
assertThat(parser.nextValue(), equalTo(JsonToken.VALUE_STRING));

String currVal = inputs[i].result();
if (inputs[i].supportsOptimized()) {
var text = parser.getValueAsText();
assertTextRef(text.bytes(), currVal);
assertThat(text.stringLength(), Matchers.equalTo(currVal.length()));

// Retrieve it twice to ensure it works as expected
assertThat(text.stringLength(), equalTo(currVal.length()));
// Retrieve it a second time
text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
assertTextRef(text.bytes(), currVal);
assertThat(text.stringLength(), Matchers.equalTo(currVal.length()));
assertThat(text.stringLength(), equalTo(currVal.length()));
// Use getText()
assertThat(parser.getText(), equalTo(text.string()));
// After retrieving it with getText() we do not use the optimised value anymore.
assertThat(parser.getValueAsText(), Matchers.nullValue());
} else {
assertThat(parser.getText(), Matchers.notNullValue());
assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.getValueAsString(), Matchers.equalTo(currVal));
assertThat(parser.getValueAsString(), equalTo(currVal));
// Retrieve it twice to ensure it works as expected
assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.getValueAsString(), Matchers.equalTo(currVal));
assertThat(parser.getValueAsString(), equalTo(currVal));
}
}
});
Expand Down
2 changes: 1 addition & 1 deletion server/src/main/resources/transport/upper_bounds/9.2.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
index_request_include_tsid,9167000
inference_api_openai_embeddings_headers,9169000
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ public void testOptimizedTextHasBytes() throws Exception {
assertSame(XContentParser.Token.FIELD_NAME, parser.nextToken());
assertTrue(parser.nextToken().isValue());
Text text = (Text) parser.optimizedText();
// TODO: uncomment after utf8 optimized parsing has been enabled again:
// assertTrue(text.hasBytes());
assertTrue(text.hasBytes());
assertThat(text.string(), equalTo("foo"));
}
}
Expand Down