diff --git a/libs/x-content/impl/build.gradle b/libs/x-content/impl/build.gradle index 35e122d336c68..e9290e1c4b242 100644 --- a/libs/x-content/impl/build.gradle +++ b/libs/x-content/impl/build.gradle @@ -55,3 +55,7 @@ tasks.named("thirdPartyAudit").configure { 'com.fasterxml.jackson.databind.cfg.MapperBuilder' ) } + +tasks.named("licenseHeaders").configure { + approvedLicenses = ['Apache', 'AGLP+SSPL+Elastic License', 'Generated', 'Vendored'] +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/OptimizedTextCapable.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/OptimizedTextCapable.java new file mode 100644 index 0000000000000..432f21a3365b6 --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/OptimizedTextCapable.java @@ -0,0 +1,32 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.xcontent.provider; + +import org.elasticsearch.xcontent.Text; + +import java.io.IOException; + +/** + * Indicates that a {@link com.fasterxml.jackson.core.JsonParser} is capable of + * returning the underlying UTF-8 encoded bytes of the current string token. + * This is useful for performance optimizations, as it allows the parser to + * avoid unnecessary conversions to and from strings. + */ +public interface OptimizedTextCapable { + + /** + * Method that will try to get underlying UTF-8 encoded bytes of the current string token.
+ * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null. + * + * @return the UTF-8 encoded bytes of the current string token, or {@code null} if they cannot be retrieved + * @throws IOException if the parser fails while reading the current token + */ + Text getValueAsText() throws IOException; +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/CborXContentImpl.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/CborXContentImpl.java index 690799721d9c0..063db426d87c0 100644 --- a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/CborXContentImpl.java +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/CborXContentImpl.java @@ -47,7 +47,7 @@ public static XContent cborXContent() { } static { - cborFactory = XContentImplUtils.configure(CBORFactory.builder()); + cborFactory = XContentImplUtils.configure(ESCborFactory.builder()); cborFactory.configure(CBORFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW, false); // this trips on many mappings now... // Do not automatically close unclosed objects/arrays in com.fasterxml.jackson.dataformat.cbor.CBORGenerator#close() method cborFactory.configure(JsonGenerator.Feature.AUTO_CLOSE_JSON_CONTENT, false); diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborFactory.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborFactory.java new file mode 100644 index 0000000000000..073ff476c48e1 --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborFactory.java @@ -0,0 +1,44 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements.
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.xcontent.provider.cbor; + +import com.fasterxml.jackson.core.io.IOContext; +import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer; +import com.fasterxml.jackson.dataformat.cbor.CBORFactory; +import com.fasterxml.jackson.dataformat.cbor.CBORFactoryBuilder; +import com.fasterxml.jackson.dataformat.cbor.CBORParser; + +/** + * A {@link CBORFactory} whose byte-array parsers are {@link ESCborParser} instances. + */ +public class ESCborFactory extends CBORFactory { + + /** Creates a factory without applying any builder configuration. */ + public ESCborFactory() { + super(); + } + + /** Creates a factory that is configured from the given builder. */ + protected ESCborFactory(CBORFactoryBuilder builder) { + super(builder); + } + + /** Returns a builder whose {@code build()} produces an {@link ESCborFactory}. */ + public static ESCborFactoryBuilder builder() { + return new ESCborFactoryBuilder(); + } + + /** Creates an {@link ESCborParser} over the given byte range, with no input stream attached. */ + @Override + protected CBORParser _createParser(byte[] data, int offset, int len, IOContext ctxt) { + ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures); + return new ESCborParser(ctxt, _parserFeatures, _formatParserFeatures, _objectCodec, can, null, data, offset, offset + len, false); + } +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborFactoryBuilder.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborFactoryBuilder.java new file mode 100644 index 0000000000000..76674e1471efb --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborFactoryBuilder.java @@ -0,0 +1,25 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1".
+ */ + +package org.elasticsearch.xcontent.provider.cbor; + +import com.fasterxml.jackson.dataformat.cbor.CBORFactory; +import com.fasterxml.jackson.dataformat.cbor.CBORFactoryBuilder; + +/** + * Builder that produces {@link ESCborFactory} instances. + */ +public class ESCborFactoryBuilder extends CBORFactoryBuilder { + + @Override + public CBORFactory build() { + // pass this builder on so that the settings configured on it are applied to the built factory + return new ESCborFactory(this); + } +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborParser.java new file mode 100644 index 0000000000000..a65104fa96164 --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/cbor/ESCborParser.java @@ -0,0 +1,219 @@ +/* + * Copyright Elasticsearch B.V., and/or licensed to Elasticsearch B.V. + * under one or more license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * This file is based on a modification of https://github.com/FasterXML/jackson-dataformats-binary which is licensed under the Apache 2.0 License.
+ */ + +package org.elasticsearch.xcontent.provider.cbor; + +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.ObjectCodec; +import com.fasterxml.jackson.core.io.IOContext; +import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer; +import com.fasterxml.jackson.dataformat.cbor.CBORConstants; +import com.fasterxml.jackson.dataformat.cbor.CBORParser; + +import org.elasticsearch.xcontent.Text; +import org.elasticsearch.xcontent.XContentString; +import org.elasticsearch.xcontent.provider.OptimizedTextCapable; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Locale; + +/** + * Contains code adapted from {@link CBORParser} licensed under the Apache License 2.0. + */ +public class ESCborParser extends CBORParser implements OptimizedTextCapable { + public ESCborParser( + IOContext ctxt, + int parserFeatures, + int cborFeatures, + ObjectCodec codec, + ByteQuadsCanonicalizer sym, + InputStream in, + byte[] inputBuffer, + int start, + int end, + boolean bufferRecyclable + ) { + super(ctxt, parserFeatures, cborFeatures, codec, sym, in, inputBuffer, start, end, bufferRecyclable); + } + + @Override + public Text getValueAsText() throws IOException { + JsonToken t = _currToken; + if (_tokenIncomplete) { + if (t == JsonToken.VALUE_STRING) { + return _finishAndReturnText(_typeByte); + } + } + return null; // only string tokens that are still flagged as incomplete are served + } + + private Text _finishAndReturnText(int ch) throws IOException { + final int type = ((ch >> 5) & 0x7); + ch &= 0x1F; + + // sanity check + if (type != CBORConstants.MAJOR_TYPE_TEXT) { + // should never happen, so report it as an internal error + _throwInternal(); + } + int previousPointer = _inputPtr; + + // decode the length, then rewind on every path: the token stays incomplete and a later call decodes it again from here + final int len = _decodeExplicitLength(ch); + final int start = _inputPtr; + _inputPtr = previousPointer; + if (len == 0) { + return new Text(new XContentString.UTF8Bytes(new byte[0], 0, 0), 0); + } + if (len < 0) { + // optimized text is not supported for chunked strings + return null; + } + final int available = _inputEnd - start; + if (available >= len) { + return new
Text(new XContentString.UTF8Bytes(_inputBuffer, start, len)); + } + // this is expected to be used in the context where the input stream is not available + return null; + } + + /** + * Method used to decode explicit length of a variable-length value + * (or, for indefinite/chunked, indicate that one is not known). + * Note that long (64-bit) length is only allowed if it fits in + * 32-bit signed int, for now; expectation being that longer values + * are always encoded as chunks. + */ + private int _decodeExplicitLength(int lowBits) throws IOException { + // common case, indefinite length; relies on marker + if (lowBits == 31) { + return -1; + } + if (lowBits <= 23) { + return lowBits; + } + switch (lowBits - 24) { + case 0: + return _decode8Bits(); + case 1: + return _decode16Bits(); + case 2: + return _decode32Bits(); + case 3: + long l = _decode64Bits(); + if (l < 0 || l > MAX_INT_L) { + throw _constructError("Illegal length for " + currentToken() + ": " + l); + } + return (int) l; + } + throw _constructError( + String.format( + Locale.ROOT, + "Invalid 5-bit length indicator for `JsonToken.%s`: 0x%02X; only 0x00-0x17, 0x1F allowed", + currentToken(), + lowBits + ) + ); + } + + private int _decode8Bits() throws IOException { + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + return _inputBuffer[_inputPtr++] & 0xFF; + } + + private int _decode16Bits() throws IOException { + int ptr = _inputPtr; + if ((ptr + 1) >= _inputEnd) { + return _slow16(); + } + final byte[] b = _inputBuffer; + int v = ((b[ptr] & 0xFF) << 8) + (b[ptr + 1] & 0xFF); + _inputPtr = ptr + 2; + return v; + } + + private int _slow16() throws IOException { + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + int v = (_inputBuffer[_inputPtr++] & 0xFF); + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + return (v << 8) + (_inputBuffer[_inputPtr++] & 0xFF); + } + + private int _decode32Bits() throws IOException { + int ptr
= _inputPtr; + if ((ptr + 3) >= _inputEnd) { + return _slow32(); + } + final byte[] b = _inputBuffer; + int v = (b[ptr++] << 24) + ((b[ptr++] & 0xFF) << 16) + ((b[ptr++] & 0xFF) << 8) + (b[ptr++] & 0xFF); + _inputPtr = ptr; + return v; + } + + private int _slow32() throws IOException { + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + int v = _inputBuffer[_inputPtr++]; // sign will disappear anyway + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + v = (v << 8) + (_inputBuffer[_inputPtr++] & 0xFF); + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + v = (v << 8) + (_inputBuffer[_inputPtr++] & 0xFF); + if (_inputPtr >= _inputEnd) { + loadMoreGuaranteed(); + } + return (v << 8) + (_inputBuffer[_inputPtr++] & 0xFF); + } + + private long _decode64Bits() throws IOException { + int ptr = _inputPtr; + if ((ptr + 7) >= _inputEnd) { + return _slow64(); + } + final byte[] b = _inputBuffer; + int i1 = (b[ptr++] << 24) + ((b[ptr++] & 0xFF) << 16) + ((b[ptr++] & 0xFF) << 8) + (b[ptr++] & 0xFF); + int i2 = (b[ptr++] << 24) + ((b[ptr++] & 0xFF) << 16) + ((b[ptr++] & 0xFF) << 8) + (b[ptr++] & 0xFF); + _inputPtr = ptr; + return _long(i1, i2); + } + + private long _slow64() throws IOException { + return _long(_decode32Bits(), _decode32Bits()); + } + + private static long _long(int i1, int i2) { + long l1 = i1; + long l2 = i2; + l2 = (l2 << 32) >>> 32; + return (l1 << 32) + l2; + } +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java index c4a9823d68ecf..bf33f2b3ae663 100644 --- a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java @@ -18,13 +18,14 @@ import org.elasticsearch.xcontent.Text; import
org.elasticsearch.xcontent.XContentString; +import org.elasticsearch.xcontent.provider.OptimizedTextCapable; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; -public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser { +public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser implements OptimizedTextCapable { protected int stringEnd = -1; protected int stringLength; @@ -49,6 +50,7 @@ public ESUTF8StreamJsonParser( * Method that will try to get underlying UTF-8 encoded bytes of the current string token. * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null. */ + @Override public Text getValueAsText() throws IOException { if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) { if (stringEnd > 0) { diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java index ecf83756c8c8f..af96e7a8ed34d 100644 --- a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java @@ -27,6 +27,7 @@ import org.elasticsearch.xcontent.XContentParserConfiguration; import org.elasticsearch.xcontent.XContentString; import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.provider.OptimizedTextCapable; import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl; import org.elasticsearch.xcontent.support.AbstractXContentParser; @@ -151,8 +152,8 @@ public XContentString optimizedText() throws IOException { if (parser instanceof FilteringParserDelegate delegate) { parser = delegate.delegate(); } - if (parser instanceof ESUTF8StreamJsonParser esParser) { - var bytesRef = esParser.getValueAsText(); + if (parser 
instanceof OptimizedTextCapable optimizedTextCapableParser) { + var bytesRef = optimizedTextCapableParser.getValueAsText(); if (bytesRef != null) { return bytesRef; } diff --git a/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/cbor/ESCborParserTests.java b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/cbor/ESCborParserTests.java new file mode 100644 index 0000000000000..21248a8fd1c84 --- /dev/null +++ b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/cbor/ESCborParserTests.java @@ -0,0 +1,61 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.xcontent.provider.cbor; + +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.dataformat.cbor.CBORFactory; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xcontent.Text; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.cbor.CborXContent; +import org.hamcrest.Matchers; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; + +public class ESCborParserTests extends ESTestCase { + + public void testParseText() throws IOException { + testStringValue("foo"); // ASCII only + testStringValue("føø"); // ø takes 2 bytes in UTF-8, counts as 1 char + testStringValue("f\u00F8\u00F8"); // the same string, written with Unicode escapes + testStringValue("ツ"); // 3 bytes in UTF-8, counts as 1 char + testStringValue("🐔"); // 4 bytes in UTF-8, counts as 2 chars (a surrogate pair) + testStringValue(randomUnicodeOfLengthBetween(1, 1_000)); + } + + private void testStringValue(String expected) throws IOException { + CBORFactory factory = new ESCborFactoryBuilder().build(); + assertThat(factory, Matchers.instanceOf(ESCborFactory.class)); + + ByteArrayOutputStream outputStream; + try (XContentBuilder builder = CborXContent.contentBuilder()) { + builder.map(Map.of("text", expected)); + outputStream = (ByteArrayOutputStream) builder.getOutputStream(); + } + ESCborParser parser = (ESCborParser) factory.createParser(outputStream.toByteArray()); + + assertThat(parser, Matchers.instanceOf(ESCborParser.class)); + assertThat(parser.nextToken(), equalTo(JsonToken.START_OBJECT)); + assertThat(parser.nextFieldName(), equalTo("text")); + assertThat(parser.nextToken(), equalTo(JsonToken.VALUE_STRING)); + Text text = parser.getValueAsText(); + assertThat(text.hasBytes(), equalTo(true)); + assertThat(text.stringLength(), equalTo(expected.length())); + assertThat(text.string(), equalTo(expected)); + assertThat(parser.getValueAsText().string(), equalTo(expected)); // a repeated call must return the same text + assertThat(parser.getValueAsString(),
equalTo(expected)); + assertThat(parser.nextToken(), equalTo(JsonToken.END_OBJECT)); + } +} diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java index 7a771f247a448..bd0168bbc6684 100644 --- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/Text.java @@ -95,11 +95,57 @@ public String string() { @Override public int stringLength() { if (stringLength < 0) { - stringLength = string().length(); + if (hasString()) { + stringLength = string().length(); + } else { + stringLength = countCharsUtf8(bytes()); + if (stringLength < 0) { + stringLength = string().length(); + } + } } return stringLength; } + /** + * Counts the UTF-16 code units encoded by the given UTF-8 bytes without decoding them into a string. + * Only the lead/continuation byte structure is checked; overlong or out-of-range encodings are not detected. + * + * @return the number of UTF-16 code units, or -1 if the byte structure is malformed or truncated + */ + private static int countCharsUtf8(UTF8Bytes bytes) { + final byte[] data = bytes.bytes(); + final int end = bytes.offset() + bytes.length(); + int count = 0; + int i = bytes.offset(); + while (i < end) { + final byte b = data[i]; + final int continuationBytes; + if ((b & 0x80) == 0) { + continuationBytes = 0; // 1 byte character + } else if ((b & 0xE0) == 0xC0) { + continuationBytes = 1; // 2 byte character + } else if ((b & 0xF0) == 0xE0) { + continuationBytes = 2; // 3 byte character + } else if ((b & 0xF8) == 0xF0) { + continuationBytes = 3; // 4 byte character, counted as two code units below + } else { + return -1; // stray continuation byte or invalid lead byte + } + if (continuationBytes > end - i - 1) { + return -1; // the sequence is cut off by the end of the bytes + } + for (int j = 1; j <= continuationBytes; j++) { + if ((data[i + j] & 0xC0) != 0x80) { + return -1; // lead byte is not followed by a continuation byte + } + } + count += continuationBytes == 3 ? 2 : 1; + i += continuationBytes + 1; + } + return count; + } + @Override public String toString() { return string(); diff --git a/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java b/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java index 4e637ff0248f5..a483bfd82439a 100644 --- a/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java +++ b/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java @@ -75,7 +75,7 @@ public void testStringLength() { var text = new Text(encoded); assertFalse(text.hasString()); assertEquals(stringLength, text.stringLength()); -
assertTrue(text.hasString()); + assertFalse(text.hasString()); } {