elastic · jordan-powers · Jun 4, 2025 · Jun 2, 2025 · Jun 2, 2025 · Jun 2, 2025
@@ -6,22 +6,17 @@
  * your election, the "Elastic License 2.0", the "GNU Affero General Public
  * License v3.0 only", or the "Server Side Public License, v 1".
  */
-package org.elasticsearch.common.text;
-
-import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.common.bytes.BytesArray;
-import org.elasticsearch.common.bytes.BytesReference;
-import org.elasticsearch.xcontent.ToXContentFragment;
-import org.elasticsearch.xcontent.XContentBuilder;
+package org.elasticsearch.xcontent;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 
 /**
- * Both {@link String} and {@link BytesReference} representation of the text. Starts with one of those, and if
- * the other is requests, caches the other one in a local reference so no additional conversion will be needed.
+ * Both {@link String} and {@link UTF8Bytes} representation of the text. Starts with one of those, and if
+ * the other is requested, caches the other one in a local reference so no additional conversion will be needed.
  */
-public final class Text implements Comparable<Text>, ToXContentFragment {
+public final class Text implements XContentString, Comparable<Text>, ToXContentFragment {
 
     public static final Text[] EMPTY_ARRAY = new Text[0];
 
@@ -36,31 +31,46 @@ public static Text[] convertFromStringArray(String[] strings) {
         return texts;
     }
 
-    private BytesReference bytes;
-    private String text;
+    private UTF8Bytes bytes;
+    private String string;
     private int hash;
+    private int stringLength = -1;
+
+    /**
+     * Construct a Text from encoded UTF8Bytes. Since no string length is specified, {@link #stringLength()}
+     * will perform a string conversion to measure the string length.
+     */
+    public Text(UTF8Bytes bytes) {
+        this.bytes = bytes;
+    }
 
-    public Text(BytesReference bytes) {
+    /**
+     * Construct a Text from encoded UTF8Bytes and an explicit string length. Used to avoid string conversion
+     * in {@link #stringLength()}. The provided stringLength must equal the length of the decoded string, i.e.
+     * the value {@link #stringLength()} would return for a Text created with {@link Text#Text(UTF8Bytes)}.
+     */
+    public Text(UTF8Bytes bytes, int stringLength) {
         this.bytes = bytes;
+        this.stringLength = stringLength;
     }
 
-    public Text(String text) {
-        this.text = text;
+    public Text(String string) {
+        this.string = string;
     }
 
     /**
-     * Whether a {@link BytesReference} view of the data is already materialized.
+     * Whether a {@link UTF8Bytes} view of the data is already materialized.
      */
     public boolean hasBytes() {
         return bytes != null;
     }
 
-    /**
-     * Returns a {@link BytesReference} view of the data.
-     */
-    public BytesReference bytes() {
+    @Override
+    public UTF8Bytes bytes() {
         if (bytes == null) {
-            bytes = new BytesArray(text.getBytes(StandardCharsets.UTF_8));
+            var byteBuff = StandardCharsets.UTF_8.encode(string);
+            assert byteBuff.hasArray();
+            bytes = new UTF8Bytes(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
         }
         return bytes;
     }
@@ -69,14 +79,25 @@ public BytesReference bytes() {
      * Whether a {@link String} view of the data is already materialized.
      */
     public boolean hasString() {
-        return text != null;
+        return string != null;
     }
 
-    /**
-     * Returns a {@link String} view of the data.
-     */
+    @Override
     public String string() {
-        return text == null ? bytes.utf8ToString() : text;
+        if (string == null) {
+            var byteBuff = ByteBuffer.wrap(bytes.bytes(), bytes.offset(), bytes.length());
+            string = StandardCharsets.UTF_8.decode(byteBuff).toString();
+            assert (stringLength < 0) || (string.length() == stringLength);
+        }
+        return string;
+    }
+
+    @Override
+    public int stringLength() {
+        if (stringLength < 0) {
+            stringLength = string().length();
+        }
+        return stringLength;
     }
 
     @Override
@@ -115,8 +136,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         } else {
             // TODO: TextBytesOptimization we can use a buffer here to convert it? maybe add a
             // request to jackson to support InputStream as well?
-            BytesRef br = this.bytes().toBytesRef();
-            return builder.utf8Value(br.bytes, br.offset, br.length);
+            return builder.utf8Value(bytes.bytes(), bytes.offset(), bytes.length());
         }
     }
 }
@@ -0,0 +1,61 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xcontent;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+/**
+ * A piece of text that can be viewed both as a {@link String} and as its UTF-8 encoded bytes.
+ */
+public interface XContentString {
+    /**
+     * A slice of a byte array holding UTF-8 encoded text: {@code length} bytes of {@code bytes}, starting
+     * at {@code offset}. Equality, ordering and hashing are based on the content of the slice only, so two
+     * slices over different backing arrays are equal when they cover the same byte sequence.
+     */
+    record UTF8Bytes(byte[] bytes, int offset, int length) implements Comparable<UTF8Bytes> {
+        /**
+         * Creates a slice that covers the whole of {@code bytes}.
+         */
+        public UTF8Bytes(byte[] bytes) {
+            this(bytes, 0, bytes.length);
+        }
+
+        /**
+         * Compares two slices lexicographically, treating each byte as an unsigned value.
+         * <p>
+         * The comparison has to be unsigned: the lead and continuation bytes of every non-ASCII character
+         * are negative when read as signed bytes, so a signed comparison would sort all non-ASCII text
+         * before ASCII text. Unsigned byte order of UTF-8 is the same as Unicode code point order.
+         *
+         * @return a negative value, zero, or a positive value if this slice sorts before, equal to, or
+         *         after {@code o}
+         */
+        @Override
+        public int compareTo(UTF8Bytes o) {
+            // Same backing array and same window: no need to look at the content
+            if (this.bytes == o.bytes && this.offset == o.offset && this.length == o.length) {
+                return 0;
+            }
+
+            return Arrays.compareUnsigned(bytes, offset, offset + length, o.bytes, o.offset, o.offset + o.length);
+        }
+
+        /**
+         * Returns {@code true} if {@code o} is a {@code UTF8Bytes} covering the same byte sequence,
+         * regardless of which backing array or offset either slice uses.
+         */
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+
+            UTF8Bytes other = (UTF8Bytes) o;
+            return Arrays.equals(bytes, offset, offset + length, other.bytes, other.offset, other.offset + other.length);
+        }
+
+        /**
+         * Hashes only the bytes inside the slice, which keeps the hash consistent with {@link #equals(Object)}.
+         */
+        @Override
+        public int hashCode() {
+            return ByteBuffer.wrap(bytes, offset, length).hashCode();
+        }
+    }
+
+    /**
+     * Returns a {@link String} view of the data.
+     */
+    String string();
+
+    /**
+     * Returns an encoded {@link UTF8Bytes} view of the data.
+     */
+    UTF8Bytes bytes();
+
+    /**
+     * Returns the length of the represented string in {@code char}s, i.e. the value that
+     * {@code string().length()} would report. This is not the number of encoded bytes.
+     */
+    int stringLength();
+}
@@ -0,0 +1,190 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xcontent;
+
+import org.elasticsearch.test.ESTestCase;
+
+import java.nio.charset.StandardCharsets;
+
+public class TextTests extends ESTestCase {
+
+    /**
+     * Encodes {@code value} as UTF-8 and wraps the whole resulting array.
+     */
+    private static XContentString.UTF8Bytes utf8(String value) {
+        return new XContentString.UTF8Bytes(value.getBytes(StandardCharsets.UTF_8));
+    }
+
+    /**
+     * A Text created from a String only has the string view until the bytes are requested.
+     */
+    public void testConvertToBytes() {
+        final String original = randomUnicodeOfLength(randomInt(128));
+        final XContentString.UTF8Bytes expectedBytes = utf8(original);
+
+        final Text subject = new Text(original);
+        assertTrue(subject.hasString());
+        assertFalse(subject.hasBytes());
+
+        assertEquals(original, subject.string());
+        assertEquals(expectedBytes, subject.bytes());
+
+        // Requesting the bytes materializes them and keeps the string view
+        assertTrue(subject.hasString());
+        assertTrue(subject.hasBytes());
+
+        // Both views must be unchanged after the conversion has happened
+        assertEquals(original, subject.string());
+        assertEquals(expectedBytes, subject.bytes());
+
+        // Repeated calls return the same cached instance
+        assertSame(subject.bytes(), subject.bytes());
+    }
+
+    /**
+     * A Text created from bytes only has the bytes view until the string is requested.
+     */
+    public void testConvertToString() {
+        final String original = randomUnicodeOfLength(randomInt(128));
+        final XContentString.UTF8Bytes sourceBytes = utf8(original);
+
+        final Text subject = new Text(sourceBytes);
+        assertFalse(subject.hasString());
+        assertTrue(subject.hasBytes());
+
+        assertEquals(original, subject.string());
+        assertEquals(sourceBytes, subject.bytes());
+
+        // Requesting the string materializes it and keeps the bytes view
+        assertTrue(subject.hasString());
+        assertTrue(subject.hasBytes());
+
+        // Both views must be unchanged after the conversion has happened
+        assertEquals(original, subject.string());
+        assertEquals(sourceBytes, subject.bytes());
+
+        // The instance passed to the constructor is the one handed back
+        assertSame(sourceBytes, subject.bytes());
+    }
+
+    /**
+     * stringLength() reports the same value for every construction path, and only the
+     * explicit-length constructor avoids materializing the string.
+     */
+    public void testStringLength() {
+        final int length = randomInt(128);
+        final String original = randomUnicodeOfLength(length);
+        final XContentString.UTF8Bytes sourceBytes = utf8(original);
+
+        final Text fromString = new Text(original);
+        assertTrue(fromString.hasString());
+        assertEquals(length, fromString.stringLength());
+
+        // Without a length hint the string has to be decoded to be measured
+        final Text fromBytes = new Text(sourceBytes);
+        assertFalse(fromBytes.hasString());
+        assertEquals(length, fromBytes.stringLength());
+        assertTrue(fromBytes.hasString());
+
+        // With a length hint no decoding takes place
+        final Text fromBytesWithLength = new Text(sourceBytes, length);
+        assertFalse(fromBytesWithLength.hasString());
+        assertEquals(length, fromBytesWithLength.stringLength());
+        assertFalse(fromBytesWithLength.hasString());
+    }
+
+    /**
+     * Texts with the same content are equal regardless of which representation each one starts from.
+     */
+    public void testEquals() {
+        final String original = randomUnicodeOfLength(randomInt(128));
+        final XContentString.UTF8Bytes sourceBytes = utf8(original);
+
+        assertTrue(new Text(original).equals(new Text(original)));
+        assertTrue(new Text(original).equals(new Text(sourceBytes)));
+        assertTrue(new Text(sourceBytes).equals(new Text(sourceBytes)));
+    }
+
+    /**
+     * compareTo() returns zero for the same content and otherwise has the sign of the comparison
+     * between the encoded bytes, for every mix of starting representations.
+     */
+    public void testCompareTo() {
+        final String first = randomUnicodeOfLength(randomInt(128));
+        final XContentString.UTF8Bytes firstBytes = utf8(first);
+
+        assertEquals(0, new Text(first).compareTo(new Text(first)));
+        assertEquals(0, new Text(first).compareTo(new Text(firstBytes)));
+        assertEquals(0, new Text(firstBytes).compareTo(new Text(firstBytes)));
+
+        final String second = randomUnicodeOfLength(randomInt(128));
+        final XContentString.UTF8Bytes secondBytes = utf8(second);
+
+        // Reference sign, taken from comparing the encoded forms directly
+        final int expectedSign = Integer.signum(firstBytes.compareTo(secondBytes));
+
+        assertEquals(expectedSign, Integer.signum(new Text(first).compareTo(new Text(second))));
+        assertEquals(expectedSign, Integer.signum(new Text(first).compareTo(new Text(secondBytes))));
+        assertEquals(expectedSign, Integer.signum(new Text(firstBytes).compareTo(new Text(second))));
+        assertEquals(expectedSign, Integer.signum(new Text(firstBytes).compareTo(new Text(secondBytes))));
+    }
+
+    /**
+     * Builds a Text through a randomly chosen constructor and then runs a random sequence of
+     * accessor checks against it, so that conversions happen in varying orders.
+     */
+    public void testRandomized() {
+        final int length = randomInt(128);
+        final String original = randomUnicodeOfLength(length);
+        final XContentString.UTF8Bytes expectedBytes = utf8(original);
+
+        final int constructorChoice = randomInt(2);
+        final Text subject;
+        if (constructorChoice == 0) {
+            subject = new Text(original);
+        } else if (constructorChoice == 1) {
+            subject = new Text(expectedBytes);
+        } else {
+            subject = new Text(expectedBytes, length);
+        }
+
+        for (int round = 0; round < 20; round++) {
+            final int check = randomInt(5);
+            if (check == 0) {
+                assertEquals(expectedBytes, subject.bytes());
+            } else if (check == 1) {
+                assertSame(subject.bytes(), subject.bytes());
+            } else if (check == 2) {
+                assertEquals(original, subject.string());
+            } else if (check == 3) {
+                assertEquals(original, subject.toString());
+            } else if (check == 4) {
+                assertEquals(length, subject.stringLength());
+            } else if (check == 5) {
+                assertEquals(new Text(original), subject);
+            }
+        }
+    }
+
+}
diff --git a/...om-suggester/src/main/java/org/elasticsearch/example/customsuggester/CustomSuggester.java b/...om-suggester/src/main/java/org/elasticsearch/example/customsuggester/CustomSuggester.java
@@ -11,7 +11,7 @@
 
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.util.CharsRefBuilder;
-import org.elasticsearch.common.text.Text;
+import org.elasticsearch.xcontent.Text;
 import org.elasticsearch.search.suggest.Suggest;
 import org.elasticsearch.search.suggest.Suggester;
 

diff --git a/...m-suggester/src/main/java/org/elasticsearch/example/customsuggester/CustomSuggestion.java b/...m-suggester/src/main/java/org/elasticsearch/example/customsuggester/CustomSuggestion.java
@@ -11,7 +11,7 @@
 
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
-import org.elasticsearch.common.text.Text;
+import org.elasticsearch.xcontent.Text;
 import org.elasticsearch.search.suggest.Suggest;
 import org.elasticsearch.xcontent.ParseField;
 import org.elasticsearch.xcontent.XContentBuilder;