Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,17 @@
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
package org.elasticsearch.common.text;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.XContentBuilder;
package org.elasticsearch.xcontent;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

/**
* Both {@link String} and {@link BytesReference} representation of the text. Starts with one of those, and if
* the other is requests, caches the other one in a local reference so no additional conversion will be needed.
* Both {@link String} and {@link UTF8Bytes} representation of the text. Starts with one of those, and if
* the other is requested, caches the other one in a local reference so no additional conversion will be needed.
*/
public final class Text implements Comparable<Text>, ToXContentFragment {
public final class Text implements XContentString, Comparable<Text>, ToXContentFragment {

public static final Text[] EMPTY_ARRAY = new Text[0];

Expand All @@ -36,31 +31,46 @@ public static Text[] convertFromStringArray(String[] strings) {
return texts;
}

private BytesReference bytes;
private String text;
private UTF8Bytes bytes;
private String string;
private int hash;
private int stringLength = -1;

/**
* Construct a Text from encoded UTF8Bytes. Since no string length is specified, {@link #stringLength()}
* will perform a string conversion to measure the string length.
*
* @param bytes the UTF-8 encoded text; the instance is stored by reference, not copied
*/
public Text(UTF8Bytes bytes) {
this.bytes = bytes;
}

public Text(BytesReference bytes) {
/**
* Construct a Text from encoded UTF8Bytes and an explicit string length, so that {@link #stringLength()}
* can answer without first converting the bytes to a {@link String}.
*
* The provided stringLength must equal the length of the string the bytes decode to, i.e. the value
* {@link #stringLength()} would compute for a Text created with {@link Text#Text(UTF8Bytes)}. The value is
* not validated here: {@link #string()} only checks it with an {@code assert} once the bytes are decoded.
* A negative value is treated as "unknown" and causes {@link #stringLength()} to measure the decoded string.
*
* @param bytes the UTF-8 encoded text; the instance is stored by reference, not copied
* @param stringLength the length of the decoded string as reported by {@link String#length()}
*/
public Text(UTF8Bytes bytes, int stringLength) {
this.bytes = bytes;
this.stringLength = stringLength;
}

public Text(String text) {
this.text = text;
public Text(String string) {
this.string = string;
}

/**
* Whether a {@link BytesReference} view of the data is already materialized.
* Whether an {@link UTF8Bytes} view of the data is already materialized.
*/
public boolean hasBytes() {
return bytes != null;
}

/**
* Returns a {@link BytesReference} view of the data.
*/
public BytesReference bytes() {
@Override
public UTF8Bytes bytes() {
if (bytes == null) {
bytes = new BytesArray(text.getBytes(StandardCharsets.UTF_8));
var byteBuff = StandardCharsets.UTF_8.encode(string);
assert byteBuff.hasArray();
bytes = new UTF8Bytes(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
}
return bytes;
}
Expand All @@ -69,14 +79,25 @@ public BytesReference bytes() {
* Whether a {@link String} view of the data is already materialized.
*/
public boolean hasString() {
return text != null;
return string != null;
}

/**
* Returns a {@link String} view of the data.
*/
@Override
public String string() {
return text == null ? bytes.utf8ToString() : text;
if (string == null) {
var byteBuff = ByteBuffer.wrap(bytes.bytes(), bytes.offset(), bytes.length());
string = StandardCharsets.UTF_8.decode(byteBuff).toString();
assert (stringLength < 0) || (string.length() == stringLength);
}
return string;
}

/**
 * Returns the length of the string view, as reported by {@link String#length()}.
 * A negative cached value means the length is not known yet; in that case the string view is
 * obtained through {@link #string()} and its length is remembered for later calls.
 */
@Override
public int stringLength() {
    if (stringLength >= 0) {
        // Already known, either supplied at construction or measured by an earlier call.
        return stringLength;
    }
    stringLength = string().length();
    return stringLength;
}

@Override
Expand Down Expand Up @@ -115,8 +136,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
} else {
// TODO: TextBytesOptimization we can use a buffer here to convert it? maybe add a
// request to jackson to support InputStream as well?
BytesRef br = this.bytes().toBytesRef();
return builder.utf8Value(br.bytes, br.offset, br.length);
return builder.utf8Value(bytes.bytes(), bytes.offset(), bytes.length());
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent;

import java.nio.ByteBuffer;

public interface XContentString {
record UTF8Bytes(byte[] bytes, int offset, int length) implements Comparable<UTF8Bytes> {
/**
* Creates a view that covers the whole array, i.e. offset 0 and length {@code bytes.length}.
*
* @param bytes the backing array; it is referenced as-is, not copied
*/
public UTF8Bytes(byte[] bytes) {
this(bytes, 0, bytes.length);
}

@Override
public int compareTo(UTF8Bytes o) {
if (this.bytes == o.bytes && this.offset == o.offset && this.length == o.length) {
return 0;
}

return ByteBuffer.wrap(bytes, offset, length).compareTo(ByteBuffer.wrap(o.bytes, o.offset, o.length));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Random thought: Is there any value in peeling off a fast-path case for when all the fields are identical by ==? Not sure whether this ever actually happens, but it would avoid two object allocations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense to me, especially since equals() delegates to compareTo(), so this could happen fairly frequently.

Copy link
Contributor

@prdoyle prdoyle Jun 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, essentially you're making sure you're no slower than the built-in Record.equals() for the case that it returns true. Could be a little slower for the false case.

}

/**
 * Two instances are equal when {@link #compareTo(UTF8Bytes)} reports no difference between them.
 * The backing arrays do not have to be the same object.
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    // A record is implicitly final, so an instanceof test accepts exactly the same objects as
    // comparing getClass(); null is rejected by instanceof as well.
    return o instanceof UTF8Bytes other && compareTo(other) == 0;
}

/**
 * Hashes the {@code offset}/{@code length} window of the backing array, i.e. the same range that
 * {@code compareTo} (and therefore {@code equals}) looks at.
 */
@Override
public int hashCode() {
    ByteBuffer window = ByteBuffer.wrap(bytes, offset, length);
    return window.hashCode();
}
}

/**
* Returns a {@link String} view of the data.
*/
String string();

/**
* Returns an encoded {@link UTF8Bytes} view of the data.
*/
UTF8Bytes bytes();

/**
* Returns the number of characters in the represented string.
*/
int stringLength();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent;

import org.elasticsearch.test.ESTestCase;

import java.nio.charset.StandardCharsets;

public class TextTests extends ESTestCase {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎉

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a pretty good test suite!

Optional: Perhaps consider some tests that use randomization to do a sequence of operations (like string(), stringLength(), bytes()) and assert that the result is right?

This is probably not necessary, since you've already added regression tests for the sequences that bit us before.

public void testConvertToBytes() {
String value = randomUnicodeOfLength(randomInt(128));
byte[] encodedArr = value.getBytes(StandardCharsets.UTF_8);
var encoded = new XContentString.UTF8Bytes(encodedArr);

var text = new Text(value);
assertTrue(text.hasString());
assertFalse(text.hasBytes());

assertEquals(value, text.string());
assertEquals(encoded, text.bytes());

assertTrue(text.hasString());
assertTrue(text.hasBytes());

// Ensure the conversion didn't mess up subsequent calls
assertEquals(value, text.string());
assertEquals(encoded, text.bytes());

assertSame(text.bytes(), text.bytes());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tricky!

}

public void testConvertToString() {
String value = randomUnicodeOfLength(randomInt(128));
byte[] encodedArr = value.getBytes(StandardCharsets.UTF_8);
var encoded = new XContentString.UTF8Bytes(encodedArr);

var text = new Text(encoded);
assertFalse(text.hasString());
assertTrue(text.hasBytes());

assertEquals(value, text.string());
assertEquals(encoded, text.bytes());

assertTrue(text.hasString());
assertTrue(text.hasBytes());

// Ensure the conversion didn't mess up subsequent calls
assertEquals(value, text.string());
assertEquals(encoded, text.bytes());

assertSame(encoded, text.bytes());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably we could just change all prior assertEquals on encoded to be assertSame instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to keep the assertEquals to verify the equals() method. I could see a situation where equals() is wrong or messes up the internal state somehow.

}

/**
 * Checks {@code stringLength()} for each of the three ways a Text can be constructed, and whether
 * answering it materializes the string view.
 */
public void testStringLength() {
    int expectedLength = randomInt(128);
    String value = randomUnicodeOfLength(expectedLength);
    var encoded = new XContentString.UTF8Bytes(value.getBytes(StandardCharsets.UTF_8));

    // Built from a String: the string view exists from the start.
    var fromString = new Text(value);
    assertTrue(fromString.hasString());
    assertEquals(expectedLength, fromString.stringLength());

    // Built from bytes only: measuring the length materializes the string view.
    var fromBytes = new Text(encoded);
    assertFalse(fromBytes.hasString());
    assertEquals(expectedLength, fromBytes.stringLength());
    assertTrue(fromBytes.hasString());

    // Built from bytes plus an explicit length: the length is answered without a string view.
    var fromBytesAndLength = new Text(encoded, expectedLength);
    assertFalse(fromBytesAndLength.hasString());
    assertEquals(expectedLength, fromBytesAndLength.stringLength());
    assertFalse(fromBytesAndLength.hasString());
}

/**
 * Checks that Text instances holding the same content are equal, whichever representation each
 * side was constructed from.
 */
public void testEquals() {
    String value = randomUnicodeOfLength(randomInt(128));
    var encoded = new XContentString.UTF8Bytes(value.getBytes(StandardCharsets.UTF_8));

    // string-backed vs. string-backed
    assertTrue(new Text(value).equals(new Text(value)));

    // string-backed vs. bytes-backed
    assertTrue(new Text(value).equals(new Text(encoded)));

    // bytes-backed vs. bytes-backed
    assertTrue(new Text(encoded).equals(new Text(encoded)));
}

/**
 * Checks that comparing Text instances gives the same result for every combination of
 * string-backed and bytes-backed operands: zero for identical content, and otherwise the sign
 * obtained by comparing the encoded bytes directly.
 */
public void testCompareTo() {
    String firstValue = randomUnicodeOfLength(randomInt(128));
    var firstEncoded = new XContentString.UTF8Bytes(firstValue.getBytes(StandardCharsets.UTF_8));

    // Identical content must compare as equal for every pairing of representations.
    assertEquals(0, new Text(firstValue).compareTo(new Text(firstValue)));
    assertEquals(0, new Text(firstValue).compareTo(new Text(firstEncoded)));
    assertEquals(0, new Text(firstEncoded).compareTo(new Text(firstEncoded)));

    String secondValue = randomUnicodeOfLength(randomInt(128));
    var secondEncoded = new XContentString.UTF8Bytes(secondValue.getBytes(StandardCharsets.UTF_8));

    // The expected ordering is whatever the encoded bytes themselves report.
    int expectedSign = Integer.signum(firstEncoded.compareTo(secondEncoded));

    // string vs. string
    assertEquals(expectedSign, Integer.signum(new Text(firstValue).compareTo(new Text(secondValue))));

    // string vs. bytes
    assertEquals(expectedSign, Integer.signum(new Text(firstValue).compareTo(new Text(secondEncoded))));

    // bytes vs. string
    assertEquals(expectedSign, Integer.signum(new Text(firstEncoded).compareTo(new Text(secondValue))));

    // bytes vs. bytes
    assertEquals(expectedSign, Integer.signum(new Text(firstEncoded).compareTo(new Text(secondEncoded))));
}

/**
 * Builds a Text through a randomly chosen constructor and then runs a random sequence of accessor
 * checks against it, so that no ordering of calls leaves the instance in a wrong state.
 */
public void testRandomized() {
    int expectedLength = randomInt(128);
    String value = randomUnicodeOfLength(expectedLength);
    var encoded = new XContentString.UTF8Bytes(value.getBytes(StandardCharsets.UTF_8));

    // Pick one of the three constructors.
    final Text text;
    int constructorChoice = randomInt(2);
    if (constructorChoice == 0) {
        text = new Text(value);
    } else if (constructorChoice == 1) {
        text = new Text(encoded);
    } else {
        text = new Text(encoded, expectedLength);
    }

    for (int step = 0; step < 20; step++) {
        int operation = randomInt(5);
        switch (operation) {
            case 0:
                assertEquals(encoded, text.bytes());
                break;
            case 1:
                assertSame(text.bytes(), text.bytes());
                break;
            case 2:
                assertEquals(value, text.string());
                break;
            case 3:
                assertEquals(value, text.toString());
                break;
            case 4:
                assertEquals(expectedLength, text.stringLength());
                break;
            case 5:
                assertEquals(new Text(value), text);
                break;
        }
    }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.xcontent.Text;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.Suggester;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.xcontent.Text;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.XContentBuilder;
Expand Down
Loading