Skip to content

Commit 9ebe95a

Browse files
Better sizing BytesRef for Strings in Queries (#115655)
* Better sizing BytesRefs for Strings in Queries * Update docs/changelog/115655.yaml * iter * added test * iter * extracted method * iter --------- Co-authored-by: Elastic Machine <[email protected]>
1 parent 81fd1de commit 9ebe95a

File tree

4 files changed

+53
-7
lines changed

4 files changed

+53
-7
lines changed

docs/changelog/115655.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 115655
2+
summary: Better sizing `BytesRef` for Strings in Queries
3+
area: Search
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import org.apache.lucene.index.IndexWriter;
1313
import org.apache.lucene.util.BytesRef;
14+
import org.apache.lucene.util.UnicodeUtil;
1415

1516
public class BytesRefs {
1617

@@ -56,6 +57,25 @@ public static BytesRef checkIndexableLength(BytesRef input) {
5657
return input;
5758
}
5859

60+
/**
61+
* Converts a given string to a {@link BytesRef} object with an exactly sized byte array.
62+
* <p>
63+
* This method alternative method to the standard {@link BytesRef} constructor's allocates the
64+
* exact byte array size needed for the string. This is done by parsing the UTF-16 string two
65+
* times the first to estimate the array length and the second to copy the string value inside
66+
* the array.
67+
* </p>
68+
*
69+
* @param s the input string to convert
70+
* @return a BytesRef object representing the input string
71+
*/
72+
public static BytesRef toExactSizedBytesRef(String s) {
73+
int l = s.length();
74+
byte[] b = new byte[UnicodeUtil.calcUTF16toUTF8Length(s, 0, l)];
75+
UnicodeUtil.UTF16toUTF8(s, 0, l, b);
76+
return new BytesRef(b, 0, b.length);
77+
}
78+
5979
/**
6080
* Produces a UTF-string prefix of the input BytesRef. If the prefix cutoff would produce
6181
* ill-formed UTF, it falls back to the hexadecimal representation.
@@ -70,5 +90,4 @@ private static String safeStringPrefix(BytesRef input, int prefixLength) {
7090
return prefix.toString();
7191
}
7292
}
73-
7493
}

server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -216,12 +216,12 @@ public final int hashCode() {
216216
* @return the same input object or a {@link BytesRef} representation if input was of type string
217217
*/
218218
static Object maybeConvertToBytesRef(Object obj) {
219-
if (obj instanceof String) {
220-
return BytesRefs.checkIndexableLength(BytesRefs.toBytesRef(obj));
221-
} else if (obj instanceof CharBuffer) {
222-
return BytesRefs.checkIndexableLength(new BytesRef((CharBuffer) obj));
223-
} else if (obj instanceof BigInteger) {
224-
return BytesRefs.toBytesRef(obj);
219+
if (obj instanceof String v) {
220+
return BytesRefs.checkIndexableLength(BytesRefs.toExactSizedBytesRef(v));
221+
} else if (obj instanceof CharBuffer v) {
222+
return BytesRefs.checkIndexableLength(new BytesRef(v));
223+
} else if (obj instanceof BigInteger v) {
224+
return BytesRefs.toBytesRef(v);
225225
}
226226
return obj;
227227
}

server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
package org.elasticsearch.index.query;
1111

1212
import org.apache.lucene.index.IndexWriter;
13+
import org.apache.lucene.util.BytesRef;
1314
import org.elasticsearch.common.ParsingException;
1415
import org.elasticsearch.common.settings.Settings;
1516
import org.elasticsearch.search.SearchModule;
@@ -93,4 +94,25 @@ public void testMaybeConvertToBytesRefLongTerm() {
9394
assertThat(e.getMessage(), containsString("term starting with [aaaaa"));
9495
}
9596

97+
public void testMaybeConvertToBytesRefStringCorrectSize() {
98+
int capacity = randomIntBetween(20, 40);
99+
StringBuilder termBuilder = new StringBuilder(capacity);
100+
int correctSize = 0;
101+
for (int i = 0; i < capacity; i++) {
102+
if (i < capacity / 3) {
103+
termBuilder.append((char) randomIntBetween(0, 128));
104+
++correctSize; // use only one byte for char < 128
105+
} else if (i < 2 * capacity / 3) {
106+
termBuilder.append((char) randomIntBetween(128, 2048));
107+
correctSize += 2; // use two bytes for char < 2048
108+
} else {
109+
termBuilder.append((char) randomIntBetween(2048, 4092));
110+
correctSize += 3; // use three bytes for char >= 2048
111+
}
112+
}
113+
BytesRef bytesRef = (BytesRef) AbstractQueryBuilder.maybeConvertToBytesRef(termBuilder.toString());
114+
assertEquals(correctSize, bytesRef.bytes.length);
115+
assertEquals(correctSize, bytesRef.length);
116+
}
117+
96118
}

0 commit comments

Comments
 (0)