Skip to content

Commit 869a381

Browse files
authored
fix: Improve performance of the UTF-8 string comparison logic. (#2182)
1 parent 54ce2c7 commit 869a381

File tree

2 files changed

+51
-36
lines changed
  • google-cloud-firestore/src

2 files changed

+51
-36
lines changed

google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java

Lines changed: 36 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
package com.google.cloud.firestore;
1818

19+
import static java.lang.Character.isSurrogate;
20+
1921
import com.google.firestore.v1.MapValue;
2022
import com.google.firestore.v1.Value;
2123
import com.google.firestore.v1.Value.ValueTypeCase;
@@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) {
136138

137139
/** Compare strings in UTF-8 encoded byte order */
138140
public static int compareUtf8Strings(String left, String right) {
139-
int i = 0;
140-
while (i < left.length() && i < right.length()) {
141-
int leftCodePoint = left.codePointAt(i);
142-
int rightCodePoint = right.codePointAt(i);
143-
144-
if (leftCodePoint != rightCodePoint) {
145-
if (leftCodePoint < 128 && rightCodePoint < 128) {
146-
// ASCII comparison
147-
return Integer.compare(leftCodePoint, rightCodePoint);
148-
} else {
149-
// UTF-8 encode the character at index i for byte comparison.
150-
ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i));
151-
ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i));
152-
int comp = compareByteStrings(leftBytes, rightBytes);
153-
if (comp != 0) {
154-
return comp;
155-
} else {
156-
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are
157-
// identical. This can happen with malformed input (invalid surrogate pairs), where
158-
// Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate
159-
// inputs get converted to "?" by protocol buffer while round tripping, so we almost
160-
// never receive invalid strings from backend.
161-
// Fallback to code point comparison for graceful handling.
162-
return Integer.compare(leftCodePoint, rightCodePoint);
163-
}
164-
}
141+
// noinspection StringEquality
142+
if (left == right) {
143+
return 0;
144+
}
145+
146+
// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
147+
// if found, use that character to determine the relative ordering of the two strings as a
148+
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
149+
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
150+
// and UTF-16 happen to represent Unicode code points.
151+
//
152+
// After finding the first pair of differing characters, there are two cases:
153+
//
154+
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
155+
// both are surrogates from a surrogate pair (that collectively represent code points greater
156+
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
157+
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
158+
// sufficient.
159+
//
160+
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
161+
// containing string is always ordered after the non-surrogate. This is because surrogates are
162+
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
163+
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
164+
// less than or equal to 0xFFFF.
165+
final int length = Math.min(left.length(), right.length());
166+
for (int i = 0; i < length; i++) {
167+
final char leftChar = left.charAt(i);
168+
final char rightChar = right.charAt(i);
169+
if (leftChar != rightChar) {
170+
return (isSurrogate(leftChar) == isSurrogate(rightChar))
171+
? Character.compare(leftChar, rightChar)
172+
: isSurrogate(leftChar) ? 1 : -1;
165173
}
166-
// Increment by 2 for surrogate pairs, 1 otherwise.
167-
i += Character.charCount(leftCodePoint);
168174
}
169175

170-
// Compare lengths if all characters are equal
176+
// Use the lengths of the strings to determine the overall comparison result since either the
177+
// strings were equal or one is a prefix of the other.
171178
return Integer.compare(left.length(), right.length());
172179
}
173180

174-
private static String getUtf8SafeBytes(String str, int index) {
175-
int firstCodePoint = str.codePointAt(index);
176-
return str.substring(index, index + Character.charCount(firstCodePoint));
177-
}
178-
179181
private int compareBlobs(Value left, Value right) {
180182
ByteString leftBytes = left.getBytesValue();
181183
ByteString rightBytes = right.getBytesValue();

google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
218218
ByteString b2 = ByteString.copyFromUtf8(s2);
219219
int expected = compareByteStrings(b1, b2);
220220

221-
if (actual == expected) {
221+
if (Integer.signum(actual) == Integer.signum(expected)) {
222222
passCount++;
223223
} else {
224224
errors.add(
@@ -227,9 +227,12 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
227227
+ "\", s2=\""
228228
+ s2
229229
+ "\") returned "
230+
+ signName(actual)
231+
+ " ("
230232
+ actual
233+
+ ")"
231234
+ ", but expected "
232-
+ expected
235+
+ signName(expected)
233236
+ " (i="
234237
+ i
235238
+ ", s1.length="
@@ -252,6 +255,16 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
252255
}
253256
}
254257

258+
private static String signName(int value) {
259+
if (value < 0) {
260+
return "a negative value";
261+
} else if (value > 0) {
262+
return "a positive value";
263+
} else {
264+
return "0";
265+
}
266+
}
267+
255268
private static class StringPairGenerator {
256269

257270
private final StringGenerator stringGenerator;

0 commit comments

Comments
 (0)