Skip to content

fix: Further improve performance of the UTF-8 string comparison logic #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

package com.google.cloud.firestore;

import static java.lang.Character.isSurrogate;

import com.google.firestore.v1.MapValue;
import com.google.firestore.v1.Value;
import com.google.firestore.v1.Value.ValueTypeCase;
Expand Down Expand Up @@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) {

/** Compare strings in UTF-8 encoded byte order */
public static int compareUtf8Strings(String left, String right) {
int i = 0;
while (i < left.length() && i < right.length()) {
int leftCodePoint = left.codePointAt(i);
int rightCodePoint = right.codePointAt(i);

if (leftCodePoint != rightCodePoint) {
if (leftCodePoint < 128 && rightCodePoint < 128) {
// ASCII comparison
return Integer.compare(leftCodePoint, rightCodePoint);
} else {
// UTF-8 encode the character at index i for byte comparison.
ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i));
ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i));
int comp = compareByteStrings(leftBytes, rightBytes);
if (comp != 0) {
return comp;
} else {
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are
// identical. This can happen with malformed input (invalid surrogate pairs), where
// Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate
// inputs get converted to "?" by protocol buffer while round tripping, so we almost
// never receive invalid strings from backend.
// Fallback to code point comparison for graceful handling.
return Integer.compare(leftCodePoint, rightCodePoint);
}
}
// noinspection StringEquality
if (left == right) {
return 0;
}

// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
// if found, use that character to determine the relative ordering of the two strings as a
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
// and UTF-16 happen to represent Unicode code points.
//
// After finding the first pair of differing characters, there are two cases:
//
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
// both are surrogates from a surrogate pair (that collectively represent code points greater
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
// sufficient.
//
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
// containing string is always ordered after the non-surrogate. This is because surrogates are
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
// less than or equal to 0xFFFF.
final int length = Math.min(left.length(), right.length());
for (int i = 0; i < length; i++) {
final char leftChar = left.charAt(i);
final char rightChar = right.charAt(i);
if (leftChar != rightChar) {
return (isSurrogate(leftChar) == isSurrogate(rightChar))
? Character.compare(leftChar, rightChar)
: isSurrogate(leftChar) ? 1 : -1;
}
// Increment by 2 for surrogate pairs, 1 otherwise.
i += Character.charCount(leftCodePoint);
}

// Compare lengths if all characters are equal
// Use the lengths of the strings to determine the overall comparison result since either the
// strings were equal or one is a prefix of the other.
return Integer.compare(left.length(), right.length());
}

private static String getUtf8SafeBytes(String str, int index) {
int firstCodePoint = str.codePointAt(index);
return str.substring(index, index + Character.charCount(firstCodePoint));
}

private int compareBlobs(Value left, Value right) {
ByteString leftBytes = left.getBytesValue();
ByteString rightBytes = right.getBytesValue();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
ByteString b2 = ByteString.copyFromUtf8(s2);
int expected = compareByteStrings(b1, b2);

if (actual == expected) {
if (Integer.signum(actual) == Integer.signum(expected)) {
passCount++;
} else {
errors.add(
Expand All @@ -227,9 +227,12 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
+ "\", s2=\""
+ s2
+ "\") returned "
+ signName(actual)
+ " ("
+ actual
+ ")"
+ ", but expected "
+ expected
+ signName(expected)
+ " (i="
+ i
+ ", s1.length="
Expand All @@ -252,6 +255,16 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
}
}

private static String signName(int value) {
if (value < 0) {
return "a negative value";
} else if (value > 0) {
return "a positive value";
} else {
return "0";
}
}

private static class StringPairGenerator {

private final StringGenerator stringGenerator;
Expand Down