diff --git a/google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java b/google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java index 7f37d8e05..8cfd107d6 100644 --- a/google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java +++ b/google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java @@ -16,6 +16,8 @@ package com.google.cloud.firestore; +import static java.lang.Character.isSurrogate; + import com.google.firestore.v1.MapValue; import com.google.firestore.v1.Value; import com.google.firestore.v1.Value.ValueTypeCase; @@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) { /** Compare strings in UTF-8 encoded byte order */ public static int compareUtf8Strings(String left, String right) { - int i = 0; - while (i < left.length() && i < right.length()) { - int leftCodePoint = left.codePointAt(i); - int rightCodePoint = right.codePointAt(i); - - if (leftCodePoint != rightCodePoint) { - if (leftCodePoint < 128 && rightCodePoint < 128) { - // ASCII comparison - return Integer.compare(leftCodePoint, rightCodePoint); - } else { - // UTF-8 encode the character at index i for byte comparison. - ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i)); - ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i)); - int comp = compareByteStrings(leftBytes, rightBytes); - if (comp != 0) { - return comp; - } else { - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are - // identical. This can happen with malformed input (invalid surrogate pairs), where - // Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate - // inputs get converted to "?" by protocol buffer while round tripping, so we almost - // never receive invalid strings from backend. - // Fallback to code point comparison for graceful handling. - return Integer.compare(leftCodePoint, rightCodePoint); - } - } + // noinspection StringEquality + if (left == right) { + return 0; + } + + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, + // if found, use that character to determine the relative ordering of the two strings as a + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 + // and UTF-16 happen to represent Unicode code points. + // + // After finding the first pair of differing characters, there are two cases: + // + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or + // both are surrogates from a surrogate pair (that collectively represent code points greater + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is + // sufficient. + // + // Case 2: One character is a surrogate and the other is not. In this case the surrogate- + // containing string is always ordered after the non-surrogate. This is because surrogates are + // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations + // and are lexicographically greater than the 1, 2, or 3-byte representations of code points + // less than or equal to 0xFFFF. + final int length = Math.min(left.length(), right.length()); + for (int i = 0; i < length; i++) { + final char leftChar = left.charAt(i); + final char rightChar = right.charAt(i); + if (leftChar != rightChar) { + return (isSurrogate(leftChar) == isSurrogate(rightChar)) + ? Character.compare(leftChar, rightChar) + : isSurrogate(leftChar) ? 1 : -1; } - // Increment by 2 for surrogate pairs, 1 otherwise. - i += Character.charCount(leftCodePoint); } - // Compare lengths if all characters are equal + // Use the lengths of the strings to determine the overall comparison result since either the + // strings were equal or one is a prefix of the other. return Integer.compare(left.length(), right.length()); } - private static String getUtf8SafeBytes(String str, int index) { - int firstCodePoint = str.codePointAt(index); - return str.substring(index, index + Character.charCount(firstCodePoint)); - } - private int compareBlobs(Value left, Value right) { ByteString leftBytes = left.getBytesValue(); ByteString rightBytes = right.getBytesValue(); diff --git a/google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java b/google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java index 076cbf2db..65e19d745 100644 --- a/google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java +++ b/google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java @@ -218,7 +218,7 @@ public void compareUtf8StringsShouldReturnCorrectValue() { ByteString b2 = ByteString.copyFromUtf8(s2); int expected = compareByteStrings(b1, b2); - if (actual == expected) { + if (Integer.signum(actual) == Integer.signum(expected)) { passCount++; } else { errors.add( @@ -227,9 +227,12 @@ public void compareUtf8StringsShouldReturnCorrectValue() { + "\", s2=\"" + s2 + "\") returned " + + signName(actual) + + " (" + actual + + ")" + ", but expected " - + expected + + signName(expected) + " (i=" + i + ", s1.length=" @@ -252,6 +255,16 @@ public void compareUtf8StringsShouldReturnCorrectValue() { } } + private static String signName(int value) { + if (value < 0) { + return "a negative value"; + } else if (value > 0) { + return "a positive value"; + } else { + return "0"; + } + } + private static class StringPairGenerator { private final StringGenerator stringGenerator;