|
14 | 14 |
|
15 | 15 | package com.google.firebase.firestore.util; |
16 | 16 |
|
| 17 | +import static java.lang.Character.isSurrogate; |
| 18 | + |
17 | 19 | import android.annotation.SuppressLint; |
18 | 20 | import android.os.Handler; |
19 | 21 | import android.os.Looper; |
@@ -87,46 +89,46 @@ public static int compareIntegers(int i1, int i2) { |
87 | 89 |
|
88 | 90 | /** Compare strings in UTF-8 encoded byte order */ |
89 | 91 | public static int compareUtf8Strings(String left, String right) { |
90 | | - int i = 0; |
91 | | - while (i < left.length() && i < right.length()) { |
92 | | - int leftCodePoint = left.codePointAt(i); |
93 | | - int rightCodePoint = right.codePointAt(i); |
94 | | - |
95 | | - if (leftCodePoint != rightCodePoint) { |
96 | | - if (leftCodePoint < 128 && rightCodePoint < 128) { |
97 | | - // ASCII comparison |
98 | | - return Integer.compare(leftCodePoint, rightCodePoint); |
99 | | - } else { |
100 | | - // substring and do UTF-8 encoded byte comparison |
101 | | - ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i)); |
102 | | - ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i)); |
103 | | - int comp = compareByteStrings(leftBytes, rightBytes); |
104 | | - if (comp != 0) { |
105 | | - return comp; |
106 | | - } else { |
107 | | - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are |
108 | | - // identical. This can happen with malformed input (invalid surrogate pairs), where |
109 | | - // Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate |
110 | | - // inputs get converted to "?" by protocol buffer while round tripping, so we almost |
111 | | - // never receive invalid strings from backend. |
112 | | - // Fallback to code point comparison for graceful handling. |
113 | | - return Integer.compare(leftCodePoint, rightCodePoint); |
114 | | - } |
115 | | - } |
| 92 | + // noinspection StringEquality |
| 93 | + if (left == right) { |
| 94 | + return 0; |
| 95 | + } |
| 96 | + |
| 97 | + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, |
| 98 | + // if found, use that character to determine the relative ordering of the two strings as a |
| 99 | + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by |
| 100 | + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 |
| 101 | + // and UTF-16 happen to represent Unicode code points. |
| 102 | + // |
| 103 | + // After finding the first pair of differing characters, there are two cases: |
| 104 | + // |
| 105 | + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or |
| 106 | + // both are surrogates from a surrogate pair (that collectively represent code points greater |
| 107 | + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the |
| 108 | + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is |
| 109 | + // sufficient. |
| 110 | + // |
| 111 | + // Case 2: One character is a surrogate and the other is not. In this case the surrogate- |
| 112 | + // containing string is always ordered after the non-surrogate. This is because surrogates are |
| 113 | + // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations |
| 114 | + // and are lexicographically greater than the 1, 2, or 3-byte representations of code points |
| 115 | + // less than or equal to 0xFFFF. |
| 116 | + final int length = Math.min(left.length(), right.length()); |
| 117 | + for (int i = 0; i < length; i++) { |
| 118 | + final char leftChar = left.charAt(i); |
| 119 | + final char rightChar = right.charAt(i); |
| 120 | + if (leftChar != rightChar) { |
| 121 | + return (isSurrogate(leftChar) == isSurrogate(rightChar)) |
| 122 | + ? Util.compareIntegers(leftChar, rightChar) |
| 123 | + : isSurrogate(leftChar) ? 1 : -1; |
116 | 124 | } |
117 | | - // Increment by 2 for surrogate pairs, 1 otherwise. |
118 | | - i += Character.charCount(leftCodePoint); |
119 | 125 | } |
120 | 126 |
|
121 | | - // Compare lengths if all characters are equal |
| 127 | + // Use the lengths of the strings to determine the overall comparison result since either the |
| 128 | + // strings were equal or one is a prefix of the other. |
122 | 129 | return Integer.compare(left.length(), right.length()); |
123 | 130 | } |
124 | 131 |
|
125 | | - private static String getUtf8SafeBytes(String str, int index) { |
126 | | - int firstCodePoint = str.codePointAt(index); |
127 | | - return str.substring(index, index + Character.charCount(firstCodePoint)); |
128 | | - } |
129 | | - |
130 | 132 | /** |
131 | 133 | * Utility function to compare longs. Note that we can't use Long.compare because it's only |
132 | 134 | * available after Android 19. |
|
0 commit comments