@@ -94,18 +94,29 @@ public static int compareUtf8Strings(String left, String right) {
9494 return 0 ;
9595 }
9696
97- // Find the first differing characters in the strings and, if found, use it to determine the
98- // overall comparison result.
97+ // Find the first differing characters in the strings and, if found, use them to determine the
98+ // overall comparison result. This simple and efficient formula serendipitously works because
99+ // of the properties of UTF-8 and UTF-16 encodings; that is, if both UTF-16 characters are
100+ // surrogates or both are non-surrogates then the relative ordering of those individual
101+ // characters is the same as the lexicographical ordering of the UTF-8
102+ // encoding of those characters (or character pairs, in the case of surrogate pairs). Also, if
103+ // one is a surrogate and the other is not then the surrogate is assumed to be the high
104+ // surrogate of a pair (otherwise the input would not be valid UTF-16) and, therefore,
105+ // would necessarily be ordered _after_ the non-surrogate because all surrogate pairs represent
106+ // characters with code points above 0xFFFF and such characters produce a 4-byte UTF-8 encoding
107+ // whose first byte is 11110xxx, and since the other character is a non-surrogate it represents
108+ // a character with a code point less than or equal to 0xFFFF and produces a 1-byte, 2-byte, or
109+ // 3-byte UTF-8 encoding whose first (or only) byte is 0xxxxxxx, 110xxxxx, or 1110xxxx,
110+ // respectively, which is always less than 11110xxx when interpreted as an unsigned
111+ // integer.
99112 final int length = Math .min (left .length (), right .length ());
100113 for (int i = 0 ; i < length ; i ++) {
101114 final char leftChar = left .charAt (i );
102115 final char rightChar = right .charAt (i );
103116 if (leftChar != rightChar ) {
104- boolean leftIsSurrogate = isSurrogate (leftChar );
105- boolean rightIsSurrogate = isSurrogate (rightChar );
106- return (leftIsSurrogate == rightIsSurrogate )
117+ return (isSurrogate (leftChar ) == isSurrogate (rightChar ))
107118 ? Util .compareIntegers (leftChar , rightChar )
108- : leftIsSurrogate ? 1 : -1 ;
119+ : isSurrogate ( leftChar ) ? 1 : -1 ;
109120 }
110121 }
111122
0 commit comments