Skip to content

Commit 5667e5c

Browse files
committed
fix: Improve performance of the UTF-8 string comparison logic.
The semantics of this logic were originally fixed by #2275, but that fix caused a material performance degradation, which was then improved by #2299. The performance was, however, still suboptimal; this PR brings the speed back close to its original level and, serendipitously, simplifies the algorithm too. This commit is a port of firebase/firebase-js-sdk#9143.
1 parent cfa9fdc commit 5667e5c

File tree

1 file changed

+37
-44
lines changed

1 file changed

+37
-44
lines changed

dev/src/order.ts

Lines changed: 37 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -254,56 +254,49 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number {
254254
* @internal
255255
*/
256256
export function compareUtf8Strings(left: string, right: string): number {
// Compares two JavaScript (UTF-16) strings so that the result reflects the byte order of
// their UTF-8 encodings. The result is either the literal 1 / -1 or whatever
// `primitiveComparator` returns (defined outside this view; presumably a
// negative/zero/positive number -- confirm against its definition).
257-
let i = 0;
258-
while (i < left.length && i < right.length) {
259-
const leftCodePoint = left.codePointAt(i)!;
260-
const rightCodePoint = right.codePointAt(i)!;
261-
262-
if (leftCodePoint !== rightCodePoint) {
263-
if (leftCodePoint < 128 && rightCodePoint < 128) {
264-
// ASCII comparison
265-
return primitiveComparator(leftCodePoint, rightCodePoint);
266-
} else {
267-
// Lazy instantiate TextEncoder
268-
const encoder = new TextEncoder();
269-
270-
// UTF-8 encode the character at index i for byte comparison.
271-
const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i));
272-
const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i));
273-
const comp = compareBlobs(
274-
Buffer.from(leftBytes),
275-
Buffer.from(rightBytes)
276-
);
277-
if (comp !== 0) {
278-
return comp;
279-
} else {
280-
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte
281-
// representations are identical. This can happen with malformed input
282-
// (invalid surrogate pairs). The backend also actively prevents invalid
283-
// surrogates as INVALID_ARGUMENT errors, so we almost never receive
284-
// invalid strings from backend.
285-
// Fallback to code point comparison for graceful handling.
286-
return primitiveComparator(leftCodePoint, rightCodePoint);
287-
}
288-
}
257+
// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
258+
// if found, use that character to determine the relative ordering of the two strings as a
259+
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
260+
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
261+
// and UTF-16 happen to represent Unicode code points.
262+
//
263+
// After finding the first pair of differing characters, there are two cases:
264+
//
265+
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
266+
// both are surrogates from a surrogate pair (that collectively represent code points greater
267+
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
268+
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
269+
// sufficient.
270+
//
271+
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
272+
// containing string is always ordered after the non-surrogate. This is because surrogates are
273+
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
274+
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
275+
// less than or equal to 0xFFFF.
276+
// Only indices that exist in both strings are scanned; a longer tail on either side is
// accounted for by the length comparison after the loop.
const length = Math.min(left.length, right.length);
277+
for (let i = 0; i < length; i++) {
278+
const leftChar = left.charAt(i);
279+
const rightChar = right.charAt(i);
280+
if (leftChar !== rightChar) {
281+
// Same surrogate status on both sides (Case 1): compare the two one-character strings
// directly. Mixed status (Case 2): the side holding the surrogate sorts after the other,
// so return 1 when it is `left` and -1 when it is `right`.
return isSurrogate(leftChar) === isSurrogate(rightChar)
282+
? primitiveComparator(leftChar, rightChar)
283+
: isSurrogate(leftChar)
284+
? 1
285+
: -1;
289286
}
290-
// Increment by 2 for surrogate pairs, 1 otherwise
291-
i += leftCodePoint > 0xffff ? 2 : 1;
292287
}
293288

294-
// Compare lengths if all characters are equal
289+
// Use the lengths of the strings to determine the overall comparison result since either the
290+
// strings were equal or one is a prefix of the other.
295291
return primitiveComparator(left.length, right.length);
296292
}
297293

298-
function getUtf8SafeSubstring(str: string, index: number): string {
299-
const firstCodePoint = str.codePointAt(index)!;
300-
if (firstCodePoint > 0xffff) {
301-
// It's a surrogate pair, return the whole pair
302-
return str.substring(index, index + 2);
303-
} else {
304-
// It's a single code point, return it
305-
return str.substring(index, index + 1);
306-
}
294+
// Inclusive bounds of the UTF-16 surrogate code unit range (high and low surrogates together).
const MIN_SURROGATE = 0xd800;
295+
const MAX_SURROGATE = 0xdfff;
296+
297+
/**
 * Reports whether the first UTF-16 code unit of `s` is a surrogate, i.e. lies in
 * [MIN_SURROGATE, MAX_SURROGATE]. Only index 0 is inspected. For an empty string
 * `charCodeAt(0)` yields NaN, both comparisons fail, and the result is `false`.
 */
export function isSurrogate(s: string): boolean {
298+
const c = s.charCodeAt(0);
299+
return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
307300
}
308301

309302
/*!

0 commit comments

Comments
 (0)