@@ -254,56 +254,49 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number {
254
254
* @internal
255
255
*/
256
256
export function compareUtf8Strings(left: string, right: string): number {
  // Orders two JavaScript (UTF-16) strings the way their UTF-8 encodings would
  // sort byte-by-byte, without actually encoding either string.
  //
  // The scan looks for the first index at which the two strings hold different
  // UTF-16 code units; that single pair decides the ordering of the strings as
  // a whole. Two cases arise:
  //
  // Case 1: Both code units are non-surrogates (code points <= 0xFFFF), or both
  // are surrogates (halves of pairs that encode code points > 0xFFFF). Their
  // order as UTF-16 code units matches the lexicographical order of the
  // corresponding UTF-8 byte sequences, so a direct comparison is sufficient.
  //
  // Case 2: Exactly one code unit is a surrogate. The string holding the
  // surrogate sorts after the other one, because surrogates encode code points
  // > 0xFFFF, whose 4-byte UTF-8 form is lexicographically greater than the
  // 1, 2, or 3-byte forms used for code points <= 0xFFFF.
  //
  // NOTE(review): the reasoning above assumes well-formed strings (surrogates
  // only appear as proper pairs) - confirm how lone surrogates should sort.
  const sharedLength = Math.min(left.length, right.length);
  for (let i = 0; i < sharedLength; i++) {
    const leftChar = left.charAt(i);
    const rightChar = right.charAt(i);
    if (leftChar === rightChar) {
      continue;
    }

    // Classify each side once; the outcome depends only on these two flags
    // and, when they agree, on the raw code-unit order.
    const leftIsSurrogate = isSurrogate(leftChar);
    const rightIsSurrogate = isSurrogate(rightChar);
    if (leftIsSurrogate === rightIsSurrogate) {
      // Case 1: same class on both sides, compare the code units directly.
      return primitiveComparator(leftChar, rightChar);
    }

    // Case 2: whichever side holds the surrogate sorts last.
    return leftIsSurrogate ? 1 : -1;
  }

  // No differing code unit within the shared length: the strings are either
  // equal or one is a prefix of the other, so the lengths decide the result.
  return primitiveComparator(left.length, right.length);
}
297
293
298
/** Lowest UTF-16 code unit value that is a surrogate. */
const MIN_SURROGATE = 0xd800;
/** Highest UTF-16 code unit value that is a surrogate. */
const MAX_SURROGATE = 0xdfff;

/**
 * Reports whether the first UTF-16 code unit of `s` lies in the surrogate
 * range (`MIN_SURROGATE` to `MAX_SURROGATE`, inclusive).
 *
 * Only the code unit at index 0 is inspected; any further content of `s` is
 * ignored. For an empty string `charCodeAt(0)` yields `NaN`, both range
 * comparisons fail, and the result is `false`.
 *
 * @param s - String whose first code unit is classified.
 * @returns `true` if that code unit is a surrogate, `false` otherwise.
 */
export function isSurrogate(s: string): boolean {
  const c = s.charCodeAt(0);
  return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
}
308
301
309
302
/*!
0 commit comments