/*!
 * Maps a code point obtained from `String.prototype.codePointAt` to the
 * scalar value that a UTF-8 encoder emits for it.
 *
 * Every code point maps to itself except an unpaired surrogate
 * (U+D800..U+DFFF), which UTF-8 encoders such as `TextEncoder` replace with
 * U+FFFD. Because UTF-8 preserves code point order under byte-wise
 * comparison, comparing these keys numerically gives the same answer as
 * comparing the encoded bytes.
 * @private
 * @internal
 */
function utf8OrderKey(codePoint: number): number {
  const isUnpairedSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  return isUnpairedSurrogate ? 0xfffd : codePoint;
}

/*!
 * Compare strings in UTF-8 encoded byte order
 *
 * The strings are walked one code point at a time and the first differing
 * code point decides the result; no bytes are ever materialized. If one
 * string is a prefix of the other, the shorter string sorts first.
 *
 * @param left The first string to compare.
 * @param right The second string to compare.
 * @returns -1 if `left` sorts before `right`, 1 if it sorts after, and 0 if
 * the two strings are identical.
 * @private
 * @internal
 */
export function compareUtf8Strings(left: string, right: string): number {
  const sharedLength = Math.min(left.length, right.length);

  let i = 0;
  while (i < sharedLength) {
    // `i` is within bounds of both strings, so `codePointAt` cannot return
    // `undefined` here.
    const leftCodePoint = left.codePointAt(i)!;
    const rightCodePoint = right.codePointAt(i)!;

    if (leftCodePoint !== rightCodePoint) {
      const byEncodedOrder = Math.sign(
        utf8OrderKey(leftCodePoint) - utf8OrderKey(rightCodePoint)
      );
      // The keys can only tie for malformed input (an unpaired surrogate on
      // at least one side, which encodes like U+FFFD). Fall back to the raw
      // code points so that two different strings never compare as equal.
      return byEncodedOrder !== 0
        ? byEncodedOrder
        : Math.sign(leftCodePoint - rightCodePoint);
    }

    // A code point above U+FFFF occupies two UTF-16 code units.
    i += leftCodePoint > 0xffff ? 2 : 1;
  }

  // All shared code points are equal: the shorter string sorts first.
  return Math.sign(left.length - right.length);
}
/** An immutable pair of strings produced by `StringPairGenerator`. */
class StringPair {
  constructor(
    readonly s1: string,
    readonly s2: string
  ) {}
}

/**
 * Produces pairs of random strings that start with a common random prefix,
 * so that comparisons have to look past an equal leading section.
 */
class StringPairGenerator {
  constructor(private stringGenerator: StringGenerator) {}

  /** Returns a new pair; both members share one freshly generated prefix. */
  next(): StringPair {
    const prefix = this.stringGenerator.next();
    const s1 = prefix + this.stringGenerator.next();
    const s2 = prefix + this.stringGenerator.next();
    return new StringPair(s1, s2);
  }
}

/**
 * Generates random well-formed strings that mix BMP characters with
 * supplementary characters (surrogate pairs).
 */
class StringGenerator {
  private static readonly DEFAULT_SURROGATE_PAIR_PROBABILITY = 0.33;
  private static readonly DEFAULT_MAX_LENGTH = 20;

  private readonly rnd: Random;
  private readonly surrogatePairProbability: number;
  private readonly maxLength: number;

  /** Creates a generator with default settings from a numeric seed. */
  constructor(seed: number);
  /**
   * Creates a generator from an existing random source.
   *
   * @param rnd Source of randomness.
   * @param surrogatePairProbability Probability, in [0, 1], that a generated
   * code point is a supplementary character.
   * @param maxLength Non-negative integer bounding the target length.
   * @throws Error if the probability or the length is out of range.
   */
  constructor(rnd: Random, surrogatePairProbability: number, maxLength: number);
  constructor(
    seedOrRnd: number | Random,
    surrogatePairProbability?: number,
    maxLength?: number
  ) {
    if (typeof seedOrRnd === 'number') {
      this.rnd = new Random(seedOrRnd);
      this.surrogatePairProbability =
        StringGenerator.DEFAULT_SURROGATE_PAIR_PROBABILITY;
      this.maxLength = StringGenerator.DEFAULT_MAX_LENGTH;
    } else {
      this.rnd = seedOrRnd;
      this.surrogatePairProbability = StringGenerator.validateProbability(
        surrogatePairProbability
      );
      this.maxLength = StringGenerator.validateLength(maxLength);
    }
  }

  private static validateProbability(probability: number | undefined): number {
    if (probability === undefined || !Number.isFinite(probability)) {
      throw new Error(
        `invalid surrogate pair probability: ${probability} (must be between 0.0 and 1.0, inclusive)`
      );
    } else if (probability < 0.0) {
      throw new Error(
        `invalid surrogate pair probability: ${probability} (must be greater than or equal to zero)`
      );
    } else if (probability > 1.0) {
      throw new Error(
        `invalid surrogate pair probability: ${probability} (must be less than or equal to 1)`
      );
    }
    return probability;
  }

  private static validateLength(length: number | undefined): number {
    // NaN, Infinity and fractions are rejected up front: an infinite maximum
    // would make `next()` loop forever and NaN would silently yield only
    // empty strings.
    if (length === undefined || !Number.isInteger(length)) {
      throw new Error(
        `invalid maximum string length: ${length} (must be an integer)`
      );
    }
    if (length < 0) {
      throw new Error(
        `invalid maximum string length: ${length} (must be greater than or equal to zero)`
      );
    }
    return length;
  }

  /**
   * Returns a random string. The target length is drawn from
   * [0, maxLength] and measured in UTF-16 code units; because a surrogate
   * pair adds two units at once, the result can exceed the target by one.
   */
  next(): string {
    const length = this.rnd.nextInt(this.maxLength + 1);
    const sb = new StringBuilder();
    while (sb.length() < length) {
      sb.appendCodePoint(this.nextCodePoint());
    }
    return sb.toString();
  }

  private isNextSurrogatePair(): boolean {
    return StringGenerator.nextBoolean(this.rnd, this.surrogatePairProbability);
  }

  private static nextBoolean(rnd: Random, probability: number): boolean {
    // The two extremes are answered without consuming a random number.
    if (probability === 0.0) {
      return false;
    } else if (probability === 1.0) {
      return true;
    } else {
      return rnd.nextFloat() < probability;
    }
  }

  private nextCodePoint(): number {
    return this.isNextSurrogatePair()
      ? this.nextSurrogateCodePoint()
      : this.nextNonSurrogateCodePoint();
  }

  /** Returns a random supplementary code point (U+10000..U+10FFFF). */
  private nextSurrogateCodePoint(): number {
    const highSurrogateMin = 0xd800;
    const highSurrogateMax = 0xdbff;
    const lowSurrogateMin = 0xdc00;
    const lowSurrogateMax = 0xdfff;

    const highSurrogate = this.nextCodePointRange(
      highSurrogateMin,
      highSurrogateMax
    );
    const lowSurrogate = this.nextCodePointRange(
      lowSurrogateMin,
      lowSurrogateMax
    );

    // Combine the two UTF-16 surrogates into the code point they encode.
    return (
      (highSurrogate - highSurrogateMin) * 0x400 +
      (lowSurrogate - lowSurrogateMin) +
      0x10000
    );
  }

  /** Returns a random BMP code point that is not itself a surrogate. */
  private nextNonSurrogateCodePoint(): number {
    let codePoint: number;
    do {
      codePoint = this.nextCodePointRange(0, 0xffff); // BMP range
    } while (codePoint >= 0xd800 && codePoint <= 0xdfff); // Exclude surrogate range

    return codePoint;
  }

  /** Returns a random integer in the inclusive range [min, max]. */
  private nextCodePointRange(min: number, max: number): number {
    const rangeSize = max - min + 1;
    return min + this.rnd.nextInt(rangeSize);
  }
}

/**
 * Small deterministic pseudo-random number generator (Mulberry32) for
 * reproducible tests.
 *
 * The state is a full 32-bit integer, so the sequence does not start
 * repeating after a few hundred thousand draws the way a generator with a
 * small modulus does. Not suitable for anything security related.
 */
class Random {
  private state: number;

  /**
   * @param seed Any number; it is reduced to an unsigned 32-bit integer, so
   * seeds above 2^32 are accepted without losing determinism.
   */
  constructor(seed: number) {
    this.state = seed >>> 0;
  }

  /** Returns an integer in [0, max). */
  nextInt(max: number): number {
    return Math.floor(this.nextFloat() * max);
  }

  /** Returns a float in [0, 1). */
  nextFloat(): number {
    // Keep the state inside uint32 so it never loses integer precision.
    this.state = (this.state + 0x6d2b79f5) >>> 0;
    let t = this.state;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  }
}

/** Minimal string accumulator that tracks its length as it grows. */
class StringBuilder {
  private readonly buffer: string[] = [];
  // Running total of UTF-16 code units appended so far; avoids re-joining
  // the whole buffer every time the length is queried.
  private totalLength = 0;

  append(str: string): StringBuilder {
    this.buffer.push(str);
    this.totalLength += str.length;
    return this;
  }

  appendCodePoint(codePoint: number): StringBuilder {
    return this.append(String.fromCodePoint(codePoint));
  }

  toString(): string {
    return this.buffer.join('');
  }

  /** Returns the accumulated length in UTF-16 code units. */
  length(): number {
    return this.totalLength;
  }
}
describe('CompareUtf8Strings', () => {
  // Randomized check: for many generated string pairs, compareUtf8Strings()
  // must agree with a byte-wise comparison of the UTF-8 encoded strings.
  it('compareUtf8Strings should return correct results', () => {
    const iterations = 1000000;
    // Stop collecting after this many failures to keep the report readable.
    const maxReportedFailures = 10;

    const failures: string[] = [];
    // The seed is part of the failure message so a failing run can be replayed.
    const seed = Math.floor(Math.random() * Number.MAX_SAFE_INTEGER);
    let passCount = 0;
    const stringGenerator = new StringGenerator(new Random(seed), 0.33, 20);
    const stringPairGenerator = new StringPairGenerator(stringGenerator);

    for (
      let i = 0;
      i < iterations && failures.length < maxReportedFailures;
      i++
    ) {
      const {s1, s2} = stringPairGenerator.next();

      const actual = order.compareUtf8Strings(s1, s2);
      const expected = Buffer.from(s1, 'utf8').compare(Buffer.from(s2, 'utf8'));

      if (actual === expected) {
        passCount++;
      } else {
        // JSON.stringify escapes control characters, so the failing inputs
        // can be copied out of the report verbatim.
        failures.push(
          `compareUtf8Strings(s1=${JSON.stringify(s1)}, s2=${JSON.stringify(s2)}) returned ${actual}, ` +
            `but expected ${expected} (i=${i}, s1.length=${s1.length}, s2.length=${s2.length})`
        );
      }
    }

    if (failures.length > 0) {
      // Put everything into the thrown error so the test reporter shows the
      // seed and the failing inputs, not just a generic message.
      const details = failures
        .map((failure, index) => `errors[${index}]: ${failure}`)
        .join('\n');
      throw new Error(
        `${failures.length} test cases failed, ${passCount} test cases passed, seed=${seed};\n` +
          details
      );
    }
  }).timeout(20000);
});