dconeybe · dconeybe · Jul 7, 2025
diff --git a/google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java b/google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java
@@ -16,6 +16,8 @@
 
 package com.google.cloud.firestore;
 
+import static java.lang.Character.isSurrogate;
+
 import com.google.firestore.v1.MapValue;
 import com.google.firestore.v1.Value;
 import com.google.firestore.v1.Value.ValueTypeCase;
@@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) {
 
   /** Compare strings in UTF-8 encoded byte order */
   public static int compareUtf8Strings(String left, String right) {
-    int i = 0;
-    while (i < left.length() && i < right.length()) {
-      int leftCodePoint = left.codePointAt(i);
-      int rightCodePoint = right.codePointAt(i);
-
-      if (leftCodePoint != rightCodePoint) {
-        if (leftCodePoint < 128 && rightCodePoint < 128) {
-          // ASCII comparison
-          return Integer.compare(leftCodePoint, rightCodePoint);
-        } else {
-          // UTF-8 encode the character at index i for byte comparison.
-          ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i));
-          ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i));
-          int comp = compareByteStrings(leftBytes, rightBytes);
-          if (comp != 0) {
-            return comp;
-          } else {
-            // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are
-            // identical. This can happen with malformed input (invalid surrogate pairs), where
-            // Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate
-            // inputs get converted to "?" by protocol buffer while round tripping, so we almost
-            // never receive invalid strings from backend.
-            // Fallback to code point comparison for graceful handling.
-            return Integer.compare(leftCodePoint, rightCodePoint);
-          }
-        }
+    // noinspection StringEquality
+    if (left == right) {
+      return 0;
+    }
+
+    // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
+    // if found, use that character to determine the relative ordering of the two strings as a
+    // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
+    // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
+    // and UTF-16 happen to represent Unicode code points.
+    //
+    // After finding the first pair of differing characters, there are two cases:
+    //
+    // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
+    // both are surrogates from a surrogate pair (that collectively represent code points greater
+    // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
+    // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
+    // sufficient.
+    //
+    // Case 2: One character is a surrogate and the other is not. In this case the surrogate-
+    // containing string is always ordered after the non-surrogate. This is because surrogates are
+    // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
+    // and are lexicographically greater than the 1, 2, or 3-byte representations of code points
+    // less than or equal to 0xFFFF.
+    final int length = Math.min(left.length(), right.length());
+    for (int i = 0; i < length; i++) {
+      final char leftChar = left.charAt(i);
+      final char rightChar = right.charAt(i);
+      if (leftChar != rightChar) {
+        return (isSurrogate(leftChar) == isSurrogate(rightChar))
+            ? Character.compare(leftChar, rightChar)
+            : isSurrogate(leftChar) ? 1 : -1;
       }
-      // Increment by 2 for surrogate pairs, 1 otherwise.
-      i += Character.charCount(leftCodePoint);
     }
 
-    // Compare lengths if all characters are equal
+    // Use the lengths of the strings to determine the overall comparison result since either the
+    // strings were equal or one is a prefix of the other.
     return Integer.compare(left.length(), right.length());
   }
 
-  private static String getUtf8SafeBytes(String str, int index) {
-    int firstCodePoint = str.codePointAt(index);
-    return str.substring(index, index + Character.charCount(firstCodePoint));
-  }
-
   private int compareBlobs(Value left, Value right) {
     ByteString leftBytes = left.getBytesValue();
     ByteString rightBytes = right.getBytesValue();

diff --git a/google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java b/google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java
@@ -218,7 +218,7 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
       ByteString b2 = ByteString.copyFromUtf8(s2);
       int expected = compareByteStrings(b1, b2);
 
-      if (actual == expected) {
+      if (Integer.signum(actual) == Integer.signum(expected)) {
         passCount++;
       } else {
         errors.add(
@@ -227,9 +227,12 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
                 + "\", s2=\""
                 + s2
                 + "\") returned "
+                + signName(actual)
+                + " ("
                 + actual
+                + ")"
                 + ", but expected "
-                + expected
+                + signName(expected)
                 + " (i="
                 + i
                 + ", s1.length="
@@ -252,6 +255,16 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
     }
   }
 
+  private static String signName(int value) {
+    if (value < 0) {
+      return "a negative value";
+    } else if (value > 0) {
+      return "a positive value";
+    } else {
+      return "0";
+    }
+  }
+
   private static class StringPairGenerator {
 
     private final StringGenerator stringGenerator;