[SPARK-36992][SQL] Improve byte array sort perf by unifying the getPrefix function of UTF8String and ByteArray

ulysses-you · srowen · commit f9cc7fbbad5f · 2021-10-16T09:53:51.000-05:00
### What changes were proposed in this pull request? Unify the getPrefix function of `UTF8String` and `ByteArray`. ### Why are the changes needed? When executing the sort operator, we first compare the prefixes. However, the getPrefix function for byte arrays is slow. We use the first 8 bytes as the prefix, so we call `Platform.getByte` up to 8 times, which is slower than a single call to `Platform.getInt` or `Platform.getLong`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Passed `org.apache.spark.util.collection.unsafe.sort.PrefixComparatorsSuite`. Closes apache#34267 from ulysses-you/binary-prefix. Authored-by: ulysses-you <ulyssesyou18@gmail.com> Signed-off-by: Sean Owen <srowen@gmail.com>
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
@@ -17,6 +17,7 @@
 
 package org.apache.spark.unsafe.types;
 
+import java.nio.ByteOrder;
 import java.util.Arrays;
 
 import com.google.common.primitives.Ints;
@@ -26,6 +27,8 @@
 public final class ByteArray {
 
   public static final byte[] EMPTY_BYTE = new byte[0];
+  private static final boolean IS_LITTLE_ENDIAN =
+      ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
 
   /**
    * Writes the content of a byte array into a memory address, identified by an object and an
@@ -42,15 +45,34 @@ public static void writeToMemory(byte[] src, Object target, long targetOffset) {
   public static long getPrefix(byte[] bytes) {
     if (bytes == null) {
       return 0L;
+    }
+    return getPrefix(bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length);
+  }
+
+  static long getPrefix(Object base, long offset, int numBytes) {
+    // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the bytes.
+    // If size is 0, just return 0.
+    // If size is between 1 and 4 (inclusive), assume data is 4-byte aligned under the hood and
+    // use a getInt to fetch the prefix.
+    // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
+    // After getting the data, we use a mask to mask out data that is not part of the bytes.
+    final long p;
+    final long mask;
+    if (numBytes >= 8) {
+      p = Platform.getLong(base, offset);
+      mask = 0;
+    } else if (numBytes > 4) {
+      p = Platform.getLong(base, offset);
+      mask = (1L << (8 - numBytes) * 8) - 1;
+    } else if (numBytes > 0) {
+      long pRaw = Platform.getInt(base, offset);
+      p = IS_LITTLE_ENDIAN ? pRaw : (pRaw << 32);
+      mask = (1L << (8 - numBytes) * 8) - 1;
     } else {
-      final int minLen = Math.min(bytes.length, 8);
-      long p = 0;
-      for (int i = 0; i < minLen; ++i) {
-        p |= ((long) Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i) & 0xff)
-            << (56 - 8 * i);
-      }
-      return p;
+      p = 0;
+      mask = 0;
     }
+    return (IS_LITTLE_ENDIAN ? java.lang.Long.reverseBytes(p) : p) & ~mask;
   }
 
   public static byte[] subStringSQL(byte[] bytes, int pos, int len) {
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -246,43 +246,7 @@ public int numChars() {
    * Returns a 64-bit integer that can be used as the prefix used in sorting.
    */
   public long getPrefix() {
-    // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
-    // If size is 0, just return 0.
-    // If size is between 0 and 4 (inclusive), assume data is 4-byte aligned under the hood and
-    // use a getInt to fetch the prefix.
-    // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
-    // After getting the data, we use a mask to mask out data that is not part of the string.
-    long p;
-    long mask = 0;
-    if (IS_LITTLE_ENDIAN) {
-      if (numBytes >= 8) {
-        p = Platform.getLong(base, offset);
-      } else if (numBytes > 4) {
-        p = Platform.getLong(base, offset);
-        mask = (1L << (8 - numBytes) * 8) - 1;
-      } else if (numBytes > 0) {
-        p = (long) Platform.getInt(base, offset);
-        mask = (1L << (8 - numBytes) * 8) - 1;
-      } else {
-        p = 0;
-      }
-      p = java.lang.Long.reverseBytes(p);
-    } else {
-      // byteOrder == ByteOrder.BIG_ENDIAN
-      if (numBytes >= 8) {
-        p = Platform.getLong(base, offset);
-      } else if (numBytes > 4) {
-        p = Platform.getLong(base, offset);
-        mask = (1L << (8 - numBytes) * 8) - 1;
-      } else if (numBytes > 0) {
-        p = ((long) Platform.getInt(base, offset)) << 32;
-        mask = (1L << (8 - numBytes) * 8) - 1;
-      } else {
-        p = 0;
-      }
-    }
-    p &= ~mask;
-    return p;
+    return ByteArray.getPrefix(base, offset, numBytes);
   }
 
   /**
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/array/ByteArraySuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/ByteArraySuite.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.array;
+
+import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.types.ByteArray;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class ByteArraySuite {
+  private long getPrefixByByte(byte[] bytes) {
+    final int minLen = Math.min(bytes.length, 8);
+    long p = 0;
+    for (int i = 0; i < minLen; ++i) {
+      p |= ((long) Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i) & 0xff)
+              << (56 - 8 * i);
+    }
+    return p;
+  }
+
+  @Test
+  public void testGetPrefix() {
+    for (int i = 0; i <= 9; i++) {
+      byte[] bytes = new byte[i];
+      int prefix = i - 1;
+      while (prefix >= 0) {
+        bytes[prefix] = (byte) prefix;
+        prefix -= 1;
+      }
+
+      long result = ByteArray.getPrefix(bytes);
+      long expected = getPrefixByByte(bytes);
+      Assert.assertEquals(result, expected);
+    }
+  }
+}