
Commit bebd2e1

Feng Liu authored and gatorsmile committed
[SPARK-22222][CORE] Fix the ARRAY_MAX in BufferHolder and add a test
## What changes were proposed in this pull request?

We should not break the assumption that the length of the allocated byte array is word rounded:
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java#L170

So we want to use `Integer.MAX_VALUE - 15` instead of `Integer.MAX_VALUE - 8` as the upper bound of an allocated byte array.

cc: srowen gatorsmile

## How was this patch tested?

Since the Spark unit test JVM has less than 1GB of heap, the test code here runs as a submit job so that it can run on a JVM with 4GB of memory.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Feng Liu <[email protected]>

Closes apache#19460 from liufengdb/fix_array_max.
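The arithmetic behind the new constant is easy to check in isolation. The snippet below (a standalone sketch, not part of the patch) verifies that the old cap `Integer.MAX_VALUE - 8` is not a multiple of 8, while `Integer.MAX_VALUE - 15` is the largest multiple of 8 that still stays at or below the JVM-safe allocation limit:

```java
public class ArrayMaxCheck {
  public static void main(String[] args) {
    int oldCap = Integer.MAX_VALUE - 8;   // 2147483639, the previous ARRAY_MAX
    int newCap = Integer.MAX_VALUE - 15;  // 2147483632, the new MAX_ROUNDED_ARRAY_LENGTH

    System.out.println(oldCap % 8); // 7 -> a buffer of this length is not word rounded
    System.out.println(newCap % 8); // 0 -> word rounded, so UnsafeRow's assumption holds

    // Rounding the old cap up to the next word overshoots the JVM-safe limit:
    int roundedOldCap = oldCap + (8 - oldCap % 8); // 2147483640
    System.out.println(roundedOldCap > Integer.MAX_VALUE - 8); // true
  }
}
```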
1 parent 71c2b81 commit bebd2e1

File tree

7 files changed: +124 -35

common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java

Lines changed: 7 additions & 0 deletions
```diff
@@ -40,6 +40,13 @@ public static int roundNumberOfBytesToNearestWord(int numBytes) {
     }
   }
 
+  // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat smaller.
+  // Be conservative and lower the cap a little.
+  // Refer to "http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/tip/src/share/classes/java/util/ArrayList.java#l229"
+  // This value is word rounded. Use this value if the allocated byte arrays are used to store other
+  // types rather than bytes.
+  public static int MAX_ROUNDED_ARRAY_LENGTH = Integer.MAX_VALUE - 15;
+
   private static final boolean unaligned = Platform.unaligned();
   /**
    * Optimized byte array equality check for byte arrays.
```
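For context, the helper whose tail appears in this hunk rounds a byte count up to the next multiple of 8; only its closing braces are visible in the diff, so the body below is a sketch of that behavior (an assumption, not the verbatim Spark source). The useful property is that the new cap is a fixed point of the rounding:

```java
final class WordRounding {
  // Sketch of the helper named in the hunk header above (assumed body).
  static int roundNumberOfBytesToNearestWord(int numBytes) {
    int remainder = numBytes & 0x07; // equivalent to numBytes % 8
    return remainder == 0 ? numBytes : numBytes + (8 - remainder);
  }

  public static void main(String[] args) {
    // MAX_ROUNDED_ARRAY_LENGTH is already word rounded, so rounding it is a no-op:
    int cap = Integer.MAX_VALUE - 15;
    System.out.println(roundNumberOfBytesToNearestWord(cap) == cap); // true
  }
}
```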

core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java

Lines changed: 3 additions & 3 deletions
```diff
@@ -17,6 +17,8 @@
 
 package org.apache.spark.unsafe.map;
 
+import org.apache.spark.unsafe.array.ByteArrayMethods;
+
 /**
  * Interface that defines how we can grow the size of a hash map when it is over a threshold.
  */
@@ -31,9 +33,7 @@ public interface HashMapGrowthStrategy {
 
   class Doubling implements HashMapGrowthStrategy {
 
-    // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat
-    // smaller. Be conservative and lower the cap a little.
-    private static final int ARRAY_MAX = Integer.MAX_VALUE - 8;
+    private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH;
 
     @Override
     public int nextCapacity(int currentCapacity) {
```
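The body of `nextCapacity` is cut off by the hunk. A minimal sketch of a doubling-with-cap policy consistent with the `Doubling` name (an assumption, not the verbatim Spark code) shows how the new `ARRAY_MAX` takes effect:

```java
final class DoublingSketch {
  private static final int ARRAY_MAX = Integer.MAX_VALUE - 15; // ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH

  // Assumed shape of Doubling.nextCapacity: double the capacity, clamp at
  // ARRAY_MAX, and guard against int overflow when doubling.
  static int nextCapacity(int currentCapacity) {
    int doubled = currentCapacity * 2;
    return (doubled > 0) ? Math.min(doubled, ARRAY_MAX) : ARRAY_MAX;
  }

  public static void main(String[] args) {
    System.out.println(nextCapacity(64));                     // 128
    System.out.println(nextCapacity(Integer.MAX_VALUE / 2));  // clamps to ARRAY_MAX
  }
}
```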

core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala

Lines changed: 3 additions & 3 deletions
```diff
@@ -19,6 +19,8 @@ package org.apache.spark.util.collection
 
 import java.util.Comparator
 
+import org.apache.spark.unsafe.Platform
+import org.apache.spark.unsafe.array.ByteArrayMethods
 import org.apache.spark.util.collection.WritablePartitionedPairCollection._
 
 /**
@@ -96,7 +98,5 @@ private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64)
 }
 
 private object PartitionedPairBuffer {
-  // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat
-  // smaller. Be conservative and lower the cap a little.
-  val MAXIMUM_CAPACITY: Int = (Int.MaxValue - 8) / 2
+  val MAXIMUM_CAPACITY: Int = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 2
 }
```
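The halving reflects the buffer's layout: each logical entry occupies two slots in a single underlying object array, one for the (partition, key) pair and one for the value (an assumption based on the class's pairwise storage), so the logical capacity can be at most half the maximum array length:

```java
public class PairBufferCapacity {
  public static void main(String[] args) {
    // Assumed layout: one (partition, key) slot plus one value slot per entry.
    int maxArrayLength = Integer.MAX_VALUE - 15; // ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
    int maxCapacity = maxArrayLength / 2;        // largest number of (key, value) entries
    System.out.println(maxCapacity);             // 1073741816
  }
}
```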

core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala

Lines changed: 28 additions & 24 deletions
```diff
@@ -100,6 +100,8 @@ class SparkSubmitSuite
     with TimeLimits
     with TestPrematureExit {
 
+  import SparkSubmitSuite._
+
   override def beforeEach() {
     super.beforeEach()
     System.setProperty("spark.testing", "true")
@@ -974,30 +976,6 @@ class SparkSubmitSuite
     }
   }
 
-  // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
-  private def runSparkSubmit(args: Seq[String]): Unit = {
-    val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
-    val sparkSubmitFile = if (Utils.isWindows) {
-      new File("..\\bin\\spark-submit.cmd")
-    } else {
-      new File("../bin/spark-submit")
-    }
-    val process = Utils.executeCommand(
-      Seq(sparkSubmitFile.getCanonicalPath) ++ args,
-      new File(sparkHome),
-      Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome))
-
-    try {
-      val exitCode = failAfter(60 seconds) { process.waitFor() }
-      if (exitCode != 0) {
-        fail(s"Process returned with exit code $exitCode. See the log4j logs for more detail.")
-      }
-    } finally {
-      // Ensure we still kill the process in case it timed out
-      process.destroy()
-    }
-  }
-
   private def forConfDir(defaults: Map[String, String]) (f: String => Unit) = {
     val tmpDir = Utils.createTempDir()
 
@@ -1020,6 +998,32 @@ class SparkSubmitSuite
   }
 }
 
+object SparkSubmitSuite extends SparkFunSuite with TimeLimits {
+  // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
+  def runSparkSubmit(args: Seq[String], root: String = ".."): Unit = {
+    val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
+    val sparkSubmitFile = if (Utils.isWindows) {
+      new File(s"$root\\bin\\spark-submit.cmd")
+    } else {
+      new File(s"$root/bin/spark-submit")
+    }
+    val process = Utils.executeCommand(
+      Seq(sparkSubmitFile.getCanonicalPath) ++ args,
+      new File(sparkHome),
+      Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome))
+
+    try {
+      val exitCode = failAfter(60 seconds) { process.waitFor() }
+      if (exitCode != 0) {
+        fail(s"Process returned with exit code $exitCode. See the log4j logs for more detail.")
+      }
+    } finally {
+      // Ensure we still kill the process in case it timed out
+      process.destroy()
+    }
+  }
+}
+
 object JarCreationTest extends Logging {
   def main(args: Array[String]) {
     Utils.configTestLog4j("INFO")
```

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolder.java

Lines changed: 3 additions & 4 deletions
```diff
@@ -19,6 +19,7 @@
 
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
 import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
 
 /**
  * A helper class to manage the data buffer for an unsafe row. The data buffer can grow and
@@ -36,9 +37,7 @@
  */
 public class BufferHolder {
 
-  // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat
-  // smaller. Be conservative and lower the cap a little.
-  private static final int ARRAY_MAX = Integer.MAX_VALUE - 8;
+  private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH;
 
   public byte[] buffer;
   public int cursor = Platform.BYTE_ARRAY_OFFSET;
@@ -51,7 +50,7 @@ public BufferHolder(UnsafeRow row) {
 
   public BufferHolder(UnsafeRow row, int initialSize) {
     int bitsetWidthInBytes = UnsafeRow.calculateBitSetWidthInBytes(row.numFields());
-    if (row.numFields() > (Integer.MAX_VALUE - initialSize - bitsetWidthInBytes) / 8) {
+    if (row.numFields() > (ARRAY_MAX - initialSize - bitsetWidthInBytes) / 8) {
       throw new UnsupportedOperationException(
         "Cannot create BufferHolder for input UnsafeRow because there are " +
         "too many fields (number of fields: " + row.numFields() + ")");
```
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala

Lines changed: 78 additions & 0 deletions
```diff
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.scalatest.{BeforeAndAfterEach, Matchers}
+import org.scalatest.concurrent.Timeouts
+
+import org.apache.spark.{SparkFunSuite, TestUtils}
+import org.apache.spark.deploy.SparkSubmitSuite
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.unsafe.array.ByteArrayMethods
+import org.apache.spark.util.ResetSystemProperties
+
+// A test for growing the buffer holder to nearly 2GB. Due to the heap size limitation of the
+// Spark unit test JVM, the actual test code runs as a submit job.
+class BufferHolderSparkSubmitSuite
+  extends SparkFunSuite
+    with Matchers
+    with BeforeAndAfterEach
+    with ResetSystemProperties
+    with Timeouts {
+
+  test("SPARK-22222: Buffer holder should be able to allocate memory larger than 1GB") {
+    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
+
+    val argsForSparkSubmit = Seq(
+      "--class", BufferHolderSparkSubmitSuite.getClass.getName.stripSuffix("$"),
+      "--name", "SPARK-22222",
+      "--master", "local-cluster[2,1,1024]",
+      "--driver-memory", "4g",
+      "--conf", "spark.ui.enabled=false",
+      "--conf", "spark.master.rest.enabled=false",
+      "--conf", "spark.driver.extraJavaOptions=-ea",
+      unusedJar.toString)
+    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
+  }
+}
+
+object BufferHolderSparkSubmitSuite {
+
+  def main(args: Array[String]): Unit = {
+
+    val ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
+
+    val holder = new BufferHolder(new UnsafeRow(1000))
+
+    holder.reset()
+    holder.grow(roundToWord(ARRAY_MAX / 2))
+
+    holder.reset()
+    holder.grow(roundToWord(ARRAY_MAX / 2 + 8))
+
+    holder.reset()
+    holder.grow(roundToWord(Integer.MAX_VALUE / 2))
+
+    holder.reset()
+    holder.grow(roundToWord(Integer.MAX_VALUE))
+  }
+
+  private def roundToWord(len: Int): Int = {
+    ByteArrayMethods.roundNumberOfBytesToNearestWord(len)
+  }
+}
```
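What the submit job exercises: `BufferHolder.grow` doubles the buffer until the required length approaches the cap and then clamps to `ARRAY_MAX`, which is why the test grows past `ARRAY_MAX / 2`. A sketch of that growth policy (an assumed shape, not the verbatim Spark method):

```java
final class GrowthPolicySketch {
  static final int ARRAY_MAX = Integer.MAX_VALUE - 15; // ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH

  // Assumed shape of the growth policy the test exercises: reject requests over
  // the cap, double otherwise, and clamp to ARRAY_MAX near the top.
  static int nextBufferLength(int requiredLength) {
    if (requiredLength > ARRAY_MAX) {
      throw new UnsupportedOperationException(
          "Cannot grow buffer beyond " + ARRAY_MAX + " bytes");
    }
    return requiredLength < ARRAY_MAX / 2 ? requiredLength * 2 : ARRAY_MAX;
  }

  public static void main(String[] args) {
    // Both requests land at or past the midpoint, so both clamp to the cap:
    System.out.println(nextBufferLength(ARRAY_MAX / 2));     // 2147483632
    System.out.println(nextBufferLength(ARRAY_MAX / 2 + 8)); // 2147483632
  }
}
```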

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java

Lines changed: 2 additions & 1 deletion
```diff
@@ -23,6 +23,7 @@
 
 import org.apache.spark.sql.internal.SQLConf;
 import org.apache.spark.sql.types.*;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.types.UTF8String;
 
 /**
@@ -595,7 +596,7 @@ public final int appendStruct(boolean isNull) {
    * Upper limit for the maximum capacity for this column.
    */
   @VisibleForTesting
-  protected int MAX_CAPACITY = Integer.MAX_VALUE - 8;
+  protected int MAX_CAPACITY = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH;
 
   /**
    * Number of nulls in this column. This is an optimization for the reader, to skip NULL checks.
```
