Skip to content

Commit de46df5

Browse files
ferdonline authored and gatorsmile committed
[SPARK-23997][SQL] Configurable maximum number of buckets
## What changes were proposed in this pull request? This PR implements the possibility of the user to override the maximum number of buckets when saving to a table. Currently the limit is a hard-coded 100k, which might be insufficient for large workloads. A new configuration entry is proposed: `spark.sql.bucketing.maxBuckets`, which defaults to the previous 100k. ## How was this patch tested? Added unit tests in the following spark.sql test suites: - CreateTableAsSelectSuite - BucketedWriteSuite Author: Fernando Pereira <[email protected]> Closes apache#21087 from ferdonline/enh/configurable_bucket_limit.
1 parent 1149c4e commit de46df5

File tree

4 files changed

+76
-8
lines changed

4 files changed

+76
-8
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
3232
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils
3333
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
3434
import org.apache.spark.sql.catalyst.util.quoteIdentifier
35+
import org.apache.spark.sql.internal.SQLConf
3536
import org.apache.spark.sql.types._
3637

3738

@@ -173,9 +174,12 @@ case class BucketSpec(
173174
numBuckets: Int,
174175
bucketColumnNames: Seq[String],
175176
sortColumnNames: Seq[String]) {
176-
if (numBuckets <= 0 || numBuckets >= 100000) {
177+
def conf: SQLConf = SQLConf.get
178+
179+
if (numBuckets <= 0 || numBuckets > conf.bucketingMaxBuckets) {
177180
throw new AnalysisException(
178-
s"Number of buckets should be greater than 0 but less than 100000. Got `$numBuckets`")
181+
s"Number of buckets should be greater than 0 but less than bucketing.maxBuckets " +
182+
s"(`${conf.bucketingMaxBuckets}`). Got `$numBuckets`")
179183
}
180184

181185
override def toString: String = {

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -674,6 +674,12 @@ object SQLConf {
674674
.booleanConf
675675
.createWithDefault(true)
676676

677+
val BUCKETING_MAX_BUCKETS = buildConf("spark.sql.sources.bucketing.maxBuckets")
678+
.doc("The maximum number of buckets allowed. Defaults to 100000")
679+
.intConf
680+
.checkValue(_ > 0, "the value of spark.sql.sources.bucketing.maxBuckets must be larger than 0")
681+
.createWithDefault(100000)
682+
677683
val CROSS_JOINS_ENABLED = buildConf("spark.sql.crossJoin.enabled")
678684
.doc("When false, we will throw an error if a query contains a cartesian product without " +
679685
"explicit CROSS JOIN syntax.")
@@ -1803,6 +1809,8 @@ class SQLConf extends Serializable with Logging {
18031809

18041810
def bucketingEnabled: Boolean = getConf(SQLConf.BUCKETING_ENABLED)
18051811

1812+
def bucketingMaxBuckets: Int = getConf(SQLConf.BUCKETING_MAX_BUCKETS)
1813+
18061814
def dataFrameSelfJoinAutoResolveAmbiguity: Boolean =
18071815
getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY)
18081816

sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala

Lines changed: 29 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,10 @@
1818
package org.apache.spark.sql.sources
1919

2020
import java.io.File
21-
import java.net.URI
2221

2322
import org.apache.spark.sql.{AnalysisException, QueryTest}
23+
import org.apache.spark.sql.catalyst.TableIdentifier
24+
import org.apache.spark.sql.catalyst.catalog.BucketSpec
2425
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
2526
import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
2627
import org.apache.spark.sql.execution.datasources.BucketingUtils
@@ -48,16 +49,40 @@ abstract class BucketedWriteSuite extends QueryTest with SQLTestUtils {
4849
intercept[AnalysisException](df.write.bucketBy(2, "k").saveAsTable("tt"))
4950
}
5051

51-
test("numBuckets be greater than 0 but less than 100000") {
52+
test("numBuckets be greater than 0 but less/eq than default bucketing.maxBuckets (100000)") {
5253
val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
5354

54-
Seq(-1, 0, 100000).foreach(numBuckets => {
55+
Seq(-1, 0, 100001).foreach(numBuckets => {
5556
val e = intercept[AnalysisException](df.write.bucketBy(numBuckets, "i").saveAsTable("tt"))
5657
assert(
57-
e.getMessage.contains("Number of buckets should be greater than 0 but less than 100000"))
58+
e.getMessage.contains("Number of buckets should be greater than 0 but less than"))
5859
})
5960
}
6061

62+
test("numBuckets be greater than 0 but less/eq than overridden bucketing.maxBuckets (200000)") {
63+
val maxNrBuckets: Int = 200000
64+
val catalog = spark.sessionState.catalog
65+
66+
withSQLConf("spark.sql.sources.bucketing.maxBuckets" -> maxNrBuckets.toString) {
67+
// within the new limit
68+
Seq(100001, maxNrBuckets).foreach(numBuckets => {
69+
withTable("t") {
70+
df.write.bucketBy(numBuckets, "i").saveAsTable("t")
71+
val table = catalog.getTableMetadata(TableIdentifier("t"))
72+
assert(table.bucketSpec == Option(BucketSpec(numBuckets, Seq("i"), Seq())))
73+
}
74+
})
75+
76+
// over the new limit
77+
withTable("t") {
78+
val e = intercept[AnalysisException](
79+
df.write.bucketBy(maxNrBuckets + 1, "i").saveAsTable("t"))
80+
assert(
81+
e.getMessage.contains("Number of buckets should be greater than 0 but less than"))
82+
}
83+
}
84+
}
85+
6186
test("specify sorting columns without bucketing columns") {
6287
val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
6388
val e = intercept[AnalysisException] {

sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala

Lines changed: 33 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ class CreateTableAsSelectSuite
225225

226226
test("create table using as select - with invalid number of buckets") {
227227
withTable("t") {
228-
Seq(0, 100000).foreach(numBuckets => {
228+
Seq(0, 100001).foreach(numBuckets => {
229229
val e = intercept[AnalysisException] {
230230
sql(
231231
s"""
@@ -236,11 +236,42 @@ class CreateTableAsSelectSuite
236236
""".stripMargin
237237
)
238238
}.getMessage
239-
assert(e.contains("Number of buckets should be greater than 0 but less than 100000"))
239+
assert(e.contains("Number of buckets should be greater than 0 but less than"))
240240
})
241241
}
242242
}
243243

244+
test("create table using as select - with overriden max number of buckets") {
245+
def createTableSql(numBuckets: Int): String =
246+
s"""
247+
|CREATE TABLE t USING PARQUET
248+
|OPTIONS (PATH '${path.toURI}')
249+
|CLUSTERED BY (a) SORTED BY (b) INTO $numBuckets BUCKETS
250+
|AS SELECT 1 AS a, 2 AS b
251+
""".stripMargin
252+
253+
val maxNrBuckets: Int = 200000
254+
val catalog = spark.sessionState.catalog
255+
withSQLConf("spark.sql.sources.bucketing.maxBuckets" -> maxNrBuckets.toString) {
256+
257+
// Within the new limit
258+
Seq(100001, maxNrBuckets).foreach(numBuckets => {
259+
withTable("t") {
260+
sql(createTableSql(numBuckets))
261+
val table = catalog.getTableMetadata(TableIdentifier("t"))
262+
assert(table.bucketSpec == Option(BucketSpec(numBuckets, Seq("a"), Seq("b"))))
263+
}
264+
})
265+
266+
// Over the new limit
267+
withTable("t") {
268+
val e = intercept[AnalysisException](sql(createTableSql(maxNrBuckets + 1)))
269+
assert(
270+
e.getMessage.contains("Number of buckets should be greater than 0 but less than "))
271+
}
272+
}
273+
}
274+
244275
test("SPARK-17409: CTAS of decimal calculation") {
245276
withTable("tab2") {
246277
withTempView("tab1") {

0 commit comments

Comments
 (0)