Commit e8866f9

rxin authored and hvanhovell committed
[SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics
## What changes were proposed in this pull request?

This patch reduces the default element count estimate for arrays and maps from 100 to 1. The problem with 100 is that it compounds when types are nested: for an array of maps, 100 * 100 becomes the default size. That sounds like mere overestimation, which is usually less harmful than underestimation. However, because Project estimates its output size by scaling the child's size by the ratio of the new estimated column sizes to the old estimated column sizes, this overestimation can turn into underestimation. In this case it is generally safer to assume 1 element by default.

## How was this patch tested?

This should be covered by existing tests.

Author: Reynold Xin <[email protected]>

Closes apache#16274 from rxin/SPARK-18853.

(cherry picked from commit 5d79947)
Signed-off-by: Herman van Hovell <[email protected]>
1 parent af12a21 commit e8866f9
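
To make the mechanism concrete, here is a rough arithmetic sketch of a ratio-based Project size estimate. It is a simplification for illustration only, not the actual Catalyst code; the 1 GB child size and the two-column schema (one int column, one array<map<int,string>> column) are invented numbers.

object ProjectEstimateSketch extends App {
  val childSizeInBytes: Long = 1L << 30          // assumed child relation size: 1 GB

  // Per-row default sizes: an int column vs. a nested array<map<int,string>> column.
  val intColSize = 4
  val oldNestedColSize = 100 * (100 * (4 + 20))  // old defaults: 100 elements per level = 240000
  val newNestedColSize = 1 * (1 * (4 + 20))      // new defaults: 1 element per level = 24

  // The Project keeps only the int column, so this sketch scales the child's size
  // by (estimated output row size) / (estimated child row size).
  def projectEstimate(nestedColSize: Int): Long =
    childSizeInBytes * intColSize / (intColSize + nestedColSize)

  println(projectEstimate(oldNestedColSize))     // ~18 KB: the inflated nested default makes
                                                 // the kept column look negligible (underestimate)
  println(projectEstimate(newNestedColSize))     // ~153 MB: a far more plausible estimate
}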

File tree

3 files changed (+13, -13 lines)


sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala

Lines changed: 3 additions & 3 deletions
@@ -78,10 +78,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT
       ("containsNull" -> containsNull)

   /**
-   * The default size of a value of the ArrayType is 100 * the default size of the element type.
-   * (We assume that there are 100 elements).
+   * The default size of a value of the ArrayType is the default size of the element type.
+   * We assume that there is only 1 element on average in an array. See SPARK-18853.
    */
-  override def defaultSize: Int = 100 * elementType.defaultSize
+  override def defaultSize: Int = 1 * elementType.defaultSize

   override def simpleString: String = s"array<${elementType.simpleString}>"

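The practical effect, assuming a Scala REPL with spark-catalyst on the classpath (a sketch, not part of the patch):

import org.apache.spark.sql.types._

ArrayType(DoubleType).defaultSize             // 8: just the element's default size (was 100 * 8 = 800)
ArrayType(ArrayType(DoubleType)).defaultSize  // 8: nesting no longer multiplies by 100 per level (was 80000)
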
sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala

Lines changed: 3 additions & 3 deletions
@@ -56,10 +56,10 @@ case class MapType(

   /**
    * The default size of a value of the MapType is
-   * 100 * (the default size of the key type + the default size of the value type).
-   * (We assume that there are 100 elements).
+   * (the default size of the key type + the default size of the value type).
+   * We assume that there is only 1 element on average in a map. See SPARK-18853.
    */
-  override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize)
+  override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize)

   override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>"

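Similarly for maps, a map now contributes one key plus one value at their default sizes (again a REPL sketch, not part of the patch):

import org.apache.spark.sql.types._

MapType(IntegerType, StringType).defaultSize  // 4 + 20 = 24 (was 100 * (4 + 20) = 2400)
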
sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala

Lines changed: 7 additions & 7 deletions
@@ -253,7 +253,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkDataTypeJsonRepr(structType)

   def checkDefaultSize(dataType: DataType, expectedDefaultSize: Int): Unit = {
-    test(s"Check the default size of ${dataType}") {
+    test(s"Check the default size of $dataType") {
       assert(dataType.defaultSize === expectedDefaultSize)
     }
   }

@@ -272,18 +272,18 @@ class DataTypeSuite extends SparkFunSuite {
   checkDefaultSize(TimestampType, 8)
   checkDefaultSize(StringType, 20)
   checkDefaultSize(BinaryType, 100)
-  checkDefaultSize(ArrayType(DoubleType, true), 800)
-  checkDefaultSize(ArrayType(StringType, false), 2000)
-  checkDefaultSize(MapType(IntegerType, StringType, true), 2400)
-  checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 80400)
-  checkDefaultSize(structType, 812)
+  checkDefaultSize(ArrayType(DoubleType, true), 8)
+  checkDefaultSize(ArrayType(StringType, false), 20)
+  checkDefaultSize(MapType(IntegerType, StringType, true), 24)
+  checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 12)
+  checkDefaultSize(structType, 20)

   def checkEqualsIgnoreCompatibleNullability(
       from: DataType,
       to: DataType,
       expected: Boolean): Unit = {
     val testName =
-      s"equalsIgnoreCompatibleNullability: (from: ${from}, to: ${to})"
+      s"equalsIgnoreCompatibleNullability: (from: $from, to: $to)"
     test(testName) {
       assert(DataType.equalsIgnoreCompatibleNullability(from, to) === expected)
     }

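The updated expectations follow directly from the new formulas; the nested map-of-array case shows the compounding that the commit message describes (a REPL sketch, not part of the patch):

import org.apache.spark.sql.types._

// New: one map entry whose value is an array assumed to hold one double.
MapType(IntegerType, ArrayType(DoubleType)).defaultSize  // 4 + (1 * 8) = 12
// Old defaults: 100 * (4 + 100 * 8) = 80400 -- two levels of the 100-element assumption.
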