-
Notifications
You must be signed in to change notification settings - Fork 270
fix: Remove fallback for maps containing complex types #2943
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1805d91
c69316a
a130f7e
db3e78e
fbc93c5
28eff64
21ef95e
5a6ab65
d07021b
a886f03
20c98c9
997f14a
012cbad
43e28b2
a85c9e4
fb5e7e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -592,34 +592,12 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] with Com | |
| val partitionSchemaSupported = | ||
| typeChecker.isSchemaSupported(partitionSchema, fallbackReasons) | ||
|
|
||
| def hasUnsupportedType(dataType: DataType): Boolean = { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This check is no longer needed. |
||
| dataType match { | ||
| case s: StructType => s.exists(field => hasUnsupportedType(field.dataType)) | ||
| case a: ArrayType => hasUnsupportedType(a.elementType) | ||
| case m: MapType => | ||
| // maps containing complex types are not supported | ||
| isComplexType(m.keyType) || isComplexType(m.valueType) || | ||
| hasUnsupportedType(m.keyType) || hasUnsupportedType(m.valueType) | ||
| case dt if isStringCollationType(dt) => true | ||
| case _ => false | ||
| } | ||
| } | ||
|
|
||
| val knownIssues = | ||
| scanExec.requiredSchema.exists(field => hasUnsupportedType(field.dataType)) || | ||
| partitionSchema.exists(field => hasUnsupportedType(field.dataType)) | ||
|
|
||
| if (knownIssues) { | ||
| fallbackReasons += "Schema contains data types that are not supported by " + | ||
| s"$SCAN_NATIVE_ICEBERG_COMPAT" | ||
| } | ||
|
|
||
| val cometExecEnabled = COMET_EXEC_ENABLED.get() | ||
| if (!cometExecEnabled) { | ||
| fallbackReasons += s"$SCAN_NATIVE_ICEBERG_COMPAT requires ${COMET_EXEC_ENABLED.key}=true" | ||
| } | ||
|
|
||
| if (cometExecEnabled && schemaSupported && partitionSchemaSupported && !knownIssues && | ||
| if (cometExecEnabled && schemaSupported && partitionSchemaSupported && | ||
| fallbackReasons.isEmpty) { | ||
| logInfo(s"Auto scan mode selecting $SCAN_NATIVE_ICEBERG_COMPAT") | ||
| SCAN_NATIVE_ICEBERG_COMPAT | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,12 +35,15 @@ import org.apache.spark.sql.execution.SparkPlan | |
| import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper | ||
| import org.apache.spark.sql.internal.SQLConf | ||
|
|
||
| import org.apache.comet.testing.{DataGenOptions, ParquetGenerator, SchemaGenOptions} | ||
| import org.apache.comet.testing.{DataGenOptions, FuzzDataGenerator, ParquetGenerator, SchemaGenOptions} | ||
|
|
||
| class CometFuzzTestBase extends CometTestBase with AdaptiveSparkPlanHelper { | ||
|
|
||
| var filename: String = null | ||
|
|
||
| /** Filename for data file with deeply nested complex types */ | ||
| var complexTypesFilename: String = null | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be great to document whether this filename refers to an input, an output, or a temporary location.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I will improve the comments in my next PR later today. |
||
|
|
||
| /** | ||
| * We use Asia/Kathmandu because it has a non-zero number of minutes as the offset, so is an | ||
| * interesting edge case. Also, this timezone tends to be different from the default system | ||
|
|
@@ -53,18 +56,20 @@ class CometFuzzTestBase extends CometTestBase with AdaptiveSparkPlanHelper { | |
| override def beforeAll(): Unit = { | ||
| super.beforeAll() | ||
| val tempDir = System.getProperty("java.io.tmpdir") | ||
| filename = s"$tempDir/CometFuzzTestSuite_${System.currentTimeMillis()}.parquet" | ||
| val random = new Random(42) | ||
| val dataGenOptions = DataGenOptions( | ||
| generateNegativeZero = false, | ||
| // override base date due to known issues with experimental scans | ||
| baseDate = new SimpleDateFormat("YYYY-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime) | ||
|
|
||
| // generate Parquet file with primitives, structs, and arrays, but no maps | ||
| // and no nested complex types | ||
| filename = s"$tempDir/CometFuzzTestSuite_${System.currentTimeMillis()}.parquet" | ||
| withSQLConf( | ||
| CometConf.COMET_ENABLED.key -> "false", | ||
| SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) { | ||
| val schemaGenOptions = | ||
| SchemaGenOptions(generateArray = true, generateStruct = true) | ||
| val dataGenOptions = DataGenOptions( | ||
| generateNegativeZero = false, | ||
| // override base date due to known issues with experimental scans | ||
| baseDate = | ||
| new SimpleDateFormat("YYYY-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime) | ||
| ParquetGenerator.makeParquetFile( | ||
| random, | ||
| spark, | ||
|
|
@@ -73,6 +78,30 @@ class CometFuzzTestBase extends CometTestBase with AdaptiveSparkPlanHelper { | |
| schemaGenOptions, | ||
| dataGenOptions) | ||
| } | ||
|
|
||
| // generate Parquet file with complex nested types | ||
| complexTypesFilename = | ||
| s"$tempDir/CometFuzzTestSuite_nested_${System.currentTimeMillis()}.parquet" | ||
| withSQLConf( | ||
| CometConf.COMET_ENABLED.key -> "false", | ||
| SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) { | ||
| val schemaGenOptions = | ||
| SchemaGenOptions(generateArray = true, generateStruct = true, generateMap = true) | ||
| val schema = FuzzDataGenerator.generateNestedSchema( | ||
| random, | ||
| numCols = 10, | ||
| minDepth = 2, | ||
| maxDepth = 4, | ||
| options = schemaGenOptions) | ||
| ParquetGenerator.makeParquetFile( | ||
| random, | ||
| spark, | ||
| complexTypesFilename, | ||
| schema, | ||
| 1000, | ||
| dataGenOptions) | ||
| } | ||
|
|
||
| } | ||
|
|
||
| protected override def afterAll(): Unit = { | ||
|
|
@@ -84,6 +113,7 @@ class CometFuzzTestBase extends CometTestBase with AdaptiveSparkPlanHelper { | |
| pos: Position): Unit = { | ||
| Seq("native", "jvm").foreach { shuffleMode => | ||
| Seq( | ||
| CometConf.SCAN_AUTO, | ||
| CometConf.SCAN_NATIVE_COMET, | ||
| CometConf.SCAN_NATIVE_DATAFUSION, | ||
| CometConf.SCAN_NATIVE_ICEBERG_COMPAT).foreach { scanImpl => | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType | |
| import org.apache.spark.sql.types._ | ||
|
|
||
| import org.apache.comet.DataTypeSupport.isComplexType | ||
| import org.apache.comet.testing.{DataGenOptions, ParquetGenerator, SchemaGenOptions} | ||
| import org.apache.comet.testing.{DataGenOptions, ParquetGenerator} | ||
| import org.apache.comet.testing.FuzzDataGenerator.{doubleNaNLiteral, floatNaNLiteral} | ||
|
|
||
| class CometFuzzTestSuite extends CometFuzzTestBase { | ||
|
|
@@ -44,6 +44,17 @@ class CometFuzzTestSuite extends CometFuzzTestBase { | |
| } | ||
| } | ||
|
|
||
| test("select * with deeply nested complex types") { | ||
| val df = spark.read.parquet(complexTypesFilename) | ||
| df.createOrReplaceTempView("t1") | ||
| val sql = "SELECT * FROM t1" | ||
| if (CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_COMET) { | ||
| checkSparkAnswerAndOperator(sql) | ||
| } else { | ||
| checkSparkAnswer(sql) | ||
| } | ||
| } | ||
|
|
||
| test("select * with limit") { | ||
| val df = spark.read.parquet(filename) | ||
| df.createOrReplaceTempView("t1") | ||
|
|
@@ -179,7 +190,7 @@ class CometFuzzTestSuite extends CometFuzzTestBase { | |
| case CometConf.SCAN_NATIVE_COMET => | ||
| // native_comet does not support reading complex types | ||
| 0 | ||
| case CometConf.SCAN_NATIVE_ICEBERG_COMPAT | CometConf.SCAN_NATIVE_DATAFUSION => | ||
| case _ => | ||
| CometConf.COMET_SHUFFLE_MODE.get() match { | ||
| case "jvm" => | ||
| 1 | ||
|
|
@@ -202,7 +213,7 @@ class CometFuzzTestSuite extends CometFuzzTestBase { | |
| case CometConf.SCAN_NATIVE_COMET => | ||
| // native_comet does not support reading complex types | ||
| 0 | ||
| case CometConf.SCAN_NATIVE_ICEBERG_COMPAT | CometConf.SCAN_NATIVE_DATAFUSION => | ||
| case _ => | ||
| CometConf.COMET_SHUFFLE_MODE.get() match { | ||
| case "jvm" => | ||
| 1 | ||
|
|
@@ -272,12 +283,7 @@ class CometFuzzTestSuite extends CometFuzzTestBase { | |
| } | ||
|
|
||
| private def testParquetTemporalTypes( | ||
| outputTimestampType: ParquetOutputTimestampType.Value, | ||
| generateArray: Boolean = true, | ||
| generateStruct: Boolean = true): Unit = { | ||
|
|
||
| val schemaGenOptions = | ||
| SchemaGenOptions(generateArray = generateArray, generateStruct = generateStruct) | ||
| outputTimestampType: ParquetOutputTimestampType.Value): Unit = { | ||
|
|
||
| val dataGenOptions = DataGenOptions(generateNegativeZero = false) | ||
|
|
||
|
|
@@ -287,12 +293,23 @@ class CometFuzzTestSuite extends CometFuzzTestBase { | |
| CometConf.COMET_ENABLED.key -> "false", | ||
| SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> outputTimestampType.toString, | ||
| SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) { | ||
|
|
||
| // TODO test with MapType | ||
| // https://github.com/apache/datafusion-comet/issues/2945 | ||
| val schema = StructType( | ||
| Seq( | ||
| StructField("c0", DataTypes.DateType), | ||
| StructField("c1", DataTypes.createArrayType(DataTypes.DateType)), | ||
| StructField( | ||
| "c2", | ||
| DataTypes.createStructType(Array(StructField("c3", DataTypes.DateType)))))) | ||
|
|
||
| ParquetGenerator.makeParquetFile( | ||
| random, | ||
| spark, | ||
| filename.toString, | ||
| schema, | ||
| 100, | ||
| schemaGenOptions, | ||
| dataGenOptions) | ||
| } | ||
|
|
||
|
|
@@ -309,18 +326,10 @@ class CometFuzzTestSuite extends CometFuzzTestBase { | |
|
|
||
| val df = spark.read.parquet(filename.toString) | ||
| df.createOrReplaceTempView("t1") | ||
|
|
||
| def hasTemporalType(t: DataType): Boolean = t match { | ||
| case DataTypes.DateType | DataTypes.TimestampType | | ||
| DataTypes.TimestampNTZType => | ||
| true | ||
| case t: StructType => t.exists(f => hasTemporalType(f.dataType)) | ||
| case t: ArrayType => hasTemporalType(t.elementType) | ||
|
Comment on lines
-317
to
-318
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was missing a check for [comment truncated in page capture] |
||
| case _ => false | ||
| } | ||
|
|
||
| val columns = | ||
| df.schema.fields.filter(f => hasTemporalType(f.dataType)).map(_.name) | ||
| df.schema.fields | ||
| .filter(f => DataTypeSupport.hasTemporalType(f.dataType)) | ||
| .map(_.name) | ||
|
|
||
| for (col <- columns) { | ||
| checkSparkAnswer(s"SELECT $col FROM t1 ORDER BY $col") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wondering if an interval can also be considered temporal? Probably not — it is more like a duration rather than representing any point in time.