Commit 57a4dca

fix: disable checking for uint_8 and uint_16 if complex type readers are enabled (#1376)
1 parent d885f4a commit 57a4dca

File tree

8 files changed: +296 -81 lines changed

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 8 additions & 0 deletions
@@ -608,6 +608,14 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
+  val COMET_SCAN_ALLOW_INCOMPATIBLE: ConfigEntry[Boolean] =
+    conf("spark.comet.scan.allowIncompatible")
+      .doc(
+        "Comet is not currently fully compatible with Spark for all datatypes. " +
+          s"Set this config to true to allow them anyway. $COMPAT_GUIDE.")
+      .booleanConf
+      .createWithDefault(true)
+
   val COMET_EXPR_ALLOW_INCOMPATIBLE: ConfigEntry[Boolean] =
     conf("spark.comet.expression.allowIncompatible")
      .doc(
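
The new flag defaults to true, so existing behavior is preserved unless a user opts out. A minimal sketch (not part of this commit) of how the flag might be disabled on a local session; the app name, master, and input path are illustrative:

import org.apache.spark.sql.SparkSession

object ScanCompatExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("comet-scan-compat-example") // illustrative name
      .master("local[*]")
      // Default is true; false forces Spark fallback for types (such as
      // Parquet uint_8/uint_16) that Comet does not yet read compatibly.
      .config("spark.comet.scan.allowIncompatible", "false")
      .getOrCreate()

    spark.read.parquet("/tmp/example.parquet").show() // hypothetical path
    spark.stop()
  }
}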

docs/source/user-guide/configs.md

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ Comet provides the following configuration settings.
 | spark.comet.parquet.read.parallel.io.enabled | Whether to enable Comet's parallel reader for Parquet files. The parallel reader reads ranges of consecutive data in a file in parallel. It is faster for large files and row groups but uses more resources. | true |
 | spark.comet.parquet.read.parallel.io.thread-pool.size | The maximum number of parallel threads the parallel reader will use in a single executor. For executors configured with a smaller number of cores, use a smaller number. | 16 |
 | spark.comet.regexp.allowIncompatible | Comet is not currently fully compatible with Spark for all regular expressions. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | false |
+| spark.comet.scan.allowIncompatible | Comet is not currently fully compatible with Spark for all datatypes. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | true |
 | spark.comet.scan.enabled | Whether to enable native scans. When this is turned on, Spark will use Comet to read supported data sources (currently only Parquet is supported natively). Note that to enable native vectorized execution, both this config and 'spark.comet.exec.enabled' need to be enabled. | true |
 | spark.comet.scan.preFetch.enabled | Whether to enable pre-fetching feature of CometScan. | false |
 | spark.comet.scan.preFetch.threadNum | The number of threads running pre-fetching for CometScan. Effective if spark.comet.scan.preFetch.enabled is enabled. Note that more pre-fetching threads means more memory requirement to store pre-fetched row groups. | 2 |

spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala

Lines changed: 9 additions & 0 deletions
@@ -1352,6 +1352,15 @@ object CometSparkSessionExtensions extends Logging {
     org.apache.spark.SPARK_VERSION >= "4.0"
   }
 
+  def isComplexTypeReaderEnabled(conf: SQLConf): Boolean = {
+    CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_ICEBERG_COMPAT ||
+    CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_DATAFUSION
+  }
+
+  def usingDataFusionParquetReader(conf: SQLConf): Boolean = {
+    isComplexTypeReaderEnabled(conf) && !CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get(conf)
+  }
+
   /** Calculates required memory overhead in MB per executor process for Comet. */
   def getCometMemoryOverheadInMiB(sparkConf: SparkConf): Long = {
     // `spark.executor.memory` default value is 1g
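
Read together: usingDataFusionParquetReader is true only when one of the complex-type-capable scan implementations is selected and the user has not allowed the incompatible behavior. A self-contained model of the two predicates (plain values stand in for the SQLConf reads; the scan-impl strings are stand-ins, not verified constants):

object ReaderPredicatesModel {
  // Stand-ins for CometConf.SCAN_NATIVE_ICEBERG_COMPAT / SCAN_NATIVE_DATAFUSION.
  def isComplexTypeReaderEnabled(scanImpl: String): Boolean =
    scanImpl == "native_iceberg_compat" || scanImpl == "native_datafusion"

  def usingDataFusionParquetReader(scanImpl: String, allowIncompatible: Boolean): Boolean =
    isComplexTypeReaderEnabled(scanImpl) && !allowIncompatible

  def main(args: Array[String]): Unit = {
    // With the default allowIncompatible = true, the stricter predicate is false...
    assert(!usingDataFusionParquetReader("native_datafusion", allowIncompatible = true))
    // ...and opting out activates it.
    assert(usingDataFusionParquetReader("native_iceberg_compat", allowIncompatible = false))
    assert(!usingDataFusionParquetReader("other_scan_impl", allowIncompatible = false))
  }
}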

spark/src/main/scala/org/apache/comet/DataTypeSupport.scala

Lines changed: 5 additions & 1 deletion
@@ -19,6 +19,7 @@
 
 package org.apache.comet
 
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
 trait DataTypeSupport {
@@ -35,12 +36,15 @@ trait DataTypeSupport {
   def isAdditionallySupported(dt: DataType): Boolean = false
 
   private def isGloballySupported(dt: DataType): Boolean = dt match {
+    case ByteType | ShortType
+        if CometSparkSessionExtensions.isComplexTypeReaderEnabled(SQLConf.get) &&
+          !CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get() =>
+      false
     case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType |
         BinaryType | StringType | _: DecimalType | DateType | TimestampType =>
       true
     case t: DataType if t.typeName == "timestamp_ntz" =>
       true
-      true
     case _ => false
   }
 
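
Ordering matters here: Scala evaluates match cases top to bottom, so the guarded ByteType | ShortType case must sit above the general case that also lists those types, or it would be unreachable. A minimal illustration of the pattern (toy types and a strict flag standing in for the two config checks, not the Comet code):

sealed trait Dt
case object ByteT extends Dt
case object IntT extends Dt

object GuardOrderingDemo {
  def supported(dt: Dt, strict: Boolean): Boolean = dt match {
    case ByteT if strict => false // must come first, or the next case wins
    case ByteT | IntT    => true
    case _               => false
  }

  def main(args: Array[String]): Unit = {
    assert(!supported(ByteT, strict = true))  // guard fires, type rejected
    assert(supported(ByteT, strict = false))  // falls through to general case
    assert(supported(IntT, strict = true))    // guard never matches IntT
  }
}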

spark/src/test/scala/org/apache/comet/CometCastSuite.scala

Lines changed: 103 additions & 24 deletions
@@ -59,6 +59,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
 
   private val timestampPattern = "0123456789/:T" + whitespaceChars
 
+  lazy val usingDataFusionParquetReader: Boolean =
+    CometSparkSessionExtensions.usingDataFusionParquetReader(conf)
+
   test("all valid cast combinations covered") {
     val names = testNames
 
@@ -145,88 +148,148 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   // CAST from ByteType
 
   test("cast ByteType to BooleanType") {
-    castTest(generateBytes(), DataTypes.BooleanType)
+    castTest(
+      generateBytes(),
+      DataTypes.BooleanType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to ShortType") {
-    castTest(generateBytes(), DataTypes.ShortType)
+    castTest(
+      generateBytes(),
+      DataTypes.ShortType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to IntegerType") {
-    castTest(generateBytes(), DataTypes.IntegerType)
+    castTest(
+      generateBytes(),
+      DataTypes.IntegerType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to LongType") {
-    castTest(generateBytes(), DataTypes.LongType)
+    castTest(
+      generateBytes(),
+      DataTypes.LongType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to FloatType") {
-    castTest(generateBytes(), DataTypes.FloatType)
+    castTest(
+      generateBytes(),
+      DataTypes.FloatType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to DoubleType") {
-    castTest(generateBytes(), DataTypes.DoubleType)
+    castTest(
+      generateBytes(),
+      DataTypes.DoubleType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to DecimalType(10,2)") {
-    castTest(generateBytes(), DataTypes.createDecimalType(10, 2))
+    castTest(
+      generateBytes(),
+      DataTypes.createDecimalType(10, 2),
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ByteType to StringType") {
-    castTest(generateBytes(), DataTypes.StringType)
+    castTest(
+      generateBytes(),
+      DataTypes.StringType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   ignore("cast ByteType to BinaryType") {
-    castTest(generateBytes(), DataTypes.BinaryType)
+    castTest(
+      generateBytes(),
+      DataTypes.BinaryType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   ignore("cast ByteType to TimestampType") {
     // input: -1, expected: 1969-12-31 15:59:59.0, actual: 1969-12-31 15:59:59.999999
-    castTest(generateBytes(), DataTypes.TimestampType)
+    castTest(
+      generateBytes(),
+      DataTypes.TimestampType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   // CAST from ShortType
 
   test("cast ShortType to BooleanType") {
-    castTest(generateShorts(), DataTypes.BooleanType)
+    castTest(
+      generateShorts(),
+      DataTypes.BooleanType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ShortType to ByteType") {
     // https://github.com/apache/datafusion-comet/issues/311
-    castTest(generateShorts(), DataTypes.ByteType)
+    castTest(
+      generateShorts(),
+      DataTypes.ByteType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ShortType to IntegerType") {
-    castTest(generateShorts(), DataTypes.IntegerType)
+    castTest(
+      generateShorts(),
+      DataTypes.IntegerType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ShortType to LongType") {
-    castTest(generateShorts(), DataTypes.LongType)
+    castTest(
+      generateShorts(),
+      DataTypes.LongType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ShortType to FloatType") {
-    castTest(generateShorts(), DataTypes.FloatType)
+    castTest(
+      generateShorts(),
+      DataTypes.FloatType,
+      hasIncompatibleType = usingDataFusionParquetReader)
  }
 
   test("cast ShortType to DoubleType") {
-    castTest(generateShorts(), DataTypes.DoubleType)
+    castTest(
+      generateShorts(),
+      DataTypes.DoubleType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ShortType to DecimalType(10,2)") {
-    castTest(generateShorts(), DataTypes.createDecimalType(10, 2))
+    castTest(
+      generateShorts(),
+      DataTypes.createDecimalType(10, 2),
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   test("cast ShortType to StringType") {
-    castTest(generateShorts(), DataTypes.StringType)
+    castTest(
+      generateShorts(),
+      DataTypes.StringType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   ignore("cast ShortType to BinaryType") {
-    castTest(generateShorts(), DataTypes.BinaryType)
+    castTest(
+      generateShorts(),
+      DataTypes.BinaryType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   ignore("cast ShortType to TimestampType") {
     // input: -1003, expected: 1969-12-31 15:43:17.0, actual: 1969-12-31 15:59:59.998997
-    castTest(generateShorts(), DataTypes.TimestampType)
+    castTest(
+      generateShorts(),
+      DataTypes.TimestampType,
+      hasIncompatibleType = usingDataFusionParquetReader)
   }
 
   // CAST from integer
@@ -1083,7 +1146,11 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  private def castTest(input: DataFrame, toType: DataType, testAnsi: Boolean = true): Unit = {
+  private def castTest(
+      input: DataFrame,
+      toType: DataType,
+      hasIncompatibleType: Boolean = false,
+      testAnsi: Boolean = true): Unit = {
 
     // we now support the TryCast expression in Spark 3.3
     withTempPath { dir =>
@@ -1093,12 +1160,20 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
       withSQLConf((SQLConf.ANSI_ENABLED.key, "false")) {
         // cast() should return null for invalid inputs when ansi mode is disabled
         val df = spark.sql(s"select a, cast(a as ${toType.sql}) from t order by a")
-        checkSparkAnswerAndOperator(df)
+        if (hasIncompatibleType) {
+          checkSparkAnswer(df)
+        } else {
+          checkSparkAnswerAndOperator(df)
+        }
 
         // try_cast() should always return null for invalid inputs
         val df2 =
           spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a")
-        checkSparkAnswerAndOperator(df2)
+        if (hasIncompatibleType) {
+          checkSparkAnswer(df2)
+        } else {
+          checkSparkAnswerAndOperator(df2)
+        }
       }
 
       if (testAnsi) {
@@ -1154,7 +1229,11 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         // try_cast() should always return null for invalid inputs
         val df2 =
           spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a")
-        checkSparkAnswerAndOperator(df2)
+        if (hasIncompatibleType) {
+          checkSparkAnswer(df2)
+        } else {
+          checkSparkAnswerAndOperator(df2)
+        }
 
       }
     }
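
The hasIncompatibleType flag selects between the suite's two verification modes: compare results only (the scan is expected to fall back to Spark), or compare results and also assert a fully native plan. A self-contained model of that dispatch, with println stubs standing in for the suite's real helpers:

object CastCheckDispatchModel {
  // Stubs for the real CometTestBase helpers.
  def checkSparkAnswer(query: String): Unit =
    println(s"compare results only: $query")
  def checkSparkAnswerAndOperator(query: String): Unit =
    println(s"compare results and require native plan: $query")

  def check(query: String, hasIncompatibleType: Boolean): Unit =
    if (hasIncompatibleType) checkSparkAnswer(query)      // fallback expected
    else checkSparkAnswerAndOperator(query)               // native plan expected

  def main(args: Array[String]): Unit = {
    check("select cast(a as int) from t", hasIncompatibleType = true)
    check("select cast(a as int) from t", hasIncompatibleType = false)
  }
}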

spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala

Lines changed: 40 additions & 0 deletions
@@ -125,6 +125,45 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("uint data type support") {
+    Seq(true, false).foreach { dictionaryEnabled =>
+      // TODO: Once the question of what to get back from uint_8, uint_16 types is resolved,
+      // we can also update this test to check for COMET_SCAN_ALLOW_INCOMPATIBLE=true
+      Seq(false).foreach { allowIncompatible =>
+        {
+          withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> allowIncompatible.toString) {
+            withTempDir { dir =>
+              val path = new Path(dir.toURI.toString, "testuint.parquet")
+              makeParquetFileAllTypes(
+                path,
+                dictionaryEnabled = dictionaryEnabled,
+                Byte.MinValue,
+                Byte.MaxValue)
+              withParquetTable(path.toString, "tbl") {
+                val qry = "select _9 from tbl order by _11"
+                if (CometSparkSessionExtensions.isComplexTypeReaderEnabled(conf)) {
+                  if (!allowIncompatible) {
+                    checkSparkAnswer(qry)
+                  } else {
+                    // need to convert the values to unsigned values
+                    val expected = (Byte.MinValue to Byte.MaxValue)
+                      .map(v => {
+                        if (v < 0) Byte.MaxValue.toShort - v else v
+                      })
+                      .toDF("a")
+                    checkAnswer(sql(qry), expected)
+                  }
+                } else {
+                  checkSparkAnswerAndOperator(qry)
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   test("null literals") {
     val batchSize = 1000
     Seq(true, false).foreach { dictionaryEnabled =>
@@ -142,6 +181,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         checkSparkAnswerAndOperator(sqlString)
       }
     }
+
   }
 }
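
For background on the "convert the values to unsigned values" comment: a Parquet uint_8 value reinterpreted as a signed Byte comes back negative for values above 127, and the JDK's standard Byte.toUnsignedInt recovers the unsigned value (unsigned = signed + 256 when signed < 0). A small standalone check of that round trip, independent of the test above:

object UnsignedByteDemo {
  def main(args: Array[String]): Unit = {
    val stored: Byte = -1 // bit pattern 0xFF, i.e. unsigned 255
    assert(java.lang.Byte.toUnsignedInt(stored) == 255)
    // General rule for two's-complement reinterpretation:
    for (v <- Byte.MinValue to Byte.MaxValue) {
      val unsigned = java.lang.Byte.toUnsignedInt(v.toByte)
      assert(unsigned == (if (v < 0) v + 256 else v))
    }
  }
}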

spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala

Lines changed: 13 additions & 4 deletions
@@ -45,7 +45,7 @@ import org.apache.spark.unsafe.types.UTF8String
 
 import com.google.common.primitives.UnsignedLong
 
-import org.apache.comet.CometConf
+import org.apache.comet.{CometConf, CometSparkSessionExtensions}
 import org.apache.comet.CometSparkSessionExtensions.{isSpark34Plus, isSpark40Plus}
 
 abstract class ParquetReadSuite extends CometTestBase {
@@ -139,7 +139,10 @@ abstract class ParquetReadSuite extends CometTestBase {
           i.toDouble,
           DateTimeUtils.toJavaDate(i))
       }
-      checkParquetScan(data)
+      if (!CometSparkSessionExtensions.isComplexTypeReaderEnabled(
+          conf) || CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get()) {
+        checkParquetScan(data)
+      }
       checkParquetFile(data)
     }
   }
@@ -159,7 +162,10 @@ abstract class ParquetReadSuite extends CometTestBase {
           i.toDouble,
           DateTimeUtils.toJavaDate(i))
       }
-      checkParquetScan(data)
+      if (!CometSparkSessionExtensions.isComplexTypeReaderEnabled(
+          conf) || CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get()) {
+        checkParquetScan(data)
+      }
       checkParquetFile(data)
     }
   }
@@ -178,7 +184,10 @@ abstract class ParquetReadSuite extends CometTestBase {
           DateTimeUtils.toJavaDate(i))
       }
       val filter = (row: Row) => row.getBoolean(0)
-      checkParquetScan(data, filter)
+      if (!CometSparkSessionExtensions.isComplexTypeReaderEnabled(
+          conf) || CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get()) {
+        checkParquetScan(data, filter)
+      }
       checkParquetFile(data, filter)
     }
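
Note that the guard added in this suite is exactly the negation of usingDataFusionParquetReader: by De Morgan's laws, !(complex && !allow) is !complex || allow, so checkParquetScan is skipped precisely when the stricter predicate holds. A quick standalone check of that equivalence:

object GuardEquivalenceCheck {
  def main(args: Array[String]): Unit = {
    for {
      complexReader <- Seq(true, false)
      allowIncompat <- Seq(true, false)
    } {
      val usingDataFusionParquetReader = complexReader && !allowIncompat
      val runScanCheck = !complexReader || allowIncompat // guard in the suite
      assert(runScanCheck == !usingDataFusionParquetReader)
    }
  }
}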
