Commit 17a36bc

feat: Add experimental auto mode for COMET_PARQUET_SCAN_IMPL (#1747)
1 parent 0b4d75e commit 17a36bc

File tree

9 files changed, +233 -58 lines changed
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Spark SQL Tests (native_auto)
+
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  # manual trigger
+  # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
+  workflow_dispatch:
+
+env:
+  RUST_VERSION: stable
+
+jobs:
+  spark-sql-catalyst-native-auto:
+    strategy:
+      matrix:
+        os: [ubuntu-24.04]
+        java-version: [11]
+        spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.5'}]
+        module:
+          - {name: "catalyst", args1: "catalyst/test", args2: ""}
+          - {name: "sql/core-1", args1: "", args2: "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest"}
+          - {name: "sql/core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest"}
+          - {name: "sql/core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest"}
+          - {name: "sql/hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
+          - {name: "sql/hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
+          - {name: "sql/hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+      fail-fast: false
+    name: spark-sql-native-auto-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.java-version }}
+    runs-on: ${{ matrix.os }}
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java-version }}
+      - name: Setup Spark
+        uses: ./.github/actions/setup-spark-builder
+        with:
+          spark-version: ${{ matrix.spark-version.full }}
+          spark-short-version: ${{ matrix.spark-version.short }}
+      - name: Run Spark tests
+        run: |
+          cd apache-spark
+          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
+          ENABLE_COMET=true ENABLE_COMET_SHUFFLE=true COMET_PARQUET_SCAN_IMPL=auto build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
+        env:
+          LC_ALL: "C.UTF-8"
+
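The `COMET_PARQUET_SCAN_IMPL=auto` variable in the run step above is consumed on the JVM side as the default value of `spark.comet.scan.impl` (see the `CometConf` change below). A minimal sketch of that resolution, mirroring the `createWithDefault` logic in this commit:

    import java.util.Locale

    // The env var overrides the built-in default ("native_comet");
    // the value is lower-cased before validation against the allowed set.
    val scanImplDefault: String =
      sys.env
        .getOrElse("COMET_PARQUET_SCAN_IMPL", "native_comet")
        .toLowerCase(Locale.ROOT)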

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 6 additions & 4 deletions
@@ -86,6 +86,7 @@ object CometConf extends ShimCometConf {
   val SCAN_NATIVE_COMET = "native_comet"
   val SCAN_NATIVE_DATAFUSION = "native_datafusion"
   val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"
+  val SCAN_AUTO = "auto"
 
   val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
     .doc(
@@ -95,11 +96,12 @@ object CometConf extends ShimCometConf {
         "parquet file reader and native column decoding. Supports simple types only " +
         s"'$SCAN_NATIVE_DATAFUSION' is a fully native implementation of scan based on DataFusion" +
         s"'$SCAN_NATIVE_ICEBERG_COMPAT' is a native implementation that exposes apis to read " +
-        "parquet columns natively.")
+        s"parquet columns natively. $SCAN_AUTO chooses the best scan.")
     .internal()
     .stringConf
     .transform(_.toLowerCase(Locale.ROOT))
-    .checkValues(Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT))
+    .checkValues(
+      Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
     .createWithDefault(sys.env
       .getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_NATIVE_COMET)
       .toLowerCase(Locale.ROOT))
@@ -587,8 +589,8 @@ object CometConf extends ShimCometConf {
   val COMET_SCAN_ALLOW_INCOMPATIBLE: ConfigEntry[Boolean] =
     conf("spark.comet.scan.allowIncompatible")
       .doc(
-        "Comet is not currently fully compatible with Spark for all datatypes. " +
-          s"Set this config to true to allow them anyway. $COMPAT_GUIDE.")
+        "Some Comet scan implementations are not currently fully compatible with Spark for " +
+          s"all datatypes. Set this config to true to allow them anyway. $COMPAT_GUIDE.")
       .booleanConf
       .createWithDefault(false)
 
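Taken together, the change above means the experimental mode can be enabled either through the `COMET_PARQUET_SCAN_IMPL=auto` environment variable or by setting the (internal) `spark.comet.scan.impl` config directly. A minimal sketch, assuming a standard Comet deployment (the plugin class and the Comet jar on the classpath are prerequisites from the Comet install docs, not part of this commit):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession
      .builder()
      .appName("comet-auto-scan")
      .config("spark.plugins", "org.apache.spark.CometPlugin")
      // Comet exec must be on, otherwise selectScan falls back to native_comet
      .config("spark.comet.exec.enabled", "true")
      // experimental: let CometScanRule pick the best available scan
      .config("spark.comet.scan.impl", "auto")
      .getOrCreate()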

docs/source/user-guide/compatibility.md

Lines changed: 11 additions & 0 deletions
@@ -50,6 +50,8 @@ implementation:
 
 The new scans currently have the following limitations:
 
+Issues common to both `native_datafusion` and `native_iceberg_compat`:
+
 - When reading Parquet files written by systems other than Spark that contain columns with the logical types `UINT_8`
   or `UINT_16`, Comet will produce different results than Spark because Spark does not preserve or understand these
   logical types. Arrow-based readers, such as DataFusion and Comet, do respect these types and read the data as unsigned
@@ -58,12 +60,21 @@ types (regardless of the logical type). This behavior can be disabled by setting
   `spark.comet.scan.allowIncompatible=true`.
 - There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more
   information.
+- Reading maps containing complex types can result in errors or incorrect results [#1754]
+- `PARQUET_FIELD_ID_READ_ENABLED` is not respected [#1758]
 - There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).
 - No support for default values that are nested types (e.g., maps, arrays, structs). Literal default values are supported.
 - Setting Spark configs `ignoreMissingFiles` or `ignoreCorruptFiles` to `true` is not compatible with `native_datafusion` scan.
 
+Issues specific to `native_datafusion`:
+
+- Bucketed scans are not supported
+- No support for row indexes
+
 [#1545]: https://github.com/apache/datafusion-comet/issues/1545
 [#1542]: https://github.com/apache/datafusion-comet/issues/1542
+[#1754]: https://github.com/apache/datafusion-comet/issues/1754
+[#1758]: https://github.com/apache/datafusion-comet/issues/1758
 [Comet Tuning Guide]: tuning.md
 
 ## ANSI mode
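The common limitations above are the same conditions that `spark.comet.scan.allowIncompatible` guards. A one-line illustrative snippet (assumes an active `SparkSession` named `spark`):

    // Opt in to scan behavior known to differ from Spark (e.g. unsigned
    // UINT_8/UINT_16 logical types); review the compatibility guide first.
    spark.conf.set("spark.comet.scan.allowIncompatible", "true")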

docs/source/user-guide/configs.md

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ Comet provides the following configuration settings.
 | spark.comet.parquet.read.parallel.io.enabled | Whether to enable Comet's parallel reader for Parquet files. The parallel reader reads ranges of consecutive data in a file in parallel. It is faster for large files and row groups but uses more resources. | true |
 | spark.comet.parquet.read.parallel.io.thread-pool.size | The maximum number of parallel threads the parallel reader will use in a single executor. For executors configured with a smaller number of cores, use a smaller number. | 16 |
 | spark.comet.regexp.allowIncompatible | Comet is not currently fully compatible with Spark for all regular expressions. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | false |
-| spark.comet.scan.allowIncompatible | Comet is not currently fully compatible with Spark for all datatypes. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | false |
+| spark.comet.scan.allowIncompatible | Some Comet scan implementations are not currently fully compatible with Spark for all datatypes. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | false |
 | spark.comet.scan.enabled | Whether to enable native scans. When this is turned on, Spark will use Comet to read supported data sources (currently only Parquet is supported natively). Note that to enable native vectorized execution, both this config and 'spark.comet.exec.enabled' need to be enabled. | true |
 | spark.comet.scan.preFetch.enabled | Whether to enable pre-fetching feature of CometScan. | false |
 | spark.comet.scan.preFetch.threadNum | The number of threads running pre-fetching for CometScan. Effective if spark.comet.scan.preFetch.enabled is enabled. Note that more pre-fetching threads means more memory requirement to store pre-fetched row groups. | 2 |

docs/templates/compatibility-template.md

Lines changed: 11 additions & 0 deletions
@@ -50,6 +50,8 @@ implementation:
 
 The new scans currently have the following limitations:
 
+Issues common to both `native_datafusion` and `native_iceberg_compat`:
+
 - When reading Parquet files written by systems other than Spark that contain columns with the logical types `UINT_8`
   or `UINT_16`, Comet will produce different results than Spark because Spark does not preserve or understand these
   logical types. Arrow-based readers, such as DataFusion and Comet, do respect these types and read the data as unsigned
@@ -58,12 +60,21 @@ The new scans currently have the following limitations:
   `spark.comet.scan.allowIncompatible=true`.
 - There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more
   information.
+- Reading maps containing complex types can result in errors or incorrect results [#1754]
+- `PARQUET_FIELD_ID_READ_ENABLED` is not respected [#1758]
 - There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).
 - No support for default values that are nested types (e.g., maps, arrays, structs). Literal default values are supported.
 - Setting Spark configs `ignoreMissingFiles` or `ignoreCorruptFiles` to `true` is not compatible with `native_datafusion` scan.
 
+Issues specific to `native_datafusion`:
+
+- Bucketed scans are not supported
+- No support for row indexes
+
 [#1545]: https://github.com/apache/datafusion-comet/issues/1545
 [#1542]: https://github.com/apache/datafusion-comet/issues/1542
+[#1754]: https://github.com/apache/datafusion-comet/issues/1754
+[#1758]: https://github.com/apache/datafusion-comet/issues/1758
 [Comet Tuning Guide]: tuning.md
 
 ## ANSI mode

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 59 additions & 2 deletions
@@ -105,8 +105,14 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
           return withInfos(scanExec, fallbackReasons.toSet)
         }
 
-        val scanImpl = COMET_NATIVE_SCAN_IMPL.get()
-        if (scanImpl == CometConf.SCAN_NATIVE_DATAFUSION && !COMET_EXEC_ENABLED.get()) {
+        var scanImpl = COMET_NATIVE_SCAN_IMPL.get()
+
+        // if scan is auto then pick the best available scan
+        if (scanImpl == SCAN_AUTO) {
+          scanImpl = selectScan(scanExec, r.partitionSchema)
+        }
+
+        if (scanImpl == SCAN_NATIVE_DATAFUSION && !COMET_EXEC_ENABLED.get()) {
           fallbackReasons +=
             s"Full native scan disabled because ${COMET_EXEC_ENABLED.key} disabled"
           return withInfos(scanExec, fallbackReasons.toSet)
@@ -251,6 +257,57 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
     }
   }
 
+  private def selectScan(scanExec: FileSourceScanExec, partitionSchema: StructType): String = {
+    // TODO these checks are not yet exhaustive. For example, native_iceberg_compat does
+    // not support reading from S3
+
+    val fallbackReasons = new ListBuffer[String]()
+
+    val typeChecker = CometScanTypeChecker(SCAN_NATIVE_ICEBERG_COMPAT)
+    val schemaSupported =
+      typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons)
+    val partitionSchemaSupported =
+      typeChecker.isSchemaSupported(partitionSchema, fallbackReasons)
+
+    def isComplexType(dt: DataType): Boolean = dt match {
+      case _: StructType | _: ArrayType | _: MapType => true
+      case _ => false
+    }
+
+    def hasMapsContainingStructs(dataType: DataType): Boolean = {
+      dataType match {
+        case s: StructType => s.exists(field => hasMapsContainingStructs(field.dataType))
+        case a: ArrayType => hasMapsContainingStructs(a.elementType)
+        case m: MapType => isComplexType(m.keyType) || isComplexType(m.valueType)
+        case _ => false
+      }
+    }
+
+    val knownIssues =
+      scanExec.requiredSchema.exists(field => hasMapsContainingStructs(field.dataType)) ||
+        partitionSchema.exists(field => hasMapsContainingStructs(field.dataType))
+
+    if (knownIssues) {
+      fallbackReasons += "There are known issues with maps containing structs when using " +
+        s"$SCAN_NATIVE_ICEBERG_COMPAT"
+    }
+
+    val cometExecEnabled = COMET_EXEC_ENABLED.get()
+    if (!cometExecEnabled) {
+      fallbackReasons += s"$SCAN_NATIVE_ICEBERG_COMPAT requires ${COMET_EXEC_ENABLED.key}=true"
+    }
+
+    if (cometExecEnabled && schemaSupported && partitionSchemaSupported && !knownIssues) {
+      logInfo(s"Auto scan mode selecting $SCAN_NATIVE_ICEBERG_COMPAT")
+      SCAN_NATIVE_ICEBERG_COMPAT
+    } else {
+      logInfo(
+        s"Auto scan mode falling back to $SCAN_NATIVE_COMET due to " +
+          s"${fallbackReasons.mkString(", ")}")
+      SCAN_NATIVE_COMET
+    }
+  }
+
 }
 
 case class CometScanTypeChecker(scanImpl: String) extends DataTypeSupport {
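To make the fallback condition concrete, here is a self-contained sketch: the two helpers are copied from `selectScan` above, while the example schemas are hypothetical, showing which shapes keep `native_iceberg_compat` eligible and which trigger the known-issue fallback to `native_comet`:

    import org.apache.spark.sql.types._

    def isComplexType(dt: DataType): Boolean = dt match {
      case _: StructType | _: ArrayType | _: MapType => true
      case _ => false
    }

    def hasMapsContainingStructs(dataType: DataType): Boolean = dataType match {
      case s: StructType => s.exists(field => hasMapsContainingStructs(field.dataType))
      case a: ArrayType => hasMapsContainingStructs(a.elementType)
      case m: MapType => isComplexType(m.keyType) || isComplexType(m.valueType)
      case _ => false
    }

    // map<string, int>: no complex key or value, so no known issue
    val simple = new StructType().add("m", MapType(StringType, IntegerType))
    // map<string, struct<x: int>>: complex value, triggers the fallback
    val nested = new StructType()
      .add("m", MapType(StringType, new StructType().add("x", IntegerType)))

    assert(!simple.exists(f => hasMapsContainingStructs(f.dataType)))
    assert(nested.exists(f => hasMapsContainingStructs(f.dataType)))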
