Skip row index Spark SQL tests for native_datafusion Parquet scan. (#1724)

mbutrovich · web-flow · commit 4aa89b51b03b · 2025-05-08T21:42:43.000-06:00
diff --git a/dev/diffs/3.4.3.diff b/dev/diffs/3.4.3.diff
@@ -2231,18 +2231,34 @@ index 240bb4e6dcb..8287ffa03ca 100644
  
    import testImplicits._
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-index 351c6d698fc..36492fe936d 100644
+index 351c6d698fc..583d9225cca 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-@@ -26,6 +26,7 @@ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
+@@ -20,12 +20,14 @@ import java.io.File
+ 
+ import scala.collection.JavaConverters._
+ 
++import org.apache.comet.CometConf
+ import org.apache.hadoop.fs.Path
+ import org.apache.parquet.column.ParquetProperties._
+ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
  import org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
  
  import org.apache.spark.sql.QueryTest
 +import org.apache.spark.sql.comet.{CometBatchScanExec, CometScanExec}
  import org.apache.spark.sql.execution.FileSourceScanExec
  import org.apache.spark.sql.execution.datasources.FileFormat
  import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
-@@ -230,6 +231,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+@@ -172,6 +174,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+ 
+   private def testRowIndexGeneration(label: String, conf: RowIndexTestConf): Unit = {
+     test (s"$label - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath { path =>
+           val rowIndexColName = FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME
+@@ -230,6 +234,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
              case f: FileSourceScanExec =>
                numPartitions += f.inputRDD.partitions.length
                numOutputRows += f.metrics("numOutputRows").value
@@ -2255,6 +2271,15 @@ index 351c6d698fc..36492fe936d 100644
              case _ =>
            }
            assert(numPartitions > 0)
+@@ -291,6 +301,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+     val conf = RowIndexTestConf(useDataSourceV2 = useDataSourceV2)
+ 
+     test(s"invalid row index column type - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath{ path =>
+           val df = spark.range(0, 10, 1, 1).toDF("id")
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
 index 5c0b7def039..151184bc98c 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
diff --git a/dev/diffs/3.5.4.diff b/dev/diffs/3.5.4.diff
@@ -2248,18 +2248,34 @@ index 4f906411345..6cc69f7e915 100644
  
    import testImplicits._
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-index 27c2a2148fd..1d93d0eb8bc 100644
+index 27c2a2148fd..df04a15fb1f 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-@@ -26,6 +26,7 @@ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
+@@ -20,12 +20,14 @@ import java.io.File
+ 
+ import scala.collection.JavaConverters._
+ 
++import org.apache.comet.CometConf
+ import org.apache.hadoop.fs.Path
+ import org.apache.parquet.column.ParquetProperties._
+ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
  import org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
  
  import org.apache.spark.sql.QueryTest
 +import org.apache.spark.sql.comet.{CometBatchScanExec, CometScanExec}
  import org.apache.spark.sql.execution.FileSourceScanExec
  import org.apache.spark.sql.execution.datasources.FileFormat
  import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
-@@ -243,6 +244,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+@@ -172,6 +174,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+ 
+   private def testRowIndexGeneration(label: String, conf: RowIndexTestConf): Unit = {
+     test (s"$label - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath { path =>
+           // Read row index using _metadata.row_index if that is supported by the file format.
+@@ -243,6 +247,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
              case f: FileSourceScanExec =>
                numPartitions += f.inputRDD.partitions.length
                numOutputRows += f.metrics("numOutputRows").value
@@ -2272,6 +2288,15 @@ index 27c2a2148fd..1d93d0eb8bc 100644
              case _ =>
            }
            assert(numPartitions > 0)
+@@ -301,6 +311,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+     val conf = RowIndexTestConf(useDataSourceV2 = useDataSourceV2)
+ 
+     test(s"invalid row index column type - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath{ path =>
+           val df = spark.range(0, 10, 1, 1).toDF("id")
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
 index 5c0b7def039..151184bc98c 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
diff --git a/dev/diffs/3.5.5.diff b/dev/diffs/3.5.5.diff
@@ -2248,18 +2248,34 @@ index 4f906411345..6cc69f7e915 100644
  
    import testImplicits._
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-index 27c2a2148fd..1d93d0eb8bc 100644
+index 27c2a2148fd..df04a15fb1f 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-@@ -26,6 +26,7 @@ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
+@@ -20,12 +20,14 @@ import java.io.File
+ 
+ import scala.collection.JavaConverters._
+ 
++import org.apache.comet.CometConf
+ import org.apache.hadoop.fs.Path
+ import org.apache.parquet.column.ParquetProperties._
+ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
  import org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
  
  import org.apache.spark.sql.QueryTest
 +import org.apache.spark.sql.comet.{CometBatchScanExec, CometScanExec}
  import org.apache.spark.sql.execution.FileSourceScanExec
  import org.apache.spark.sql.execution.datasources.FileFormat
  import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
-@@ -243,6 +244,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+@@ -172,6 +174,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+ 
+   private def testRowIndexGeneration(label: String, conf: RowIndexTestConf): Unit = {
+     test (s"$label - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath { path =>
+           // Read row index using _metadata.row_index if that is supported by the file format.
+@@ -243,6 +247,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
              case f: FileSourceScanExec =>
                numPartitions += f.inputRDD.partitions.length
                numOutputRows += f.metrics("numOutputRows").value
@@ -2272,6 +2288,15 @@ index 27c2a2148fd..1d93d0eb8bc 100644
              case _ =>
            }
            assert(numPartitions > 0)
+@@ -301,6 +311,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+     val conf = RowIndexTestConf(useDataSourceV2 = useDataSourceV2)
+ 
+     test(s"invalid row index column type - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath{ path =>
+           val df = spark.range(0, 10, 1, 1).toDF("id")
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
 index 5c0b7def039..151184bc98c 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
diff --git a/dev/diffs/4.0.0-preview1.diff b/dev/diffs/4.0.0-preview1.diff
@@ -2476,18 +2476,35 @@ index 6d9092391a9..6da095120d1 100644
  
    import testImplicits._
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-index 95378d94674..0c915fdc634 100644
+index 95378d94674..2b75ffad4d7 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowIndexSuite.scala
-@@ -27,6 +27,7 @@ import org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
+@@ -20,6 +20,7 @@ import java.io.File
+ 
+ import scala.jdk.CollectionConverters._
+ 
++import org.apache.comet.CometConf
+ import org.apache.hadoop.fs.Path
+ import org.apache.parquet.column.ParquetProperties._
+ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
+@@ -27,6 +28,7 @@ import org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
  
  import org.apache.spark.SparkException
  import org.apache.spark.sql.QueryTest
 +import org.apache.spark.sql.comet.{CometBatchScanExec, CometScanExec}
  import org.apache.spark.sql.execution.FileSourceScanExec
  import org.apache.spark.sql.execution.datasources.FileFormat
  import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
-@@ -245,6 +246,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+@@ -174,6 +176,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+ 
+   private def testRowIndexGeneration(label: String, conf: RowIndexTestConf): Unit = {
+     test (s"$label - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath { path =>
+           // Read row index using _metadata.row_index if that is supported by the file format.
+@@ -245,6 +249,12 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
              case f: FileSourceScanExec =>
                numPartitions += f.inputRDD.partitions.length
                numOutputRows += f.metrics("numOutputRows").value
@@ -2500,6 +2517,15 @@ index 95378d94674..0c915fdc634 100644
              case _ =>
            }
            assert(numPartitions > 0)
+@@ -303,6 +313,8 @@ class ParquetRowIndexSuite extends QueryTest with SharedSparkSession {
+     val conf = RowIndexTestConf(useDataSourceV2 = useDataSourceV2)
+ 
+     test(s"invalid row index column type - ${conf.desc}") {
++      // native_datafusion Parquet scan does not support row index generation.
++      assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+       withSQLConf(conf.sqlConfs: _*) {
+         withTempPath{ path =>
+           val df = spark.range(0, 10, 1, 1).toDF("id")
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
 index 5c0b7def039..151184bc98c 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala