Commit 47bc3bc

parquet_writer
1 parent f249f97 commit 47bc3bc

2 files changed: +63 -3 lines changed

spark/src/main/scala/org/apache/comet/serde/operator/CometDataWritingCommand.scala

Lines changed: 14 additions & 2 deletions
@@ -24,11 +24,13 @@ import java.util.Locale
 import scala.jdk.CollectionConverters._

 import org.apache.spark.SparkException
+import org.apache.spark.sql.{SaveMode, SparkSession}
 import org.apache.spark.sql.comet.{CometNativeExec, CometNativeWriteExec}
 import org.apache.spark.sql.execution.command.DataWritingCommandExec
 import org.apache.spark.sql.execution.datasources.{InsertIntoHadoopFsRelationCommand, WriteFilesExec}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode

 import org.apache.comet.{CometConf, ConfigEntry, DataTypeSupport}
 import org.apache.comet.CometSparkSessionExtensions.withInfo
@@ -61,6 +63,10 @@ object CometDataWritingCommand extends CometOperatorSerde[DataWritingCommandExec
       return Unsupported(Some("Bucketed writes are not supported"))
     }

+    if (SQLConf.get.partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC) {
+      return Unsupported(Some("Dynamic partition overwrite is not supported"))
+    }
+
     if (cmd.partitionColumns.nonEmpty || cmd.staticPartitions.nonEmpty) {
      return Incompatible(Some("Partitioned writes are highly experimental"))
     }
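
With this check in place, enabling Spark's dynamic partition overwrite should route the write back to Spark's built-in Parquet writer instead of the Comet native writer. A minimal sketch of a write that the new check would reject, assuming a local session and an illustrative output path (the Comet write configs shown in the test suite below would also need to be enabled for the native path to be considered at all):

    import org.apache.spark.sql.{SaveMode, SparkSession}
    import org.apache.spark.sql.functions.col

    val spark = SparkSession.builder().master("local[*]").getOrCreate()

    // Dynamic partition overwrite is a session-level Spark setting; with it
    // set to "dynamic", the serde check added above returns Unsupported and
    // the plan stays on Spark's own writer.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    spark
      .range(100)
      .withColumn("p", col("id") % 10)
      .write
      .mode(SaveMode.Overwrite)
      .partitionBy("p")
      .parquet("/tmp/comet_dynamic_overwrite_demo") // illustrative path
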
@@ -158,6 +164,14 @@ object CometDataWritingCommand extends CometOperatorSerde[DataWritingCommandExec
     val cmd = op.cmd.asInstanceOf[InsertIntoHadoopFsRelationCommand]
     val outputPath = cmd.outputPath.toString

+    // TODO : support dynamic partition overwrite
+    if (cmd.mode == SaveMode.Overwrite) {
+      val fs = cmd.outputPath.getFileSystem(SparkSession.active.sparkContext.hadoopConfiguration)
+      if (fs.exists(cmd.outputPath)) {
+        fs.delete(cmd.outputPath, true)
+      }
+    }
+
     // Get the child plan from the WriteFilesExec or use the child directly
     val childPlan = op.child match {
       case writeFiles: WriteFilesExec =>
@@ -168,8 +182,6 @@ object CometDataWritingCommand extends CometOperatorSerde[DataWritingCommandExec
         other
     }

-    val isDynamicOverWriteMode = cmd.partitionColumns.nonEmpty
-
     // Create FileCommitProtocol for atomic writes
     val jobId = java.util.UUID.randomUUID().toString
     val committer =
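
The new SaveMode.Overwrite branch clears the target directory before the native write runs, so overwriting the same path twice should leave only the latest data, matching Spark's behaviour for non-partitioned overwrites. A rough usage sketch, with an illustrative path and toy data:

    import org.apache.spark.sql.{SaveMode, SparkSession}

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val out = "/tmp/comet_overwrite_demo" // illustrative path

    // First write creates the directory.
    spark.range(10).toDF("id").write.mode(SaveMode.Overwrite).parquet(out)

    // Second overwrite: the branch above deletes the existing directory
    // (fs.delete(path, recursive = true)) before writing, so only the
    // 5 new rows remain.
    spark.range(5).toDF("id").write.mode(SaveMode.Overwrite).parquet(out)
    assert(spark.read.parquet(out).count() == 5)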

spark/src/test/scala/org/apache/comet/parquet/CometParquetWriterSuite.scala

Lines changed: 49 additions & 1 deletion
@@ -27,7 +27,7 @@ import org.apache.spark.sql.{CometTestBase, DataFrame}
 import org.apache.spark.sql.comet.{CometNativeScanExec, CometNativeWriteExec}
 import org.apache.spark.sql.execution.QueryExecution
 import org.apache.spark.sql.execution.command.DataWritingCommandExec
-import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.functions.{col, lit}
 import org.apache.spark.sql.internal.SQLConf

 import org.apache.comet.CometConf
@@ -273,4 +273,52 @@ class CometParquetWriterSuite extends CometTestBase {
       }
     }
   }
+
+  test("partitioned write - data correctness per partition") {
+    withTempPath { dir =>
+      val outputPath = new File(dir, "output").getAbsolutePath
+
+      withTempPath { inputDir =>
+        val inputPath = createTestData(inputDir)
+
+        withSQLConf(
+          CometConf.COMET_NATIVE_PARQUET_WRITE_ENABLED.key -> "true",
+          CometConf.getOperatorAllowIncompatConfigKey(
+            classOf[DataWritingCommandExec]) -> "true") {
+
+          val inputDf = spark.read.parquet(inputPath).filter(col("c1") <= lit(10))
+          val partCols = inputDf.columns.take(2)
+          val col1 = partCols(0)
+          val col2 = partCols(1)
+
+          inputDf.write.partitionBy(partCols: _*).parquet(outputPath)
+
+          // unique combinations
+          val combinations = inputDf
+            .select(partCols.head, partCols.last)
+            .distinct()
+            .collect()
+            .map(r => (r.getBoolean(0), r.getByte(1)))
+
+          combinations.foreach { tuple =>
+            val val1 = tuple._1
+            val val2 = tuple._2
+
+            val partitionPath = s"$outputPath/${partCols.head}=$val1/${partCols.last}=$val2"
+
+            val actualDf = spark.read.parquet(partitionPath)
+            val expectedDf = inputDf
+              .filter(col(col1) === val1)
+              .filter(col(col2) === val2)
+              .drop(col1, col2)
+
+            checkAnswer(actualDf, expectedDf)
+          }
+
+          // Verify total count as well
+          checkAnswer(spark.read.parquet(outputPath), inputDf)
+        }
+      }
+    }
+  }
 }
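
The test leans on the Hive-style layout that partitionBy produces: one col=value subdirectory per distinct partition value, with the partition columns dropped from the data files themselves, which is why reading a single partition directory and comparing it against a filtered-and-dropped view of the input is a valid per-partition check. A small standalone sketch of that layout, with illustrative column names and path:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.col

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val out = "/tmp/comet_partitioned_demo" // illustrative path

    val df = spark.range(20).toDF("id").withColumn("bucket", col("id") % 2)
    df.write.partitionBy("bucket").parquet(out)

    // Each partition value lands in its own directory, e.g. .../bucket=0, and
    // reading that directory back yields only the non-partition columns.
    val part0 = spark.read.parquet(s"$out/bucket=0")
    assert(part0.columns.sameElements(Array("id")))
    assert(part0.count() == df.filter(col("bucket") === 0).count())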
