
Commit efca991

fix: Making shuffle files generated in native shuffle mode reclaimable (#1568)
* Making shuffle files generated in native shuffle mode reclaimable
* Add a unit test
* Use eventually in unit test
* Address review comments
1 parent c3f6714 commit efca991
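
The fix routes the native shuffle writer's output through Spark's IndexShuffleBlockResolver: the native code writes to temporary `.data.tmp`/`.index.tmp` files, and the writer commits them with `writeMetadataFileAndCommit`, so the final shuffle files are registered with Spark and reclaimed when the shuffle is cleaned up. The commit message mentions a unit test built on ScalaTest's `eventually`; the sketch below shows the general shape such a test can take. It is not the PR's actual test: `runShuffleJob` and `sparkLocalDir` are hypothetical placeholders standing in for running a query through Comet's native shuffle and locating the executor's local directory.

// A minimal sketch, assuming ScalaTest is on the classpath, of an
// eventually-based check that native shuffle files are reclaimed.
// `runShuffleJob()` and `sparkLocalDir` are hypothetical placeholders.
import java.io.File

import org.scalatest.concurrent.Eventually._
import org.scalatest.time.{Seconds, Span}

def shuffleFilesIn(dir: File): Seq[File] = {
  val entries = Option(dir.listFiles()).map(_.toSeq).getOrElse(Seq.empty)
  entries.filter(f => f.getName.endsWith(".data") || f.getName.endsWith(".index")) ++
    entries.filter(_.isDirectory).flatMap(shuffleFilesIn)
}

runShuffleJob() // runs a query that goes through CometNativeShuffleWriter
assert(shuffleFilesIn(sparkLocalDir).nonEmpty)

System.gc() // nudge the ContextCleaner once the shuffle is unreferenced
eventually(timeout(Span(30, Seconds))) {
  assert(shuffleFilesIn(sparkLocalDir).isEmpty)
}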

File tree

5 files changed: +321 / -224 lines changed
spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala

Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.spark.sql.comet.execution.shuffle

import java.nio.{ByteBuffer, ByteOrder}
import java.nio.file.{Files, Paths}

import scala.collection.JavaConverters.asJavaIterableConverter

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriteMetricsReporter, ShuffleWriter}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, SinglePartition}
import org.apache.spark.sql.comet.{CometExec, CometMetricNode}
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.vectorized.ColumnarBatch

import org.apache.comet.CometConf
import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, QueryPlanSerde}
import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator}
import org.apache.comet.serde.QueryPlanSerde.serializeDataType

/**
 * A [[ShuffleWriter]] that delegates the shuffle write to the native shuffle implementation.
 */
class CometNativeShuffleWriter[K, V](
    outputPartitioning: Partitioning,
    outputAttributes: Seq[Attribute],
    metrics: Map[String, SQLMetric],
    numParts: Int,
    shuffleId: Int,
    mapId: Long,
    context: TaskContext,
    metricsReporter: ShuffleWriteMetricsReporter)
    extends ShuffleWriter[K, V] {

  private val OFFSET_LENGTH = 8

  var partitionLengths: Array[Long] = _
  var mapStatus: MapStatus = _

  override def write(inputs: Iterator[Product2[K, V]]): Unit = {
    val shuffleBlockResolver =
      SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[IndexShuffleBlockResolver]
    val dataFile = shuffleBlockResolver.getDataFile(shuffleId, mapId)
    val indexFile = shuffleBlockResolver.getIndexFile(shuffleId, mapId)
    val tempDataFilename = dataFile.getPath.replace(".data", ".data.tmp")
    val tempIndexFilename = indexFile.getPath.replace(".index", ".index.tmp")
    val tempDataFilePath = Paths.get(tempDataFilename)
    val tempIndexFilePath = Paths.get(tempIndexFilename)

    // Call native shuffle write
    val nativePlan = getNativePlan(tempDataFilename, tempIndexFilename)

    val detailedMetrics = Seq(
      "elapsed_compute",
      "encode_time",
      "repart_time",
      "mempool_time",
      "input_batches",
      "spill_count",
      "spilled_bytes")

    // Maps native metrics to SQL metrics
    val nativeSQLMetrics = Map(
      "output_rows" -> metrics(SQLShuffleWriteMetricsReporter.SHUFFLE_RECORDS_WRITTEN),
      "data_size" -> metrics("dataSize"),
      "write_time" -> metrics(SQLShuffleWriteMetricsReporter.SHUFFLE_WRITE_TIME)) ++
      metrics.filterKeys(detailedMetrics.contains)
    val nativeMetrics = CometMetricNode(nativeSQLMetrics)

    // Getting rid of the fake partitionId
    val newInputs = inputs.asInstanceOf[Iterator[_ <: Product2[Any, Any]]].map(_._2)

    val cometIter = CometExec.getCometIterator(
      Seq(newInputs.asInstanceOf[Iterator[ColumnarBatch]]),
      outputAttributes.length,
      nativePlan,
      nativeMetrics,
      numParts,
      context.partitionId())

    // Drain the iterator so that the native plan actually writes the
    // temporary data and index files
    while (cometIter.hasNext) {
      cometIter.next()
    }
    cometIter.close()

    // Get the partition lengths from the shuffle write output index file
    var offset = 0L
    partitionLengths = Files
      .readAllBytes(tempIndexFilePath)
      .grouped(OFFSET_LENGTH)
      .drop(1) // first partition offset is always 0
      .map(indexBytes => {
        val partitionOffset =
          ByteBuffer.wrap(indexBytes).order(ByteOrder.LITTLE_ENDIAN).getLong
        val partitionLength = partitionOffset - offset
        offset = partitionOffset
        partitionLength
      })
      .toArray
    Files.delete(tempIndexFilePath)

    // Total bytes written by the native code
    metricsReporter.incBytesWritten(Files.size(tempDataFilePath))

    // Commit: this writes the index file and moves the temporary data file
    // to its final, Spark-managed location, so the shuffle files are tracked
    // by Spark and can be reclaimed like those of the built-in shuffle
    shuffleBlockResolver.writeMetadataFileAndCommit(
      shuffleId,
      mapId,
      partitionLengths,
      Array.empty, // TODO: add checksums
      tempDataFilePath.toFile)
    mapStatus =
      MapStatus.apply(SparkEnv.get.blockManager.shuffleServerId, partitionLengths, mapId)
  }

  private def getNativePlan(dataFile: String, indexFile: String): Operator = {
    val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput")
    val opBuilder = OperatorOuterClass.Operator.newBuilder()

    val scanTypes = outputAttributes.flatten { attr =>
      serializeDataType(attr.dataType)
    }

    if (scanTypes.length == outputAttributes.length) {
      scanBuilder.addAllFields(scanTypes.asJava)

      val shuffleWriterBuilder = OperatorOuterClass.ShuffleWriter.newBuilder()
      shuffleWriterBuilder.setOutputDataFile(dataFile)
      shuffleWriterBuilder.setOutputIndexFile(indexFile)
      shuffleWriterBuilder.setEnableFastEncoding(
        CometConf.COMET_SHUFFLE_ENABLE_FAST_ENCODING.get())

      if (SparkEnv.get.conf.getBoolean("spark.shuffle.compress", true)) {
        val codec = CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_CODEC.get() match {
          case "zstd" => CompressionCodec.Zstd
          case "lz4" => CompressionCodec.Lz4
          case "snappy" => CompressionCodec.Snappy
          case other => throw new UnsupportedOperationException(s"invalid codec: $other")
        }
        shuffleWriterBuilder.setCodec(codec)
      } else {
        shuffleWriterBuilder.setCodec(CompressionCodec.None)
      }
      shuffleWriterBuilder.setCompressionLevel(
        CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get)

      outputPartitioning match {
        case _: HashPartitioning =>
          val hashPartitioning = outputPartitioning.asInstanceOf[HashPartitioning]

          val partitioning = PartitioningOuterClass.HashRepartition.newBuilder()
          partitioning.setNumPartitions(outputPartitioning.numPartitions)

          val partitionExprs = hashPartitioning.expressions
            .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes))

          if (partitionExprs.length != hashPartitioning.expressions.length) {
            throw new UnsupportedOperationException(
              s"Partitioning $hashPartitioning is not supported.")
          }

          partitioning.addAllHashExpression(partitionExprs.asJava)

          val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
          shuffleWriterBuilder.setPartitioning(
            partitioningBuilder.setHashPartition(partitioning).build())

        case SinglePartition =>
          val partitioning = PartitioningOuterClass.SinglePartition.newBuilder()

          val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
          shuffleWriterBuilder.setPartitioning(
            partitioningBuilder.setSinglePartition(partitioning).build())

        case _ =>
          throw new UnsupportedOperationException(
            s"Partitioning $outputPartitioning is not supported.")
      }

      val shuffleWriterOpBuilder = OperatorOuterClass.Operator.newBuilder()
      shuffleWriterOpBuilder
        .setShuffleWriter(shuffleWriterBuilder)
        .addChildren(opBuilder.setScan(scanBuilder).build())
        .build()
    } else {
      // There are unsupported scan types
      throw new UnsupportedOperationException(
        s"$outputAttributes contains unsupported data types for CometShuffleExchangeExec.")
    }
  }

  override def stop(success: Boolean): Option[MapStatus] = {
    if (success) {
      Some(mapStatus)
    } else {
      None
    }
  }

  override def getPartitionLengths(): Array[Long] = partitionLengths
}
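
For context on the index parsing in `write` above: the index file holds `numPartitions + 1` little-endian 8-byte offsets into the data file, starting at 0, and each partition's length is the difference between consecutive offsets. That is why the code drops the first entry and diffs the rest. A small self-contained illustration of that arithmetic, using made-up offsets:

// Hypothetical index contents for a 3-partition map output:
// offsets 0, 100, 250, 250 mean partition 2 is empty.
val offsets = Seq(0L, 100L, 250L, 250L)
val lengths = offsets.sliding(2).map { case Seq(a, b) => b - a }.toSeq
assert(lengths == Seq(100L, 150L, 0L))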

spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala

Lines changed: 7 additions & 1 deletion
@@ -25,6 +25,8 @@ import org.apache.spark.{Aggregator, Partitioner, ShuffleDependency, SparkEnv}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.ShuffleWriteProcessor
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.types.StructType

@@ -41,7 +43,11 @@ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     override val shuffleWriterProcessor: ShuffleWriteProcessor = new ShuffleWriteProcessor,
     val shuffleType: ShuffleType = CometNativeShuffle,
     val schema: Option[StructType] = None,
-    val decodeTime: SQLMetric)
+    val decodeTime: SQLMetric,
+    val outputPartitioning: Option[Partitioning] = None,
+    val outputAttributes: Seq[Attribute] = Seq.empty,
+    val shuffleWriteMetrics: Map[String, SQLMetric] = Map.empty,
+    val numParts: Int = 0)
     extends ShuffleDependency[K, V, C](
       _rdd,
       partitioner,