Skip to content

Commit d2c9b00

Browse files
wForget and Junfan Zhang
authored and committed
[apache#1824] feat(spark): Support map side combine of shuffle writer (apache#1825)
Support map side combine of shuffle write. Fixes apache#1824. This adds new (opt-in) shuffle writer behavior. An integration test was added.
1 parent c77caea commit d2c9b00

File tree

6 files changed

+227
-195
lines changed

6 files changed

+227
-195
lines changed

client-spark/common/src/main/java/org/apache/spark/shuffle/RssSparkConfig.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,12 @@ public class RssSparkConfig {
9494
.defaultValue(1)
9595
.withDescription("The block retry max times when partition reassign is enabled.");
9696

97+
public static final ConfigOption<Boolean> RSS_CLIENT_MAP_SIDE_COMBINE_ENABLED =
98+
ConfigOptions.key("rss.client.mapSideCombine.enabled")
99+
.booleanType()
100+
.defaultValue(false)
101+
.withDescription("Whether to enable map side combine of shuffle writer.");
102+
97103
public static final String SPARK_RSS_CONFIG_PREFIX = "spark.";
98104

99105
public static final ConfigEntry<Integer> RSS_PARTITION_NUM_PER_RANGE =

client-spark/spark3/src/main/java/org/apache/spark/shuffle/writer/RssShuffleWriter.java

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import scala.Function1;
4242
import scala.Option;
4343
import scala.Product2;
44+
import scala.Tuple2;
4445
import scala.collection.Iterator;
4546

4647
import com.google.common.annotations.VisibleForTesting;
@@ -95,6 +96,7 @@
9596
import org.apache.uniffle.common.rpc.StatusCode;
9697
import org.apache.uniffle.storage.util.StorageType;
9798

99+
import static org.apache.spark.shuffle.RssSparkConfig.RSS_CLIENT_MAP_SIDE_COMBINE_ENABLED;
98100
import static org.apache.spark.shuffle.RssSparkConfig.RSS_PARTITION_REASSIGN_BLOCK_RETRY_MAX_TIMES;
99101
import static org.apache.spark.shuffle.RssSparkConfig.RSS_TASK_FAILED_CALLBACK_ENABLED;
100102

@@ -337,25 +339,27 @@ private void reportTaskFailure(Exception exception) {
337339
protected void writeImpl(Iterator<Product2<K, V>> records) {
338340
List<ShuffleBlockInfo> shuffleBlockInfos;
339341
boolean isCombine = shuffleDependency.mapSideCombine();
340-
Function1<V, C> createCombiner = null;
342+
343+
Iterator<? extends Product2<K, ?>> iterator = records;
341344
if (isCombine) {
342-
createCombiner = shuffleDependency.aggregator().get().createCombiner();
345+
if (RssSparkConfig.toRssConf(sparkConf).get(RSS_CLIENT_MAP_SIDE_COMBINE_ENABLED)) {
346+
iterator = shuffleDependency.aggregator().get().combineValuesByKey(records, taskContext);
347+
} else {
348+
Function1<V, C> combiner = shuffleDependency.aggregator().get().createCombiner();
349+
iterator =
350+
records.map(
351+
(Function1<Product2<K, V>, Product2<K, C>>)
352+
x -> new Tuple2<>(x._1(), combiner.apply(x._2())));
353+
}
343354
}
344355
long recordCount = 0;
345-
while (records.hasNext()) {
356+
while (iterator.hasNext()) {
346357
recordCount++;
347-
348358
checkDataIfAnyFailure();
349-
350-
Product2<K, V> record = records.next();
359+
Product2<K, ?> record = iterator.next();
351360
K key = record._1();
352361
int partition = getPartition(key);
353-
if (isCombine) {
354-
Object c = createCombiner.apply(record._2());
355-
shuffleBlockInfos = bufferManager.addRecord(partition, record._1(), c);
356-
} else {
357-
shuffleBlockInfos = bufferManager.addRecord(partition, record._1(), record._2());
358-
}
362+
shuffleBlockInfos = bufferManager.addRecord(partition, record._1(), record._2());
359363
if (shuffleBlockInfos != null && !shuffleBlockInfos.isEmpty()) {
360364
processShuffleBlockInfos(shuffleBlockInfos);
361365
}

docs/client_guide/spark_client_guide.md

Lines changed: 0 additions & 152 deletions
This file was deleted.

integration-test/spark-common/src/test/java/org/apache/uniffle/test/WriteAndReadMetricsTest.java

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,14 @@
2121
import java.util.List;
2222
import java.util.Map;
2323

24-
import org.apache.spark.executor.TaskMetrics;
25-
import org.apache.spark.scheduler.SparkListener;
26-
import org.apache.spark.scheduler.SparkListenerTaskEnd;
2724
import org.apache.spark.sql.Dataset;
2825
import org.apache.spark.sql.Row;
2926
import org.apache.spark.sql.SparkSession;
3027
import org.apache.spark.sql.functions;
3128
import org.junit.jupiter.api.Test;
3229

30+
import org.apache.uniffle.test.listener.WriteAndReadMetricsSparkListener;
31+
3332
public class WriteAndReadMetricsTest extends SimpleTestBase {
3433

3534
@Test
@@ -63,6 +62,7 @@ public Map<String, Long> runTest(SparkSession spark, String fileName) throws Exc
6362

6463
// take a rest to make sure all task metrics are updated before read stageData
6564
Thread.sleep(100);
65+
6666
for (int stageId : spark.sparkContext().statusTracker().getJobInfo(0).get().stageIds()) {
6767
long writeRecords = listener.getWriteRecords(stageId);
6868
long readRecords = listener.getReadRecords(stageId);
@@ -72,32 +72,4 @@ public Map<String, Long> runTest(SparkSession spark, String fileName) throws Exc
7272

7373
return result;
7474
}
75-
76-
private static class WriteAndReadMetricsSparkListener extends SparkListener {
77-
private HashMap<Integer, Long> stageIdToWriteRecords = new HashMap<>();
78-
private HashMap<Integer, Long> stageIdToReadRecords = new HashMap<>();
79-
80-
@Override
81-
public void onTaskEnd(SparkListenerTaskEnd event) {
82-
int stageId = event.stageId();
83-
TaskMetrics taskMetrics = event.taskMetrics();
84-
if (taskMetrics != null) {
85-
long writeRecords = taskMetrics.shuffleWriteMetrics().recordsWritten();
86-
long readRecords = taskMetrics.shuffleReadMetrics().recordsRead();
87-
// Accumulate writeRecords and readRecords for the given stageId
88-
stageIdToWriteRecords.put(
89-
stageId, stageIdToWriteRecords.getOrDefault(stageId, 0L) + writeRecords);
90-
stageIdToReadRecords.put(
91-
stageId, stageIdToReadRecords.getOrDefault(stageId, 0L) + readRecords);
92-
}
93-
}
94-
95-
public long getWriteRecords(int stageId) {
96-
return stageIdToWriteRecords.getOrDefault(stageId, 0L);
97-
}
98-
99-
public long getReadRecords(int stageId) {
100-
return stageIdToReadRecords.getOrDefault(stageId, 0L);
101-
}
102-
}
10375
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.uniffle.test.listener;
19+
20+
import java.util.HashMap;
21+
22+
import org.apache.spark.executor.TaskMetrics;
23+
import org.apache.spark.scheduler.SparkListener;
24+
import org.apache.spark.scheduler.SparkListenerTaskEnd;
25+
26+
public class WriteAndReadMetricsSparkListener extends SparkListener {
27+
private HashMap<Integer, Long> stageIdToWriteRecords = new HashMap<>();
28+
private HashMap<Integer, Long> stageIdToReadRecords = new HashMap<>();
29+
30+
@Override
31+
public void onTaskEnd(SparkListenerTaskEnd event) {
32+
int stageId = event.stageId();
33+
TaskMetrics taskMetrics = event.taskMetrics();
34+
if (taskMetrics != null) {
35+
long writeRecords = taskMetrics.shuffleWriteMetrics().recordsWritten();
36+
long readRecords = taskMetrics.shuffleReadMetrics().recordsRead();
37+
// Accumulate writeRecords and readRecords for the given stageId
38+
stageIdToWriteRecords.put(
39+
stageId, stageIdToWriteRecords.getOrDefault(stageId, 0L) + writeRecords);
40+
stageIdToReadRecords.put(
41+
stageId, stageIdToReadRecords.getOrDefault(stageId, 0L) + readRecords);
42+
}
43+
}
44+
45+
public long getWriteRecords(int stageId) {
46+
return stageIdToWriteRecords.getOrDefault(stageId, 0L);
47+
}
48+
49+
public long getReadRecords(int stageId) {
50+
return stageIdToReadRecords.getOrDefault(stageId, 0L);
51+
}
52+
}

0 commit comments

Comments
 (0)