Skip to content

Commit ea125f5

Browse files
authored
perf: Experimental fix to avoid join strategy regression (#1674)
1 parent e823163 commit ea125f5

File tree

3 files changed

+15
-2
lines changed

3 files changed

+15
-2
lines changed

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,16 @@ object CometConf extends ShimCometConf {
414414
.doubleConf
415415
.createWithDefault(10.0)
416416

417+
val COMET_EXCHANGE_SIZE_MULTIPLIER: ConfigEntry[Double] = conf(
418+
"spark.comet.shuffle.sizeInBytesMultiplier")
419+
.doc(
420+
"Comet reports smaller sizes for shuffle due to using Arrow's columnar memory format " +
421+
"and this can result in Spark choosing a different join strategy due to the estimated " +
422+
"size of the exchange being smaller. Comet will multiple sizeInBytes by this amount to " +
423+
"avoid regressions in join strategy.")
424+
.doubleConf
425+
.createWithDefault(1.0)
426+
417427
val COMET_DPP_FALLBACK_ENABLED: ConfigEntry[Boolean] =
418428
conf("spark.comet.dppFallback.enabled")
419429
.doc("Whether to fall back to Spark for queries that use DPP.")

docs/source/user-guide/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,5 @@ Comet provides the following configuration settings.
8888
| spark.comet.scan.preFetch.enabled | Whether to enable pre-fetching feature of CometScan. | false |
8989
| spark.comet.scan.preFetch.threadNum | The number of threads running pre-fetching for CometScan. Effective if spark.comet.scan.preFetch.enabled is enabled. Note that more pre-fetching threads means more memory requirement to store pre-fetched row groups. | 2 |
9090
| spark.comet.shuffle.preferDictionary.ratio | The ratio of total values to distinct values in a string column to decide whether to prefer dictionary encoding when shuffling the column. If the ratio is higher than this config, dictionary encoding will be used on shuffling string column. This config is effective if it is higher than 1.0. Note that this config is only used when `spark.comet.exec.shuffle.mode` is `jvm`. | 10.0 |
91+
| spark.comet.shuffle.sizeInBytesMultiplier | Comet reports smaller sizes for shuffle due to using Arrow's columnar memory format and this can result in Spark choosing a different join strategy due to the estimated size of the exchange being smaller. Comet will multiply sizeInBytes by this amount to avoid regressions in join strategy. | 1.0 |
9192
| spark.comet.sparkToColumnar.supportedOperatorList | A comma-separated list of operators that will be converted to Arrow columnar format when 'spark.comet.sparkToColumnar.enabled' is true | Range,InMemoryTableScan |

spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import org.apache.spark.util.random.XORShiftRandom
4646

4747
import com.google.common.base.Objects
4848

49+
import org.apache.comet.CometConf
4950
import org.apache.comet.shims.ShimCometShuffleExchangeExec
5051

5152
/**
@@ -113,9 +114,10 @@ case class CometShuffleExchangeExec(
113114
new CometShuffledBatchRDD(shuffleDependency, readMetrics, partitionSpecs)
114115

115116
override def runtimeStatistics: Statistics = {
116-
val dataSize = metrics("dataSize").value
117+
val dataSize =
118+
metrics("dataSize").value * Math.max(CometConf.COMET_EXCHANGE_SIZE_MULTIPLIER.get(conf), 1)
117119
val rowCount = metrics(SQLShuffleWriteMetricsReporter.SHUFFLE_RECORDS_WRITTEN).value
118-
Statistics(dataSize, Some(rowCount))
120+
Statistics(dataSize.toLong, Some(rowCount))
119121
}
120122

121123
// TODO: add `override` keyword after dropping Spark-3.x supports

0 commit comments

Comments
 (0)