
Commit 19797f3

Merge branch 'main' into iceberg-rust

# Conflicts:
#	spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala

2 parents: 40c9a07 + eeb1566

21 files changed: +91 −180 lines changed

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 9 additions & 50 deletions

```diff
@@ -279,41 +279,13 @@ object CometConf extends ShimCometConf {
     .booleanConf
     .createWithDefault(false)
 
-  val COMET_MEMORY_OVERHEAD: OptionalConfigEntry[Long] = conf("spark.comet.memoryOverhead")
+  val COMET_ONHEAP_MEMORY_OVERHEAD: ConfigEntry[Long] = conf("spark.comet.memoryOverhead")
     .category(CATEGORY_TESTING)
     .doc(
       "The amount of additional memory to be allocated per executor process for Comet, in MiB, " +
-        "when running Spark in on-heap mode. " +
-        "This config is optional. If this is not specified, it will be set to " +
-        s"`spark.comet.memory.overhead.factor` * `spark.executor.memory`. $TUNING_GUIDE.")
-    .internal()
+        "when running Spark in on-heap mode.")
     .bytesConf(ByteUnit.MiB)
-    .createOptional
-
-  val COMET_MEMORY_OVERHEAD_FACTOR: ConfigEntry[Double] =
-    conf("spark.comet.memory.overhead.factor")
-      .category(CATEGORY_TESTING)
-      .doc(
-        "Fraction of executor memory to be allocated as additional memory for Comet " +
-          "when running Spark in on-heap mode. " +
-          s"$TUNING_GUIDE.")
-      .internal()
-      .doubleConf
-      .checkValue(
-        factor => factor > 0,
-        "Ensure that Comet memory overhead factor is a double greater than 0")
-      .createWithDefault(0.2)
-
-  val COMET_MEMORY_OVERHEAD_MIN_MIB: ConfigEntry[Long] = conf("spark.comet.memory.overhead.min")
-    .category(CATEGORY_TESTING)
-    .doc("Minimum amount of additional memory to be allocated per executor process for Comet, " +
-      s"in MiB, when running Spark in on-heap mode. $TUNING_GUIDE.")
-    .internal()
-    .bytesConf(ByteUnit.MiB)
-    .checkValue(
-      _ >= 0,
-      "Ensure that Comet memory overhead min is a long greater than or equal to 0")
-    .createWithDefault(384)
+    .createWithDefault(1024)
 
   val COMET_EXEC_SHUFFLE_ENABLED: ConfigEntry[Boolean] =
     conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.enabled")
@@ -436,18 +408,8 @@ object CometConf extends ShimCometConf {
       .intConf
       .createWithDefault(Int.MaxValue)
 
-  val COMET_COLUMNAR_SHUFFLE_MEMORY_SIZE: OptionalConfigEntry[Long] =
-    conf("spark.comet.columnar.shuffle.memorySize")
-      .internal()
-      .category(CATEGORY_TESTING)
-      .doc("Amount of memory to reserve for columnar shuffle when running in on-heap mode. " +
-        s"$TUNING_GUIDE.")
-      .bytesConf(ByteUnit.MiB)
-      .createOptional
-
-  val COMET_COLUMNAR_SHUFFLE_MEMORY_FACTOR: ConfigEntry[Double] =
+  val COMET_ONHEAP_SHUFFLE_MEMORY_FACTOR: ConfigEntry[Double] =
     conf("spark.comet.columnar.shuffle.memory.factor")
-      .internal()
       .category(CATEGORY_TESTING)
       .doc("Fraction of Comet memory to be allocated per executor process for columnar shuffle " +
         s"when running in on-heap mode. $TUNING_GUIDE.")
@@ -534,7 +496,6 @@ object CometConf extends ShimCometConf {
       .category(CATEGORY_EXEC_EXPLAIN)
       .doc("When this setting is enabled, Comet will log all plan transformations performed " +
         "in physical optimizer rules. Default: false")
-      .internal()
       .booleanConf
       .createWithDefault(false)
 
@@ -569,15 +530,14 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
-  val COMET_ENABLE_ONHEAP_MODE: ConfigEntry[Boolean] =
+  val COMET_ONHEAP_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.exec.onHeap.enabled")
       .category(CATEGORY_TESTING)
       .doc("Whether to allow Comet to run in on-heap mode. Required for running Spark SQL tests.")
-      .internal()
       .booleanConf
       .createWithDefault(sys.env.getOrElse("ENABLE_COMET_ONHEAP", "false").toBoolean)
 
-  val COMET_EXEC_OFFHEAP_MEMORY_POOL_TYPE: ConfigEntry[String] =
+  val COMET_OFFHEAP_MEMORY_POOL_TYPE: ConfigEntry[String] =
     conf("spark.comet.exec.memoryPool")
       .category(CATEGORY_TUNING)
       .doc(
@@ -587,19 +547,18 @@ object CometConf extends ShimCometConf {
       .stringConf
       .createWithDefault("fair_unified")
 
-  val COMET_EXEC_ONHEAP_MEMORY_POOL_TYPE: ConfigEntry[String] = conf(
+  val COMET_ONHEAP_MEMORY_POOL_TYPE: ConfigEntry[String] = conf(
     "spark.comet.exec.onHeap.memoryPool")
-    .category(CATEGORY_TUNING)
+    .category(CATEGORY_TESTING)
     .doc(
       "The type of memory pool to be used for Comet native execution " +
        "when running Spark in on-heap mode. Available pool types are `greedy`, `fair_spill`, " +
        "`greedy_task_shared`, `fair_spill_task_shared`, `greedy_global`, `fair_spill_global`, " +
        "and `unbounded`.")
-    .internal()
    .stringConf
    .createWithDefault("greedy_task_shared")
 
-  val COMET_EXEC_MEMORY_POOL_FRACTION: ConfigEntry[Double] =
+  val COMET_OFFHEAP_MEMORY_POOL_FRACTION: ConfigEntry[Double] =
     conf("spark.comet.exec.memoryPool.fraction")
       .category(CATEGORY_TUNING)
       .doc(
```
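These hunks collapse three overlapping on-heap memory settings (`spark.comet.memoryOverhead`, `spark.comet.memory.overhead.factor`, and `spark.comet.memory.overhead.min`) into a single entry with a fixed 1024 MiB default. A minimal sketch of how a test session might set it, assuming the standard `CometPlugin` setup; the app name and override value are illustrative:

```scala
import org.apache.spark.sql.SparkSession

// Sketch: with the factor/min configs gone, on-heap overhead is driven
// solely by `spark.comet.memoryOverhead` (parsed in MiB, default 1024).
val spark = SparkSession
  .builder()
  .appName("comet-onheap-sketch") // hypothetical name
  .master("local[*]")
  .config("spark.plugins", "org.apache.spark.CometPlugin")
  .config("spark.comet.exec.onHeap.enabled", "true") // on-heap mode is intended for testing
  .config("spark.comet.memoryOverhead", "2048") // override the 1024 MiB default
  .getOrCreate()
```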

docs/source/user-guide/latest/configs.md

Lines changed: 13 additions & 0 deletions

```diff
@@ -84,6 +84,7 @@ These settings can be used to determine which parts of the plan are accelerated
 | Config | Description | Default Value |
 |--------|-------------|---------------|
 | `spark.comet.explain.native.enabled` | When this setting is enabled, Comet will provide a tree representation of the native query plan before execution and again after execution, with metrics. | false |
+| `spark.comet.explain.rules` | When this setting is enabled, Comet will log all plan transformations performed in physical optimizer rules. Default: false | false |
 | `spark.comet.explain.verbose.enabled` | When this setting is enabled, Comet's extended explain output will provide the full query plan annotated with fallback reasons as well as a summary of how much of the plan was accelerated by Comet. When this setting is disabled, a list of fallback reasons will be provided instead. | false |
 | `spark.comet.explainFallback.enabled` | When this setting is enabled, Comet will provide logging explaining the reason(s) why a query stage cannot be executed natively. Set this to false to reduce the amount of logging. | false |
 | `spark.comet.logFallbackReasons.enabled` | When this setting is enabled, Comet will log warnings for all fallback reasons. | false |
@@ -120,6 +121,18 @@ These settings can be used to determine which parts of the plan are accelerated
 | `spark.comet.tracing.enabled` | Enable fine-grained tracing of events and memory usage. For more information, refer to the [Comet Tracing Guide](https://datafusion.apache.org/comet/user-guide/tracing.html). | false |
 <!--END:CONFIG_TABLE-->
 
+## Development & Testing Settings
+
+<!-- WARNING! DO NOT MANUALLY MODIFY CONTENT BETWEEN THE BEGIN AND END TAGS -->
+<!--BEGIN:CONFIG_TABLE[testing]-->
+| Config | Description | Default Value |
+|--------|-------------|---------------|
+| `spark.comet.columnar.shuffle.memory.factor` | Fraction of Comet memory to be allocated per executor process for columnar shuffle when running in on-heap mode. For more information, refer to the [Comet Tuning Guide](https://datafusion.apache.org/comet/user-guide/tuning.html). | 1.0 |
+| `spark.comet.exec.onHeap.enabled` | Whether to allow Comet to run in on-heap mode. Required for running Spark SQL tests. | false |
+| `spark.comet.exec.onHeap.memoryPool` | The type of memory pool to be used for Comet native execution when running Spark in on-heap mode. Available pool types are `greedy`, `fair_spill`, `greedy_task_shared`, `fair_spill_task_shared`, `greedy_global`, `fair_spill_global`, and `unbounded`. | greedy_task_shared |
+| `spark.comet.memoryOverhead` | The amount of additional memory to be allocated per executor process for Comet, in MiB, when running Spark in on-heap mode. | 1024 MiB |
+<!--END:CONFIG_TABLE-->
+
 ## Enabling or Disabling Individual Operators
 
 <!-- WARNING! DO NOT MANUALLY MODIFY CONTENT BETWEEN THE BEGIN AND END TAGS -->
```

native/core/src/execution/jni_api.rs

Lines changed: 2 additions & 0 deletions

```diff
@@ -43,6 +43,7 @@ use datafusion_comet_proto::spark_operator::Operator;
 use datafusion_spark::function::bitwise::bit_get::SparkBitGet;
 use datafusion_spark::function::datetime::date_add::SparkDateAdd;
 use datafusion_spark::function::datetime::date_sub::SparkDateSub;
+use datafusion_spark::function::hash::sha1::SparkSha1;
 use datafusion_spark::function::hash::sha2::SparkSha2;
 use datafusion_spark::function::math::expm1::SparkExpm1;
 use datafusion_spark::function::string::char::CharFunc;
@@ -332,6 +333,7 @@ fn prepare_datafusion_session_context(
     session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitGet::default()));
     session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateAdd::default()));
     session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateSub::default()));
+    session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSha1::default()));
 
     // Must be the last one to override existing functions with the same name
     datafusion_comet_spark_expr::register_all_comet_functions(&mut session_ctx)?;
```
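With `SparkSha1` registered in the native session context (and mapped in `QueryPlanSerde` below), a plain `sha1` call in Spark SQL can execute natively. A hedged usage sketch, reusing the `spark` session from the earlier example:

```scala
// Returns the 40-character lowercase hex SHA-1 digest, matching Spark's
// built-in behavior; if Comet cannot convert the expression, it simply
// falls back to Spark's implementation.
val digest = spark.sql("SELECT sha1('Apache Comet') AS digest")
digest.show(truncate = false)
```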

spark/src/main/scala/org/apache/comet/CometExecIterator.scala

Lines changed: 3 additions & 3 deletions

```diff
@@ -273,10 +273,10 @@ object CometExecIterator extends Logging {
     if (offHeapMode) {
       // in off-heap mode, Comet uses unified memory management to share off-heap memory with Spark
       val offHeapSize = ByteUnit.MiB.toBytes(conf.getSizeAsMb("spark.memory.offHeap.size"))
-      val memoryFraction = CometConf.COMET_EXEC_MEMORY_POOL_FRACTION.get()
+      val memoryFraction = CometConf.COMET_OFFHEAP_MEMORY_POOL_FRACTION.get()
       val memoryLimit = (offHeapSize * memoryFraction).toLong
       val memoryLimitPerTask = (memoryLimit.toDouble * coresPerTask / numCores).toLong
-      val memoryPoolType = COMET_EXEC_OFFHEAP_MEMORY_POOL_TYPE.get()
+      val memoryPoolType = COMET_OFFHEAP_MEMORY_POOL_TYPE.get()
       logInfo(
         s"memoryPoolType=$memoryPoolType, " +
           s"offHeapSize=${toMB(offHeapSize)}, " +
@@ -291,7 +291,7 @@ object CometExecIterator extends Logging {
       // example 16GB maxMemory * 16 cores with 4 cores per task results
       // in memory_limit_per_task = 16 GB * 4 / 16 = 16 GB / 4 = 4GB
       val memoryLimitPerTask = (memoryLimit.toDouble * coresPerTask / numCores).toLong
-      val memoryPoolType = COMET_EXEC_ONHEAP_MEMORY_POOL_TYPE.get()
+      val memoryPoolType = COMET_ONHEAP_MEMORY_POOL_TYPE.get()
       logInfo(
         s"memoryPoolType=$memoryPoolType, " +
           s"memoryLimit=${toMB(memoryLimit)}, " +
```

spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala

Lines changed: 8 additions & 31 deletions

```diff
@@ -243,46 +243,25 @@ object CometSparkSessionExtensions extends Logging {
   }
 
   /**
-   * Calculates required memory overhead in MB per executor process for Comet when running in
+   * Determines required memory overhead in MB per executor process for Comet when running in
    * on-heap mode.
-   *
-   * If `COMET_MEMORY_OVERHEAD` is defined then that value will be used, otherwise the overhead
-   * will be calculated by multiplying executor memory (`spark.executor.memory`) by
-   * `COMET_MEMORY_OVERHEAD_FACTOR`.
-   *
-   * In either case, a minimum value of `COMET_MEMORY_OVERHEAD_MIN_MIB` will be returned.
    */
   def getCometMemoryOverheadInMiB(sparkConf: SparkConf): Long = {
     if (isOffHeapEnabled(sparkConf)) {
       // when running in off-heap mode we use unified memory management to share
       // off-heap memory with Spark so do not add overhead
       return 0
     }
-
-    // `spark.executor.memory` default value is 1g
-    val baseMemoryMiB = ConfigHelpers
-      .byteFromString(sparkConf.get("spark.executor.memory", "1024MB"), ByteUnit.MiB)
-
-    val cometMemoryOverheadMinAsString = sparkConf.get(
-      COMET_MEMORY_OVERHEAD_MIN_MIB.key,
-      COMET_MEMORY_OVERHEAD_MIN_MIB.defaultValueString)
-
-    val minimum = ConfigHelpers.byteFromString(cometMemoryOverheadMinAsString, ByteUnit.MiB)
-    val overheadFactor = getDoubleConf(sparkConf, COMET_MEMORY_OVERHEAD_FACTOR)
-
-    val overHeadMemFromConf = sparkConf
-      .getOption(COMET_MEMORY_OVERHEAD.key)
-      .map(ConfigHelpers.byteFromString(_, ByteUnit.MiB))
-
-    overHeadMemFromConf.getOrElse(math.max((overheadFactor * baseMemoryMiB).toLong, minimum))
+    ConfigHelpers.byteFromString(
+      sparkConf.get(
+        COMET_ONHEAP_MEMORY_OVERHEAD.key,
+        COMET_ONHEAP_MEMORY_OVERHEAD.defaultValueString),
+      ByteUnit.MiB)
   }
 
   private def getBooleanConf(conf: SparkConf, entry: ConfigEntry[Boolean]) =
     conf.getBoolean(entry.key, entry.defaultValue.get)
 
-  private def getDoubleConf(conf: SparkConf, entry: ConfigEntry[Double]) =
-    conf.getDouble(entry.key, entry.defaultValue.get)
-
   /**
    * Calculates required memory overhead in bytes per executor process for Comet when running in
    * on-heap mode.
@@ -300,11 +279,9 @@ object CometSparkSessionExtensions extends Logging {
 
     val cometMemoryOverhead = getCometMemoryOverheadInMiB(sparkConf)
 
-    val overheadFactor = COMET_COLUMNAR_SHUFFLE_MEMORY_FACTOR.get(conf)
-    val cometShuffleMemoryFromConf = COMET_COLUMNAR_SHUFFLE_MEMORY_SIZE.get(conf)
+    val overheadFactor = COMET_ONHEAP_SHUFFLE_MEMORY_FACTOR.get(conf)
 
-    val shuffleMemorySize =
-      cometShuffleMemoryFromConf.getOrElse((overheadFactor * cometMemoryOverhead).toLong)
+    val shuffleMemorySize = (overheadFactor * cometMemoryOverhead).toLong
     if (shuffleMemorySize > cometMemoryOverhead) {
       logWarning(
         s"Configured shuffle memory size $shuffleMemorySize is larger than Comet memory overhead " +
```

spark/src/main/scala/org/apache/comet/GenerateDocs.scala

Lines changed: 8 additions & 1 deletion

```diff
@@ -25,6 +25,7 @@ import scala.collection.mutable.ListBuffer
 
 import org.apache.spark.sql.catalyst.expressions.Cast
 
+import org.apache.comet.CometConf.COMET_ONHEAP_MEMORY_OVERHEAD
 import org.apache.comet.expressions.{CometCast, CometEvalMode}
 import org.apache.comet.serde.{Compatible, Incompatible, QueryPlanSerde}
 
@@ -78,7 +79,13 @@ object GenerateDocs {
       if (conf.defaultValue.isEmpty) {
         w.write(s"| `${conf.key}` | $doc | |\n".getBytes)
       } else {
-        w.write(s"| `${conf.key}` | $doc | ${conf.defaultValueString} |\n".getBytes)
+        val isBytesConf = conf.key == COMET_ONHEAP_MEMORY_OVERHEAD.key
+        if (isBytesConf) {
+          val bytes = conf.defaultValue.get.asInstanceOf[Long]
+          w.write(s"| `${conf.key}` | $doc | $bytes MiB |\n".getBytes)
+        } else {
+          w.write(s"| `${conf.key}` | $doc | ${conf.defaultValueString} |\n".getBytes)
+        }
       }
     }
   }
```

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 2 additions & 1 deletion

```diff
@@ -157,7 +157,8 @@ object QueryPlanSerde extends Logging with CometExprShim {
     classOf[Md5] -> CometScalarFunction("md5"),
     classOf[Murmur3Hash] -> CometMurmur3Hash,
     classOf[Sha2] -> CometSha2,
-    classOf[XxHash64] -> CometXxHash64)
+    classOf[XxHash64] -> CometXxHash64,
+    classOf[Sha1] -> CometSha1)
 
   private val stringExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
     classOf[Ascii] -> CometScalarFunction("ascii"),
```

spark/src/main/scala/org/apache/comet/serde/hash.scala

Lines changed: 15 additions & 1 deletion

```diff
@@ -19,7 +19,7 @@
 
 package org.apache.comet.serde
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Murmur3Hash, Sha2, XxHash64}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Murmur3Hash, Sha1, Sha2, XxHash64}
 import org.apache.spark.sql.types.{DecimalType, IntegerType, LongType, StringType}
 
 import org.apache.comet.CometSparkSessionExtensions.withInfo
@@ -89,6 +89,20 @@ object CometSha2 extends CometExpressionSerde[Sha2] {
   }
 }
 
+object CometSha1 extends CometExpressionSerde[Sha1] {
+  override def convert(
+      expr: Sha1,
+      inputs: Seq[Attribute],
+      binding: Boolean): Option[ExprOuterClass.Expr] = {
+    if (!HashUtils.isSupportedType(expr)) {
+      withInfo(expr, s"HashUtils doesn't support dataType: ${expr.child.dataType}")
+      return None
+    }
+    val childExpr = exprToProtoInternal(expr.child, inputs, binding)
+    scalarFunctionExprToProtoWithReturnType("sha1", StringType, false, childExpr)
+  }
+}
+
 private object HashUtils {
   def isSupportedType(expr: Expression): Boolean = {
     for (child <- expr.children) {
```
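`CometSha1` only converts when `HashUtils.isSupportedType` accepts the child type; otherwise it records a fallback reason via `withInfo` and returns `None`, leaving Spark to evaluate the expression. A short DataFrame-level sketch of the supported path, assuming a Comet-enabled `spark` session:

```scala
import org.apache.spark.sql.functions.{col, sha1}

// sha1 over a string column is eligible for native conversion; an
// unsupported child type would make convert() return None and fall
// back to Spark (reason visible with spark.comet.explainFallback.enabled).
val strings = spark.range(3).selectExpr("CAST(id AS STRING) AS s")
strings.select(sha1(col("s")).as("digest")).show(truncate = false)
```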

spark/src/main/scala/org/apache/comet/testing/FuzzDataGenerator.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -117,7 +117,7 @@ object FuzzDataGenerator {
     // generate schema using random data types
     val fields = dataTypes.zipWithIndex
       .map(i => StructField(s"c${i._2}", i._1, nullable = true))
-    val schema = StructType(fields)
+    val schema = StructType(fields.toSeq)
 
     // generate columnar data
     val cols: Seq[Seq[Any]] =
@@ -147,7 +147,7 @@ object FuzzDataGenerator {
             list += Range(0, r.nextInt(5)).map(j => values((i + j) % values.length)).toArray
           }
         }
-        list
+        list.toSeq
       case StructType(fields) =>
         val values = fields.map(f => generateColumn(r, f.dataType, numRows, options))
         Range(0, numRows).map(i => Row(values.indices.map(j => values(j)(i)): _*))
```
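The two `.toSeq` calls are cross-build fixes rather than behavior changes: under Scala 2.13, `scala.Seq` is an alias for `immutable.Seq`, so a mutable buffer no longer satisfies a `Seq` result type implicitly. A minimal illustration of the 2.13 rule:

```scala
import scala.collection.mutable.ListBuffer

// Compiles on Scala 2.12 without .toSeq, but on 2.13 scala.Seq means
// scala.collection.immutable.Seq, so the buffer must be converted.
def build(): Seq[Int] = {
  val buf = ListBuffer(1, 2, 3)
  buf.toSeq // omitting .toSeq is a type error on 2.13
}
```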

spark/src/main/scala/org/apache/spark/Plugins.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.{EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD, EXECUTOR_MEMORY_OVERHEAD_FACTOR}
 import org.apache.spark.sql.internal.StaticSQLConf
 
-import org.apache.comet.CometConf.COMET_ENABLE_ONHEAP_MODE
+import org.apache.comet.CometConf.COMET_ONHEAP_ENABLED
 import org.apache.comet.CometSparkSessionExtensions
 
 /**
@@ -49,7 +49,7 @@ class CometDriverPlugin extends DriverPlugin with Logging with ShimCometDriverPl
     logInfo("CometDriverPlugin init")
 
     if (!CometSparkSessionExtensions.isOffHeapEnabled(sc.getConf) &&
-      !sc.getConf.getBoolean(COMET_ENABLE_ONHEAP_MODE.key, false)) {
+      !sc.getConf.getBoolean(COMET_ONHEAP_ENABLED.key, false)) {
       logWarning("Comet plugin is disabled because Spark is not running in off-heap mode.")
       return Collections.emptyMap[String, String]
     }
```
