
Commit e73fff0

feat: monotonically_increasing_id and spark_partition_id implementation (#2037)
1 parent 1c111d5 commit e73fff0

File tree: 9 files changed, +305 -28 lines changed
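For context before the per-file diffs: this commit adds native Comet execution for Spark's `spark_partition_id()` and `monotonically_increasing_id()`. A minimal usage sketch in Scala (assuming a `SparkSession` named `spark` is in scope):

```scala
import org.apache.spark.sql.functions.{monotonically_increasing_id, spark_partition_id}

// Eight rows in two partitions: each row gets its partition number and a
// 64-bit ID that is unique across the DataFrame and increasing per partition.
val df = spark.range(0, 8, 1, 2)
  .withColumn("pid", spark_partition_id())
  .withColumn("id", monotonically_increasing_id())
df.show()
```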

dev/diffs/4.0.0.diff

Lines changed: 13 additions & 3 deletions
@@ -1,5 +1,5 @@
 diff --git a/pom.xml b/pom.xml
-index 443d46a4302..3b8483173f1 100644
+index a4b1b2c3c9f..63ec4784625 100644
 --- a/pom.xml
 +++ b/pom.xml
 @@ -148,6 +148,8 @@
@@ -1732,14 +1732,14 @@ index aed11badb71..ab7e9456e26 100644
     spark.range(1).foreach { _ =>
       columnarToRowExec.canonicalized
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
-index a3cfdc5a240..f4afc393ba0 100644
+index a3cfdc5a240..1b08a1f42ee 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
 @@ -22,6 +22,7 @@ import org.apache.spark.rdd.MapPartitionsWithEvaluatorRDD
  import org.apache.spark.sql.{Dataset, QueryTest, Row, SaveMode}
  import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode
  import org.apache.spark.sql.catalyst.expressions.codegen.{ByteCodeStats, CodeAndComment, CodeGenerator}
-+import org.apache.spark.sql.comet.{CometHashJoinExec, CometSortExec, CometSortMergeJoinExec}
++import org.apache.spark.sql.comet.{CometColumnarToRowExec, CometHashJoinExec, CometSortExec, CometSortMergeJoinExec}
  import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite
  import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec}
  import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
@@ -1952,6 +1952,16 @@ index a3cfdc5a240..f4afc393ba0 100644
     val projection = Seq.tabulate(columnNum)(i => s"c$i + c$i as newC$i")
     val df = spark.read.parquet(path).selectExpr(projection: _*)

+@@ -815,6 +852,9 @@ class WholeStageCodegenSuite extends QueryTest with SharedSparkSession
+     assert(distinctWithId.queryExecution.executedPlan.exists {
+       case WholeStageCodegenExec(
+             ProjectExec(_, BroadcastHashJoinExec(_, _, _, _, _, _: HashAggregateExec, _, _))) => true
++      case WholeStageCodegenExec(
++            ProjectExec(_, BroadcastHashJoinExec(_, _, _, _, _, _: CometColumnarToRowExec, _, _))) =>
++        true
+       case _ => false
+     })
+     checkAnswer(distinctWithId, Seq(Row(1, 0), Row(1, 0)))
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
 index 272be70f9fe..06957694002 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala

docs/spark_expressions_support.md

Lines changed: 2 additions & 2 deletions
@@ -349,11 +349,11 @@
 - [ ] input_file_block_length
 - [ ] input_file_block_start
 - [ ] input_file_name
-- [ ] monotonically_increasing_id
+- [x] monotonically_increasing_id
 - [ ] raise_error
 - [x] rand
 - [x] randn
-- [ ] spark_partition_id
+- [x] spark_partition_id
 - [ ] typeof
 - [x] user
 - [ ] uuid

native/core/src/execution/planner.rs

Lines changed: 7 additions & 0 deletions
@@ -100,6 +100,7 @@ use datafusion_comet_proto::{
     },
     spark_partitioning::{partitioning::PartitioningStruct, Partitioning as SparkPartitioning},
 };
+use datafusion_comet_spark_expr::monotonically_increasing_id::MonotonicallyIncreasingId;
 use datafusion_comet_spark_expr::{
     ArrayInsert, Avg, AvgDecimal, Cast, CheckOverflow, Correlation, Covariance, CreateNamedStruct,
     GetArrayStructFields, GetStructField, IfExpr, ListExtract, NormalizeNaNAndZero, RLike,
@@ -831,6 +832,12 @@ impl PhysicalPlanner {
                 let seed = expr.seed.wrapping_add(self.partition.into());
                 Ok(Arc::new(RandnExpr::new(seed)))
             }
+            ExprStruct::SparkPartitionId(_) => Ok(Arc::new(DataFusionLiteral::new(
+                ScalarValue::Int32(Some(self.partition)),
+            ))),
+            ExprStruct::MonotonicallyIncreasingId(_) => Ok(Arc::new(
+                MonotonicallyIncreasingId::from_partition_id(self.partition),
+            )),
             expr => Err(GeneralError(format!("Not implemented: {expr:?}"))),
         }
     }
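Both new arms rely on the planner being instantiated per partition: `SparkPartitionId` collapses into a plain `Int32` literal holding `self.partition`, while `MonotonicallyIncreasingId` seeds its counter at `partition << 33`. A short Scala sketch of that offset arithmetic (the helper name is illustrative, not part of the codebase):

```scala
// The partition id occupies the upper 31 bits; the lower 33 bits are left
// for the per-partition row counter, matching Spark's documented ID layout.
def initialOffset(partition: Int): Long = partition.toLong << 33

initialOffset(0) // 0L
initialOffset(1) // 8589934592L
initialOffset(2) // 17179869184L
```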

native/proto/src/proto/expr.proto

Lines changed: 5 additions & 0 deletions
@@ -82,6 +82,8 @@ message Expr {
     ToPrettyString to_pretty_string = 60;
     Rand rand = 61;
     Rand randn = 62;
+    EmptyExpr spark_partition_id = 63;
+    EmptyExpr monotonically_increasing_id = 64;
   }
 }

@@ -250,6 +252,9 @@ message UnaryExpr {
   Expr child = 1;
 }

+message EmptyExpr {
+}
+
 // Bound to a particular vector array in input batch.
 message BoundReference {
   int32 index = 1;

native/spark-expr/src/nondetermenistic_funcs/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
 // under the License.

 pub mod internal;
+pub mod monotonically_increasing_id;
 pub mod rand;
 pub mod randn;
native/spark-expr/src/nondetermenistic_funcs/monotonically_increasing_id.rs (new file)

Lines changed: 162 additions & 0 deletions

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Int64Array, RecordBatch};
use arrow::datatypes::{DataType, Schema};
use datafusion::common::Result;
use datafusion::logical_expr::ColumnarValue;
use datafusion::physical_expr::PhysicalExpr;
use std::any::Any;
use std::fmt::{Debug, Display, Formatter};
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;

#[derive(Debug)]
pub struct MonotonicallyIncreasingId {
    initial_offset: i64,
    current_offset: AtomicI64,
}

impl MonotonicallyIncreasingId {
    pub fn from_offset(offset: i64) -> Self {
        Self {
            initial_offset: offset,
            current_offset: AtomicI64::new(offset),
        }
    }

    pub fn from_partition_id(partition: i32) -> Self {
        Self::from_offset((partition as i64) << 33)
    }
}

impl Display for MonotonicallyIncreasingId {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "monotonically_increasing_id()")
    }
}

impl PartialEq for MonotonicallyIncreasingId {
    fn eq(&self, other: &Self) -> bool {
        self.initial_offset == other.initial_offset
    }
}

impl Eq for MonotonicallyIncreasingId {}

impl Hash for MonotonicallyIncreasingId {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.initial_offset.hash(state);
    }
}

impl PhysicalExpr for MonotonicallyIncreasingId {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
        let start = self
            .current_offset
            .fetch_add(batch.num_rows() as i64, Ordering::Relaxed);
        let end = start + batch.num_rows() as i64;
        let array_ref = Arc::new(Int64Array::from_iter_values(start..end));
        Ok(ColumnarValue::Array(array_ref))
    }

    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
        vec![]
    }

    fn with_new_children(
        self: Arc<Self>,
        _: Vec<Arc<dyn PhysicalExpr>>,
    ) -> Result<Arc<dyn PhysicalExpr>> {
        Ok(self)
    }

    fn fmt_sql(&self, _: &mut Formatter<'_>) -> std::fmt::Result {
        unimplemented!()
    }

    fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
        Ok(DataType::Int64)
    }

    fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
        Ok(false)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use arrow::array::{Array, Int64Array};
    use arrow::compute::concat;
    use arrow::{array::StringArray, datatypes::*};
    use datafusion::common::cast::as_int64_array;

    #[test]
    fn test_monotonically_increasing_id_single_batch() -> Result<()> {
        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
        let data = StringArray::from(vec![Some("foo"), None, None, Some("bar"), None]);
        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)])?;
        let mid_expr = MonotonicallyIncreasingId::from_offset(0);
        let result = mid_expr.evaluate(&batch)?.into_array(batch.num_rows())?;
        let result = as_int64_array(&result)?;
        let expected = &Int64Array::from_iter_values(0..batch.num_rows() as i64);
        assert_eq!(result, expected);
        Ok(())
    }

    #[test]
    fn test_monotonically_increasing_id_multi_batch() -> Result<()> {
        let first_batch_schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
        let first_batch_data = Int64Array::from(vec![Some(42), None]);
        let second_batch_schema = first_batch_schema.clone();
        let second_batch_data = Int64Array::from(vec![None, Some(-42), None]);
        let starting_offset: i64 = 100;
        let mid_expr = MonotonicallyIncreasingId::from_offset(starting_offset);
        let first_batch = RecordBatch::try_new(
            Arc::new(first_batch_schema),
            vec![Arc::new(first_batch_data)],
        )?;
        let first_batch_result = mid_expr
            .evaluate(&first_batch)?
            .into_array(first_batch.num_rows())?;
        let second_batch = RecordBatch::try_new(
            Arc::new(second_batch_schema),
            vec![Arc::new(second_batch_data)],
        )?;
        let second_batch_result = mid_expr
            .evaluate(&second_batch)?
            .into_array(second_batch.num_rows())?;
        let result_arrays: Vec<&dyn Array> = vec![
            as_int64_array(&first_batch_result)?,
            as_int64_array(&second_batch_result)?,
        ];
        let result_arrays = &concat(&result_arrays)?;
        let final_result = as_int64_array(result_arrays)?;
        let range_start = starting_offset;
        let range_end =
            starting_offset + first_batch.num_rows() as i64 + second_batch.num_rows() as i64;
        let expected = &Int64Array::from_iter_values(range_start..range_end);
        assert_eq!(final_result, expected);
        Ok(())
    }
}
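The `fetch_add` in `evaluate` reserves a contiguous block of IDs for each batch, so successive batches within a partition continue exactly where the previous one stopped, which is the property the multi-batch test asserts. Observed from the DataFrame side, the values come out like this (a sketch; the row-to-partition split assumes `spark.range`'s even distribution):

```scala
import org.apache.spark.sql.functions.monotonically_increasing_id

// Six rows over two partitions: partition 0 emits 0, 1, 2 and partition 1
// emits 8589934592, 8589934593, 8589934594 (that is, (1L << 33) + 0, 1, 2).
val ids = spark.range(0, 6, 1, 2)
  .select(monotonically_increasing_id().as("id"))
ids.show()
```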

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 5 additions & 23 deletions
@@ -125,7 +125,11 @@ object QueryPlanSerde extends Logging with CometExprShim {
     classOf[MapKeys] -> CometMapKeys,
     classOf[MapValues] -> CometMapValues,
     classOf[MapFromArrays] -> CometMapFromArrays,
-    classOf[GetMapValue] -> CometMapExtract)
+    classOf[GetMapValue] -> CometMapExtract,
+    classOf[Rand] -> CometRand,
+    classOf[Randn] -> CometRandn,
+    classOf[SparkPartitionID] -> CometSparkPartitionId,
+    classOf[MonotonicallyIncreasingID] -> CometMonotonicallyIncreasingId)

   def emitWarning(reason: String): Unit = {
     logWarning(s"Comet native execution is disabled due to: $reason")
@@ -1729,28 +1733,6 @@ object QueryPlanSerde extends Logging with CometExprShim {
         convert(CometArrayCompact)
       case _: ArrayExcept =>
         convert(CometArrayExcept)
-      case Rand(child, _) =>
-        val seed = child match {
-          case Literal(seed: Long, _) => Some(seed)
-          case Literal(null, _) => Some(0L)
-          case _ => None
-        }
-        seed.map(seed =>
-          ExprOuterClass.Expr
-            .newBuilder()
-            .setRand(ExprOuterClass.Rand.newBuilder().setSeed(seed))
-            .build())
-      case Randn(child, _) =>
-        val seed = child match {
-          case Literal(seed: Long, _) => Some(seed)
-          case Literal(null, _) => Some(0L)
-          case _ => None
-        }
-        seed.map(seed =>
-          ExprOuterClass.Expr
-            .newBuilder()
-            .setRandn(ExprOuterClass.Rand.newBuilder().setSeed(seed))
-            .build())
       case expr =>
         QueryPlanSerde.exprSerdeMap.get(expr.getClass) match {
           case Some(handler) => convert(handler)
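The deleted `case Rand(...)` and `case Randn(...)` arms are replaced by entries in `exprSerdeMap`, so all four expressions now flow through the class-keyed fallback arm shown above. A self-contained sketch of that dispatch pattern (all names below are simplified stand-ins, not Comet's real API):

```scala
// Hypothetical model of the exprSerdeMap lookup: handlers are registered per
// expression class; classes without an entry simply fall back to Spark.
sealed trait Expr
final case class Rand(seed: Long) extends Expr
final case class PartitionId() extends Expr

trait Serde { def convert(e: Expr): Option[String] }
object RandSerde extends Serde { def convert(e: Expr) = Some("rand") }
object PartitionIdSerde extends Serde { def convert(e: Expr) = Some("spark_partition_id") }

val serdeMap: Map[Class[_ <: Expr], Serde] =
  Map(classOf[Rand] -> RandSerde, classOf[PartitionId] -> PartitionIdSerde)

// Mirrors `QueryPlanSerde.exprSerdeMap.get(expr.getClass)`.
def toNative(e: Expr): Option[String] = serdeMap.get(e.getClass).flatMap(_.convert(e))

toNative(Rand(42L))     // Some("rand")
toNative(PartitionId()) // Some("spark_partition_id")
```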
Lines changed: 90 additions & 0 deletions
(new file, package org.apache.comet.serde)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet.serde

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal, MonotonicallyIncreasingID, Rand, Randn, SparkPartitionID}

object CometSparkPartitionId extends CometExpressionSerde {
  override def convert(
      expr: Expression,
      _inputs: Seq[Attribute],
      _binding: Boolean): Option[ExprOuterClass.Expr] = {
    assert(expr.isInstanceOf[SparkPartitionID])
    Some(
      ExprOuterClass.Expr
        .newBuilder()
        .setSparkPartitionId(ExprOuterClass.EmptyExpr.newBuilder())
        .build())
  }
}

object CometMonotonicallyIncreasingId extends CometExpressionSerde {
  override def convert(
      expr: Expression,
      _inputs: Seq[Attribute],
      _binding: Boolean): Option[ExprOuterClass.Expr] = {
    assert(expr.isInstanceOf[MonotonicallyIncreasingID])
    Some(
      ExprOuterClass.Expr
        .newBuilder()
        .setMonotonicallyIncreasingId(ExprOuterClass.EmptyExpr.newBuilder())
        .build())
  }
}

sealed abstract class CometRandCommonSerde extends CometExpressionSerde {
  protected def extractSeedFromExpr(expr: Expression): Option[Long] = {
    expr match {
      case Literal(seed: Long, _) => Some(seed)
      case Literal(null, _) => Some(0L)
      case _ => None
    }
  }
}

object CometRand extends CometRandCommonSerde {
  override def convert(
      expr: Expression,
      inputs: Seq[Attribute],
      binding: Boolean): Option[ExprOuterClass.Expr] = {
    val Rand(child, _) = expr
    extractSeedFromExpr(child).map { seed =>
      ExprOuterClass.Expr
        .newBuilder()
        .setRand(ExprOuterClass.Rand.newBuilder().setSeed(seed))
        .build()
    }
  }
}

object CometRandn extends CometRandCommonSerde {
  override def convert(
      expr: Expression,
      inputs: Seq[Attribute],
      binding: Boolean): Option[ExprOuterClass.Expr] = {
    val Randn(child, _) = expr
    extractSeedFromExpr(child).map { seed =>
      ExprOuterClass.Expr
        .newBuilder()
        .setRandn(ExprOuterClass.Rand.newBuilder().setSeed(seed))
        .build()
    }
  }
}
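`extractSeedFromExpr` accepts only literal seeds (a null literal is treated as seed 0); any other child makes `convert` return `None`, leaving the expression to run in Spark rather than natively. In practice the common call sites qualify, because Spark bakes the seed into a literal at analysis time:

```scala
import org.apache.spark.sql.functions.{rand, randn}

spark.range(10).select(rand(42)) // explicit literal seed: convertible
spark.range(10).select(randn())  // Spark picks a random literal seed: convertible
```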
