Skip to content

Commit 65b2390

Browse files
Merge remote-tracking branch 'apache/main' into regexp-extract-impl
1 parent 4b599d9 commit 65b2390

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+2430
-456
lines changed

.github/workflows/pr_build_macos.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ jobs:
131131
- name: "sql"
132132
value: |
133133
org.apache.spark.sql.CometToPrettyStringSuite
134+
134135
fail-fast: false
135136
name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
136137
runs-on: ${{ matrix.os }}
@@ -143,6 +144,14 @@ jobs:
143144
jdk-version: ${{ matrix.profile.java_version }}
144145
jdk-architecture: aarch64
145146
protoc-architecture: aarch_64
147+
- name: Set thread thresholds envs for spark test on macOS
148+
# see: https://github.com/apache/datafusion-comet/issues/2965
149+
shell: bash
150+
run: |
151+
echo "SPARK_TEST_SQL_SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD=256" >> $GITHUB_ENV
152+
echo "SPARK_TEST_SQL_RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD=256" >> $GITHUB_ENV
153+
echo "SPARK_TEST_HIVE_SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD=48" >> $GITHUB_ENV
154+
echo "SPARK_TEST_HIVE_RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD=48" >> $GITHUB_ENV
146155
- name: Java test steps
147156
uses: ./.github/actions/java-test
148157
with:

docs/source/user-guide/latest/compatibility.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,11 @@ Comet has the following limitations when reading Parquet files:
3232

3333
## ANSI Mode
3434

35-
Comet will fall back to Spark for the following expressions when ANSI mode is enabled. Thes expressions can be enabled by setting
35+
Comet will fall back to Spark for the following expressions when ANSI mode is enabled. These expressions can be enabled by setting
3636
`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
3737
the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
3838

3939
- Average
40-
- Sum
4140
- Cast (in some cases)
4241

4342
There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support.
@@ -159,6 +158,8 @@ The following cast operations are generally compatible with Spark except for the
159158
| string | short | |
160159
| string | integer | |
161160
| string | long | |
161+
| string | float | |
162+
| string | double | |
162163
| string | binary | |
163164
| string | date | Only supports years between 262143 BC and 262142 AD |
164165
| binary | string | |
@@ -181,9 +182,8 @@ The following cast operations are not compatible with Spark for all inputs and a
181182
|-|-|-|
182183
| float | decimal | There can be rounding differences |
183184
| double | decimal | There can be rounding differences |
184-
| string | float | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
185-
| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
186-
| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
185+
| string | decimal | Does not support fullwidth unicode digits (e.g. \\uFF10)
186+
or strings containing null bytes (e.g. \\u0000) |
187187
| string | timestamp | Not all valid formats are supported |
188188
<!-- prettier-ignore-end -->
189189
<!--END:INCOMPAT_CAST_TABLE-->

docs/source/user-guide/latest/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ These settings can be used to determine which parts of the plan are accelerated
309309
| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true |
310310
| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true |
311311
| `spark.comet.expression.Sinh.enabled` | Enable Comet acceleration for `Sinh` | true |
312+
| `spark.comet.expression.Size.enabled` | Enable Comet acceleration for `Size` | true |
312313
| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true |
313314
| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true |
314315
| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true |

native/Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

native/core/src/execution/planner.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ use datafusion::{
7171
use datafusion_comet_spark_expr::{
7272
create_comet_physical_fun, create_comet_physical_fun_with_eval_mode, BinaryOutputStyle,
7373
BloomFilterAgg, BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
74+
SumInteger,
7475
};
7576
use iceberg::expr::Bind;
7677

@@ -1813,6 +1814,12 @@ impl PhysicalPlanner {
18131814
AggregateUDF::new_from_impl(SumDecimal::try_new(datatype, eval_mode)?);
18141815
AggregateExprBuilder::new(Arc::new(func), vec![child])
18151816
}
1817+
DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
1818+
let eval_mode = from_protobuf_eval_mode(expr.eval_mode)?;
1819+
let func =
1820+
AggregateUDF::new_from_impl(SumInteger::try_new(datatype, eval_mode)?);
1821+
AggregateExprBuilder::new(Arc::new(func), vec![child])
1822+
}
18161823
_ => {
18171824
// cast to the result data type of SUM if necessary, we should not expect
18181825
// a cast failure since it should have already been checked at Spark side

native/spark-expr/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ harness = false
7676
name = "bloom_filter_agg"
7777
harness = false
7878

79+
[[bench]]
80+
name = "padding"
81+
harness = false
82+
7983
[[test]]
8084
name = "test_udf_registration"
8185
path = "tests/spark_expr_reg.rs"
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::builder::StringBuilder;
19+
use arrow::array::ArrayRef;
20+
use criterion::{criterion_group, criterion_main, Criterion};
21+
use datafusion::common::ScalarValue;
22+
use datafusion::physical_plan::ColumnarValue;
23+
use datafusion_comet_spark_expr::{spark_lpad, spark_rpad};
24+
use std::hint::black_box;
25+
use std::sync::Arc;
26+
27+
fn create_string_array(size: usize) -> ArrayRef {
28+
let mut builder = StringBuilder::new();
29+
for i in 0..size {
30+
if i % 10 == 0 {
31+
builder.append_null();
32+
} else {
33+
builder.append_value(format!("string{}", i % 100));
34+
}
35+
}
36+
Arc::new(builder.finish())
37+
}
38+
39+
fn criterion_benchmark(c: &mut Criterion) {
40+
let size = 8192;
41+
let string_array = create_string_array(size);
42+
43+
// lpad with default padding (space)
44+
c.bench_function("spark_lpad: default padding", |b| {
45+
let args = vec![
46+
ColumnarValue::Array(Arc::clone(&string_array)),
47+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
48+
];
49+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
50+
});
51+
52+
// lpad with custom padding character
53+
c.bench_function("spark_lpad: custom padding", |b| {
54+
let args = vec![
55+
ColumnarValue::Array(Arc::clone(&string_array)),
56+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
57+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("*".to_string()))),
58+
];
59+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
60+
});
61+
62+
// rpad with default padding (space)
63+
c.bench_function("spark_rpad: default padding", |b| {
64+
let args = vec![
65+
ColumnarValue::Array(Arc::clone(&string_array)),
66+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
67+
];
68+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
69+
});
70+
71+
// rpad with custom padding character
72+
c.bench_function("spark_rpad: custom padding", |b| {
73+
let args = vec![
74+
ColumnarValue::Array(Arc::clone(&string_array)),
75+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
76+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("*".to_string()))),
77+
];
78+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
79+
});
80+
81+
// lpad with multi-character padding string
82+
c.bench_function("spark_lpad: multi-char padding", |b| {
83+
let args = vec![
84+
ColumnarValue::Array(Arc::clone(&string_array)),
85+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
86+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))),
87+
];
88+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
89+
});
90+
91+
// rpad with multi-character padding string
92+
c.bench_function("spark_rpad: multi-char padding", |b| {
93+
let args = vec![
94+
ColumnarValue::Array(Arc::clone(&string_array)),
95+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
96+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))),
97+
];
98+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
99+
});
100+
101+
// lpad with truncation (target length shorter than some strings)
102+
c.bench_function("spark_lpad: with truncation", |b| {
103+
let args = vec![
104+
ColumnarValue::Array(Arc::clone(&string_array)),
105+
ColumnarValue::Scalar(ScalarValue::Int32(Some(5))),
106+
];
107+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
108+
});
109+
110+
// rpad with truncation (target length shorter than some strings)
111+
c.bench_function("spark_rpad: with truncation", |b| {
112+
let args = vec![
113+
ColumnarValue::Array(Arc::clone(&string_array)),
114+
ColumnarValue::Scalar(ScalarValue::Int32(Some(5))),
115+
];
116+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
117+
});
118+
}
119+
120+
criterion_group!(benches, criterion_benchmark);
121+
criterion_main!(benches);

native/spark-expr/src/agg_funcs/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ mod correlation;
2121
mod covariance;
2222
mod stddev;
2323
mod sum_decimal;
24+
mod sum_int;
2425
mod variance;
2526

2627
pub use avg::Avg;
@@ -29,4 +30,5 @@ pub use correlation::Correlation;
2930
pub use covariance::Covariance;
3031
pub use stddev::Stddev;
3132
pub use sum_decimal::SumDecimal;
33+
pub use sum_int::SumInteger;
3234
pub use variance::Variance;

0 commit comments

Comments
 (0)