Skip to content

Commit 65b2390

Browse files
Merge remote-tracking branch 'apache/main' into regexp-extract-impl
1 parent 4b599d9 commit 65b2390

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+2430
-456
lines changed

.github/workflows/pr_build_macos.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ jobs:
131131
- name: "sql"
132132
value: |
133133
org.apache.spark.sql.CometToPrettyStringSuite
134+
134135
fail-fast: false
135136
name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
136137
runs-on: ${{ matrix.os }}
@@ -143,6 +144,14 @@ jobs:
143144
jdk-version: ${{ matrix.profile.java_version }}
144145
jdk-architecture: aarch64
145146
protoc-architecture: aarch_64
147+
- name: Set thread thresholds envs for spark test on macOS
148+
# see: https://github.com/apache/datafusion-comet/issues/2965
149+
shell: bash
150+
run: |
151+
echo "SPARK_TEST_SQL_SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD=256" >> $GITHUB_ENV
152+
echo "SPARK_TEST_SQL_RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD=256" >> $GITHUB_ENV
153+
echo "SPARK_TEST_HIVE_SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD=48" >> $GITHUB_ENV
154+
echo "SPARK_TEST_HIVE_RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD=48" >> $GITHUB_ENV
146155
- name: Java test steps
147156
uses: ./.github/actions/java-test
148157
with:

docs/source/user-guide/latest/compatibility.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,11 @@ Comet has the following limitations when reading Parquet files:
3232

3333
## ANSI Mode
3434

35-
Comet will fall back to Spark for the following expressions when ANSI mode is enabled. Thes expressions can be enabled by setting
35+
Comet will fall back to Spark for the following expressions when ANSI mode is enabled. These expressions can be enabled by setting
3636
`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
3737
the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
3838

3939
- Average
40-
- Sum
4140
- Cast (in some cases)
4241

4342
There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support.
@@ -159,6 +158,8 @@ The following cast operations are generally compatible with Spark except for the
159158
| string | short | |
160159
| string | integer | |
161160
| string | long | |
161+
| string | float | |
162+
| string | double | |
162163
| string | binary | |
163164
| string | date | Only supports years between 262143 BC and 262142 AD |
164165
| binary | string | |
@@ -181,9 +182,8 @@ The following cast operations are not compatible with Spark for all inputs and a
181182
|-|-|-|
182183
| float | decimal | There can be rounding differences |
183184
| double | decimal | There can be rounding differences |
184-
| string | float | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
185-
| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
186-
| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
185+
| string | decimal | Does not support fullwidth unicode digits (e.g. \\uFF10)
186+
or strings containing null bytes (e.g. \\u0000) |
187187
| string | timestamp | Not all valid formats are supported |
188188
<!-- prettier-ignore-end -->
189189
<!--END:INCOMPAT_CAST_TABLE-->

docs/source/user-guide/latest/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ These settings can be used to determine which parts of the plan are accelerated
309309
| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true |
310310
| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true |
311311
| `spark.comet.expression.Sinh.enabled` | Enable Comet acceleration for `Sinh` | true |
312+
| `spark.comet.expression.Size.enabled` | Enable Comet acceleration for `Size` | true |
312313
| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true |
313314
| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true |
314315
| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true |

native/Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

native/core/src/execution/planner.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ use datafusion::{
7171
use datafusion_comet_spark_expr::{
7272
create_comet_physical_fun, create_comet_physical_fun_with_eval_mode, BinaryOutputStyle,
7373
BloomFilterAgg, BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
74+
SumInteger,
7475
};
7576
use iceberg::expr::Bind;
7677

@@ -1813,6 +1814,12 @@ impl PhysicalPlanner {
18131814
AggregateUDF::new_from_impl(SumDecimal::try_new(datatype, eval_mode)?);
18141815
AggregateExprBuilder::new(Arc::new(func), vec![child])
18151816
}
1817+
DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
1818+
let eval_mode = from_protobuf_eval_mode(expr.eval_mode)?;
1819+
let func =
1820+
AggregateUDF::new_from_impl(SumInteger::try_new(datatype, eval_mode)?);
1821+
AggregateExprBuilder::new(Arc::new(func), vec![child])
1822+
}
18161823
_ => {
18171824
// cast to the result data type of SUM if necessary, we should not expect
18181825
// a cast failure since it should have already been checked at Spark side

native/spark-expr/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ harness = false
7676
name = "bloom_filter_agg"
7777
harness = false
7878

79+
[[bench]]
80+
name = "padding"
81+
harness = false
82+
7983
[[test]]
8084
name = "test_udf_registration"
8185
path = "tests/spark_expr_reg.rs"
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::builder::StringBuilder;
19+
use arrow::array::ArrayRef;
20+
use criterion::{criterion_group, criterion_main, Criterion};
21+
use datafusion::common::ScalarValue;
22+
use datafusion::physical_plan::ColumnarValue;
23+
use datafusion_comet_spark_expr::{spark_lpad, spark_rpad};
24+
use std::hint::black_box;
25+
use std::sync::Arc;
26+
27+
fn create_string_array(size: usize) -> ArrayRef {
28+
let mut builder = StringBuilder::new();
29+
for i in 0..size {
30+
if i % 10 == 0 {
31+
builder.append_null();
32+
} else {
33+
builder.append_value(format!("string{}", i % 100));
34+
}
35+
}
36+
Arc::new(builder.finish())
37+
}
38+
39+
fn criterion_benchmark(c: &mut Criterion) {
40+
let size = 8192;
41+
let string_array = create_string_array(size);
42+
43+
// lpad with default padding (space)
44+
c.bench_function("spark_lpad: default padding", |b| {
45+
let args = vec![
46+
ColumnarValue::Array(Arc::clone(&string_array)),
47+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
48+
];
49+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
50+
});
51+
52+
// lpad with custom padding character
53+
c.bench_function("spark_lpad: custom padding", |b| {
54+
let args = vec![
55+
ColumnarValue::Array(Arc::clone(&string_array)),
56+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
57+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("*".to_string()))),
58+
];
59+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
60+
});
61+
62+
// rpad with default padding (space)
63+
c.bench_function("spark_rpad: default padding", |b| {
64+
let args = vec![
65+
ColumnarValue::Array(Arc::clone(&string_array)),
66+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
67+
];
68+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
69+
});
70+
71+
// rpad with custom padding character
72+
c.bench_function("spark_rpad: custom padding", |b| {
73+
let args = vec![
74+
ColumnarValue::Array(Arc::clone(&string_array)),
75+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
76+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("*".to_string()))),
77+
];
78+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
79+
});
80+
81+
// lpad with multi-character padding string
82+
c.bench_function("spark_lpad: multi-char padding", |b| {
83+
let args = vec![
84+
ColumnarValue::Array(Arc::clone(&string_array)),
85+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
86+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))),
87+
];
88+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
89+
});
90+
91+
// rpad with multi-character padding string
92+
c.bench_function("spark_rpad: multi-char padding", |b| {
93+
let args = vec![
94+
ColumnarValue::Array(Arc::clone(&string_array)),
95+
ColumnarValue::Scalar(ScalarValue::Int32(Some(20))),
96+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))),
97+
];
98+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
99+
});
100+
101+
// lpad with truncation (target length shorter than some strings)
102+
c.bench_function("spark_lpad: with truncation", |b| {
103+
let args = vec![
104+
ColumnarValue::Array(Arc::clone(&string_array)),
105+
ColumnarValue::Scalar(ScalarValue::Int32(Some(5))),
106+
];
107+
b.iter(|| black_box(spark_lpad(black_box(&args)).unwrap()))
108+
});
109+
110+
// rpad with truncation (target length shorter than some strings)
111+
c.bench_function("spark_rpad: with truncation", |b| {
112+
let args = vec![
113+
ColumnarValue::Array(Arc::clone(&string_array)),
114+
ColumnarValue::Scalar(ScalarValue::Int32(Some(5))),
115+
];
116+
b.iter(|| black_box(spark_rpad(black_box(&args)).unwrap()))
117+
});
118+
}
119+
120+
criterion_group!(benches, criterion_benchmark);
121+
criterion_main!(benches);

native/spark-expr/src/agg_funcs/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ mod correlation;
2121
mod covariance;
2222
mod stddev;
2323
mod sum_decimal;
24+
mod sum_int;
2425
mod variance;
2526

2627
pub use avg::Avg;
@@ -29,4 +30,5 @@ pub use correlation::Correlation;
2930
pub use covariance::Covariance;
3031
pub use stddev::Stddev;
3132
pub use sum_decimal::SumDecimal;
33+
pub use sum_int::SumInteger;
3234
pub use variance::Variance;

0 commit comments

Comments
 (0)