Skip to content

Commit 117a331

Browse files
authored
Merge branch 'main' into support_overflow_sum_function
2 parents b09002f + a9d0c2b commit 117a331

File tree

64 files changed

+3971
-1472
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

64 files changed

+3971
-1472
lines changed

.github/workflows/benchmark-tpcds.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
rust-version: ${{env.RUST_VERSION}}
5656
jdk-version: 11
5757
- name: Cache Maven dependencies
58-
uses: actions/cache@v4
58+
uses: actions/cache@v5
5959
with:
6060
path: |
6161
~/.m2/repository
@@ -67,7 +67,7 @@ jobs:
6767
run: make release
6868
- name: Cache TPC-DS generated data
6969
id: cache-tpcds-sf-1
70-
uses: actions/cache@v4
70+
uses: actions/cache@v5
7171
with:
7272
path: ./tpcds-sf-1
7373
key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
@@ -107,7 +107,7 @@ jobs:
107107
rust-version: ${{env.RUST_VERSION}}
108108
jdk-version: 11
109109
- name: Cache Maven dependencies
110-
uses: actions/cache@v4
110+
uses: actions/cache@v5
111111
with:
112112
path: |
113113
~/.m2/repository
@@ -117,7 +117,7 @@ jobs:
117117
${{ runner.os }}-java-maven-
118118
- name: Restore TPC-DS generated data
119119
id: cache-tpcds-sf-1
120-
uses: actions/cache/restore@v4
120+
uses: actions/cache/restore@v5
121121
with:
122122
path: ./tpcds-sf-1
123123
key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}

.github/workflows/benchmark-tpch.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
rust-version: ${{env.RUST_VERSION}}
5656
jdk-version: 11
5757
- name: Cache Maven dependencies
58-
uses: actions/cache@v4
58+
uses: actions/cache@v5
5959
with:
6060
path: |
6161
~/.m2/repository
@@ -65,7 +65,7 @@ jobs:
6565
${{ runner.os }}-java-maven-
6666
- name: Cache TPC-H generated data
6767
id: cache-tpch-sf-1
68-
uses: actions/cache@v4
68+
uses: actions/cache@v5
6969
with:
7070
path: ./tpch
7171
key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}
@@ -91,7 +91,7 @@ jobs:
9191
rust-version: ${{env.RUST_VERSION}}
9292
jdk-version: 11
9393
- name: Cache Maven dependencies
94-
uses: actions/cache@v4
94+
uses: actions/cache@v5
9595
with:
9696
path: |
9797
~/.m2/repository
@@ -101,7 +101,7 @@ jobs:
101101
${{ runner.os }}-java-maven-
102102
- name: Restore TPC-H generated data
103103
id: cache-tpch-sf-1
104-
uses: actions/cache/restore@v4
104+
uses: actions/cache/restore@v5
105105
with:
106106
path: ./tpch
107107
key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}

.github/workflows/pr_build_linux.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ jobs:
161161
org.apache.comet.CometStringExpressionSuite
162162
org.apache.comet.CometBitwiseExpressionSuite
163163
org.apache.comet.CometMapExpressionSuite
164+
org.apache.comet.CometJsonExpressionSuite
164165
org.apache.comet.expressions.conditional.CometIfSuite
165166
org.apache.comet.expressions.conditional.CometCoalesceSuite
166167
org.apache.comet.expressions.conditional.CometCaseWhenSuite

.github/workflows/pr_build_macos.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,9 @@ jobs:
5757
java_version: "17"
5858
maven_opts: "-Pspark-3.5 -Pscala-2.13"
5959

60-
# TODO fails with OOM
61-
# https://github.com/apache/datafusion-comet/issues/1949
62-
# - name: "Spark 4.0, JDK 17, Scala 2.13"
63-
# java_version: "17"
64-
# maven_opts: "-Pspark-4.0 -Pscala-2.13"
60+
- name: "Spark 4.0, JDK 17, Scala 2.13"
61+
java_version: "17"
62+
maven_opts: "-Pspark-4.0 -Pscala-2.13"
6563

6664
suite:
6765
- name: "fuzz"
@@ -126,6 +124,7 @@ jobs:
126124
org.apache.comet.CometStringExpressionSuite
127125
org.apache.comet.CometBitwiseExpressionSuite
128126
org.apache.comet.CometMapExpressionSuite
127+
org.apache.comet.CometJsonExpressionSuite
129128
org.apache.comet.expressions.conditional.CometIfSuite
130129
org.apache.comet.expressions.conditional.CometCoalesceSuite
131130
org.apache.comet.expressions.conditional.CometCaseWhenSuite

.github/workflows/spark_sql_test.yml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ jobs:
5959
- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
6060
- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
6161
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
62+
# Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
63+
exclude:
64+
- spark-version: {short: '4.0', full: '4.0.1', java: 17}
65+
module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
6266
fail-fast: false
6367
name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
6468
runs-on: ${{ matrix.os }}
@@ -89,7 +93,7 @@ jobs:
8993
LC_ALL: "C.UTF-8"
9094
- name: Upload fallback log
9195
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
92-
uses: actions/upload-artifact@v5
96+
uses: actions/upload-artifact@v6
9397
with:
9498
name: fallback-log-spark-sql-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.spark-version.java }}
9599
path: "**/fallback.log"
@@ -138,7 +142,7 @@ jobs:
138142
LC_ALL: "C.UTF-8"
139143
- name: Upload fallback log
140144
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
141-
uses: actions/upload-artifact@v5
145+
uses: actions/upload-artifact@v6
142146
with:
143147
name: fallback-log-spark-sql-native-comet-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.java-version }}
144148
path: "**/fallback.log"
@@ -187,7 +191,7 @@ jobs:
187191
LC_ALL: "C.UTF-8"
188192
- name: Upload fallback log
189193
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
190-
uses: actions/upload-artifact@v5
194+
uses: actions/upload-artifact@v6
191195
with:
192196
name: fallback-log-spark-sql-iceberg-compat-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.java-version }}
193197
path: "**/fallback.log"
@@ -199,14 +203,14 @@ jobs:
199203
runs-on: ubuntu-24.04
200204
steps:
201205
- name: Download fallback log artifacts
202-
uses: actions/download-artifact@v6
206+
uses: actions/download-artifact@v7
203207
with:
204208
path: fallback-logs/
205209
- name: Merge fallback logs
206210
run: |
207211
find ./fallback-logs/ -type f -name "fallback.log" -print0 | xargs -0 cat | sort -u > all_fallback.log
208212
- name: Upload merged fallback log
209-
uses: actions/upload-artifact@v5
213+
uses: actions/upload-artifact@v6
210214
with:
211215
name: all-fallback-log
212216
path: all_fallback.log

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,17 @@ object CometConf extends ShimCometConf {
441441
.intConf
442442
.createWithDefault(8192)
443443

444+
val COMET_SHUFFLE_WRITE_BUFFER_SIZE: ConfigEntry[Long] =
445+
conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.writeBufferSize")
446+
.category(CATEGORY_SHUFFLE)
447+
.doc("Size of the write buffer in bytes used by the native shuffle writer when writing " +
448+
"shuffle data to disk. Larger values may improve write performance by reducing " +
449+
"the number of system calls, but will use more memory. " +
450+
"The default is 1MB which provides a good balance between performance and memory usage.")
451+
.bytesConf(ByteUnit.MiB)
452+
.checkValue(v => v > 0, "Write buffer size must be positive")
453+
.createWithDefault(1)
454+
444455
val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf(
445456
"spark.comet.shuffle.preferDictionary.ratio")
446457
.category(CATEGORY_SHUFFLE)

dev/benchmarks/comet-tpch.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,4 @@ $SPARK_HOME/bin/spark-submit \
5050
--data $TPCH_DATA \
5151
--queries $TPCH_QUERIES \
5252
--output . \
53-
--write /tmp \
5453
--iterations 1

docs/source/user-guide/latest/compatibility.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,8 @@ The following cast operations are not compatible with Spark for all inputs and a
182182
| double | decimal | There can be rounding differences |
183183
| string | float | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
184184
| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
185-
| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
185+
| string | decimal | Does not support fullwidth Unicode digits (e.g. \\uFF10)
186+
or strings containing null bytes (e.g. \\u0000) |
186187
| string | timestamp | Not all valid formats are supported |
187188
<!-- prettier-ignore-end -->
188189
<!--END:INCOMPAT_CAST_TABLE-->

docs/source/user-guide/latest/configs.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ These settings can be used to determine which parts of the plan are accelerated
107107
| `spark.comet.exec.shuffle.compression.codec` | The codec of Comet native shuffle used to compress shuffle data. lz4, zstd, and snappy are supported. Compression can be disabled by setting spark.shuffle.compress=false. | lz4 |
108108
| `spark.comet.exec.shuffle.compression.zstd.level` | The compression level to use when compressing shuffle files with zstd. | 1 |
109109
| `spark.comet.exec.shuffle.enabled` | Whether to enable Comet native shuffle. Note that this requires setting `spark.shuffle.manager` to `org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager`. `spark.shuffle.manager` must be set before starting the Spark application and cannot be changed during the application. | true |
110+
| `spark.comet.exec.shuffle.writeBufferSize` | Size of the write buffer in bytes used by the native shuffle writer when writing shuffle data to disk. Larger values may improve write performance by reducing the number of system calls, but will use more memory. The default is 1MB which provides a good balance between performance and memory usage. | 1048576b |
110111
| `spark.comet.native.shuffle.partitioning.hash.enabled` | Whether to enable hash partitioning for Comet native shuffle. | true |
111112
| `spark.comet.native.shuffle.partitioning.range.enabled` | Whether to enable range partitioning for Comet native shuffle. | true |
112113
| `spark.comet.shuffle.preferDictionary.ratio` | The ratio of total values to distinct values in a string column to decide whether to prefer dictionary encoding when shuffling the column. If the ratio is higher than this config, dictionary encoding will be used on shuffling string column. This config is effective if it is higher than 1.0. Note that this config is only used when `spark.comet.exec.shuffle.mode` is `jvm`. | 10.0 |
@@ -263,6 +264,7 @@ These settings can be used to determine which parts of the plan are accelerated
263264
| `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true |
264265
| `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true |
265266
| `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true |
267+
| `spark.comet.expression.JsonToStructs.enabled` | Enable Comet acceleration for `JsonToStructs` | true |
266268
| `spark.comet.expression.KnownFloatingPointNormalized.enabled` | Enable Comet acceleration for `KnownFloatingPointNormalized` | true |
267269
| `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true |
268270
| `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true |

native/Cargo.lock

Lines changed: 7 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)