Commit d0a1633

Merge branch 'main' into pr-3289

2 parents: af8ccee + 313cf64

3,301 files changed: +47,672 / -52,102 lines

.github/workflows/spark_sql_test.yml

Lines changed: 20 additions & 122 deletions
@@ -101,12 +101,11 @@ jobs:
             native/target
           key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
 
-  spark-sql-auto-scan:
+  spark-sql-test:
     needs: build-native
     strategy:
       matrix:
         os: [ubuntu-24.04]
-        spark-version: [{short: '3.4', full: '3.4.3', java: 11}, {short: '3.5', full: '3.5.7', java: 11}, {short: '4.0', full: '4.0.1', java: 17}]
         module:
           - {name: "catalyst", args1: "catalyst/test", args2: ""}
           - {name: "sql_core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
@@ -115,12 +114,23 @@ jobs:
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Test combinations:
+        # - auto scan: all Spark versions (3.4, 3.5, 4.0)
+        # - native_comet: Spark 3.4, 3.5
+        # - native_iceberg_compat: Spark 3.5 only
+        config:
+          - {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto', scan-env: ''}
+          - {spark-short: '3.5', spark-full: '3.5.7', java: 11, scan-impl: 'auto', scan-env: ''}
+          - {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}
+          - {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'native_comet', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_comet'}
+          - {spark-short: '3.5', spark-full: '3.5.7', java: 11, scan-impl: 'native_comet', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_comet'}
+          - {spark-short: '3.5', spark-full: '3.5.7', java: 11, scan-impl: 'native_iceberg_compat', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_iceberg_compat'}
         # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
         exclude:
-          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+          - config: {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}
             module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
-    name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
+    name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}
     runs-on: ${{ matrix.os }}
     container:
       image: amd64/rust
@@ -130,7 +140,7 @@ jobs:
         uses: ./.github/actions/setup-builder
         with:
          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.spark-version.java }}
+          jdk-version: ${{ matrix.config.java }}
       - name: Download native library
         uses: actions/download-artifact@v7
         with:
@@ -139,14 +149,14 @@ jobs:
       - name: Setup Spark
         uses: ./.github/actions/setup-spark-builder
         with:
-          spark-version: ${{ matrix.spark-version.full }}
-          spark-short-version: ${{ matrix.spark-version.short }}
+          spark-version: ${{ matrix.config.spark-full }}
+          spark-short-version: ${{ matrix.config.spark-short }}
           skip-native-build: true
       - name: Run Spark tests
         run: |
           cd apache-spark
           rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
+          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ${{ matrix.config.scan-env }} ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
             build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
           if [ "${{ github.event.inputs.collect-fallback-logs }}" = "true" ]; then
             find . -type f -name "unit-tests.log" -print0 | xargs -0 grep -h "Comet cannot accelerate" | sed 's/.*Comet cannot accelerate/Comet cannot accelerate/' | sort -u > fallback.log
@@ -157,125 +167,13 @@ jobs:
         if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
         uses: actions/upload-artifact@v6
         with:
-          name: fallback-log-spark-sql-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.spark-version.java }}
-          path: "**/fallback.log"
-
-  spark-sql-native-native-comet:
-    needs: build-native
-    strategy:
-      matrix:
-        os: [ ubuntu-24.04 ]
-        java-version: [ 11 ]
-        spark-version: [ { short: '3.4', full: '3.4.3' }, { short: '3.5', full: '3.5.7' } ]
-        module:
-          - { name: "catalyst", args1: "catalyst/test", args2: "" }
-          - { name: "sql_core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest }
-          - { name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest" }
-          - { name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest" }
-          - { name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest" }
-          - { name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest" }
-          - { name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest" }
-      fail-fast: false
-    name: spark-sql-native-comet-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.java-version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java-version }}
-      - name: Download native library
-        uses: actions/download-artifact@v7
-        with:
-          name: native-lib-linux
-          path: native/target/release/
-      - name: Setup Spark
-        uses: ./.github/actions/setup-spark-builder
-        with:
-          spark-version: ${{ matrix.spark-version.full }}
-          spark-short-version: ${{ matrix.spark-version.short }}
-          skip-native-build: true
-      - name: Run Spark tests
-        run: |
-          cd apache-spark
-          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true COMET_PARQUET_SCAN_IMPL=native_comet ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
-            build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
-          if [ "${{ github.event.inputs.collect-fallback-logs }}" = "true" ]; then
-            find . -type f -name "unit-tests.log" -print0 | xargs -0 grep -h "Comet cannot accelerate" | sed 's/.*Comet cannot accelerate/Comet cannot accelerate/' | sort -u > fallback.log
-          fi
-        env:
-          LC_ALL: "C.UTF-8"
-      - name: Upload fallback log
-        if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: fallback-log-spark-sql-native-comet-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.java-version }}
-          path: "**/fallback.log"
-
-  spark-sql-native-iceberg-compat:
-    needs: build-native
-    strategy:
-      matrix:
-        os: [ubuntu-24.04]
-        java-version: [11]
-        spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.7'}]
-        module:
-          - {name: "catalyst", args1: "catalyst/test", args2: ""}
-          - {name: "sql_core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
-          - {name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest"}
-          - {name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest"}
-          - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
-          - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
-          - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
-      fail-fast: false
-    name: spark-sql-iceberg-compat-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.java-version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java-version }}
-      - name: Download native library
-        uses: actions/download-artifact@v7
-        with:
-          name: native-lib-linux
-          path: native/target/release/
-      - name: Setup Spark
-        uses: ./.github/actions/setup-spark-builder
-        with:
-          spark-version: ${{ matrix.spark-version.full }}
-          spark-short-version: ${{ matrix.spark-version.short }}
-          skip-native-build: true
-      - name: Run Spark tests
-        run: |
-          cd apache-spark
-          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true COMET_PARQUET_SCAN_IMPL=native_iceberg_compat ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
-            build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
-          if [ "${{ github.event.inputs.collect-fallback-logs }}" = "true" ]; then
-            find . -type f -name "unit-tests.log" -print0 | xargs -0 grep -h "Comet cannot accelerate" | sed 's/.*Comet cannot accelerate/Comet cannot accelerate/' | sort -u > fallback.log
-          fi
-        env:
-          LC_ALL: "C.UTF-8"
-      - name: Upload fallback log
-        if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: fallback-log-spark-sql-iceberg-compat-${{ matrix.module.name }}-${{ matrix.os }}-spark-${{ matrix.spark-version.full }}-java-${{ matrix.java-version }}
+          name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}
           path: "**/fallback.log"
 
   merge-fallback-logs:
     if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
     name: merge-fallback-logs
-    needs: [ spark-sql-auto-scan, spark-sql-native-native-comet, spark-sql-native-iceberg-compat ]
+    needs: [spark-sql-test]
     runs-on: ubuntu-24.04
     steps:
      - name: Download fallback log artifacts
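The net effect of this file: three near-identical jobs collapse into one `spark-sql-test` job whose new `config` axis carries the Spark version, JDK, and Parquet scan implementation (note the matrix also narrows `native_iceberg_compat` to Spark 3.5 only). GitHub Actions expands a matrix as the cross product of its axes and then drops any combination matched by `exclude`. A minimal Scala sketch of that expansion; the case classes and helper below are illustrative, not part of the workflow:

```scala
// Sketch of how the consolidated matrix fans out into job runs.
case class Config(sparkShort: String, sparkFull: String, java: Int, scanImpl: String)
case class Module(name: String)

object MatrixExpansion extends App {
  val configs = Seq(
    Config("3.4", "3.4.3", 11, "auto"),
    Config("3.5", "3.5.7", 11, "auto"),
    Config("4.0", "4.0.1", 17, "auto"),
    Config("3.4", "3.4.3", 11, "native_comet"),
    Config("3.5", "3.5.7", 11, "native_comet"),
    Config("3.5", "3.5.7", 11, "native_iceberg_compat"))

  val modules = Seq("catalyst", "sql_core-1", "sql_core-2", "sql_core-3",
    "sql_hive-1", "sql_hive-2", "sql_hive-3").map(Module.apply)

  // Mirrors the workflow's `exclude`: sql_hive-1 is skipped on Spark 4.0
  // (https://github.com/apache/datafusion-comet/issues/2946).
  def excluded(c: Config, m: Module): Boolean =
    c.sparkShort == "4.0" && m.name == "sql_hive-1"

  // Cross product of the axes, minus excluded combinations.
  val jobs = for {
    c <- configs
    m <- modules
    if !excluded(c, m)
  } yield s"spark-sql-${c.scanImpl}-${m.name}/spark-${c.sparkFull}"

  println(s"${jobs.size} jobs") // 6 * 7 - 1 = 41
  jobs.foreach(println)
}
```

That yields 41 matrix runs, each named like the job's `name:` template, in place of the three hand-maintained job definitions deleted below it.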

.github/workflows/take.yml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Assign/unassign the issue via `take` or `untake` comment
+on:
+  issue_comment:
+    types: created
+
+permissions:
+  issues: write
+
+jobs:
+  issue_assign:
+    runs-on: ubuntu-latest
+    if: (!github.event.issue.pull_request) && (github.event.comment.body == 'take' || github.event.comment.body == 'untake')
+    concurrency:
+      group: ${{ github.actor }}-issue-assign
+    steps:
+      - name: Take or untake issue
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          USER_LOGIN: ${{ github.event.comment.user.login }}
+          REPO: ${{ github.repository }}
+          TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [ "$COMMENT_BODY" == "take" ]
+          then
+            CODE=$(curl -H "Authorization: token $TOKEN" -LI https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees/$USER_LOGIN -o /dev/null -w '%{http_code}\n' -s)
+            if [ "$CODE" -eq "204" ]
+            then
+              echo "Assigning issue $ISSUE_NUMBER to $USER_LOGIN"
+              curl -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" -d "{\"assignees\": [\"$USER_LOGIN\"]}" https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees
+            else
+              echo "Cannot assign issue $ISSUE_NUMBER to $USER_LOGIN"
+            fi
+          elif [ "$COMMENT_BODY" == "untake" ]
+          then
+            echo "Unassigning issue $ISSUE_NUMBER from $USER_LOGIN"
+            curl -X DELETE -H "Authorization: token $TOKEN" -H "Content-Type: application/json" -d "{\"assignees\": [\"$USER_LOGIN\"]}" https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees
+          fi
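This new workflow lets contributors self-assign an issue by commenting `take` (or remove themselves with `untake`). The assign path hinges on one GitHub REST detail: probing `/repos/{repo}/issues/{number}/assignees/{user}` returns HTTP 204 only when the user can actually be assigned, so the `curl -LI` call is a permission check, not a mutation; the POST only fires after a 204. A rough Scala equivalent of that probe via `java.net.http` (the repo, issue number, user, and token values below are placeholders):

```scala
import java.net.URI
import java.net.http.{HttpClient, HttpRequest, HttpResponse}

object TakeProbe extends App {
  // Placeholders: substitute a real repo, issue number, user, and token.
  val repo = "apache/datafusion-comet"
  val issue = 12345
  val user = "octocat"
  val token = sys.env.getOrElse("GITHUB_TOKEN", "")

  val request = HttpRequest.newBuilder()
    .uri(URI.create(s"https://api.github.com/repos/$repo/issues/$issue/assignees/$user"))
    .header("Authorization", s"token $token")
    .GET() // the workflow sends `curl -LI` (a HEAD); GET returns the same status code
    .build()

  val status = HttpClient.newHttpClient()
    .send(request, HttpResponse.BodyHandlers.discarding())
    .statusCode()

  // 204 means GitHub will accept the assignment; anything else means it won't.
  if (status == 204) println(s"Assigning issue $issue to $user")
  else println(s"Cannot assign issue $issue to $user (HTTP $status)")
}
```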

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 2 additions & 2 deletions
@@ -304,9 +304,9 @@ object CometConf extends ShimCometConf {
       "Whether to enable native columnar to row conversion. When enabled, Comet will use " +
         "native Rust code to convert Arrow columnar data to Spark UnsafeRow format instead " +
         "of the JVM implementation. This can improve performance for queries that need to " +
-        "convert between columnar and row formats. This is an experimental feature.")
+        "convert between columnar and row formats.")
     .booleanConf
-    .createWithDefault(false)
+    .createWithDefault(true)
 
   val COMET_EXEC_SORT_MERGE_JOIN_WITH_JOIN_FILTER_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.exec.sortMergeJoinWithJoinFilter.enabled")

common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@ import java.nio.channels.Channels
 import scala.jdk.CollectionConverters._
 
 import org.apache.arrow.c.CDataDictionaryProvider
-import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, FieldVector, FixedSizeBinaryVector, Float4Vector, Float8Vector, IntVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, ValueVector, VarBinaryVector, VarCharVector, VectorSchemaRoot}
+import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, FieldVector, FixedSizeBinaryVector, Float4Vector, Float8Vector, IntVector, NullVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, ValueVector, VarBinaryVector, VarCharVector, VectorSchemaRoot}
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
 import org.apache.arrow.vector.dictionary.DictionaryProvider
 import org.apache.arrow.vector.ipc.ArrowStreamWriter
@@ -288,7 +288,7 @@ object Utils extends CometTypeShim {
         _: BigIntVector | _: Float4Vector | _: Float8Vector | _: VarCharVector |
         _: DecimalVector | _: DateDayVector | _: TimeStampMicroTZVector | _: VarBinaryVector |
         _: FixedSizeBinaryVector | _: TimeStampMicroVector | _: StructVector | _: ListVector |
-        _: MapVector) =>
+        _: MapVector | _: NullVector) =>
       v.asInstanceOf[FieldVector]
     case _ =>
       throw new SparkException(s"Unsupported Arrow Vector for $reason: ${valueVector.getClass}")
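The import and match-arm additions let this path accept Arrow's `NullVector`, the buffer-less vector used for columns of the Null type (it stores only a value count, and every slot is null), instead of raising `SparkException`. A minimal sketch of the same whitelist shape, simplified from the `Utils` match above; the demo names are illustrative:

```scala
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.{FieldVector, IntVector, NullVector, ValueVector}

object NullVectorDemo extends App {
  // Simplified version of the whitelist match in Utils: accept known
  // vector classes, reject everything else.
  def asFieldVector(v: ValueVector): FieldVector = v match {
    case fv @ (_: IntVector | _: NullVector) => fv.asInstanceOf[FieldVector]
    case other =>
      throw new IllegalArgumentException(s"Unsupported Arrow Vector: ${other.getClass}")
  }

  // A NullVector holds no data buffers, only a length; every slot is null.
  val nulls = new NullVector("all_nulls", 3)
  println(asFieldVector(nulls).getField) // accepted after this commit

  val allocator = new RootAllocator()
  val ints = new IntVector("ints", allocator)
  println(asFieldVector(ints).getField)
  ints.close()
  allocator.close()
}
```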
