Skip to content

Commit 93e8a4b

Browse files
committed
Merge remote-tracking branch 'oss-spark/master' into to_avro_improve_NPE
2 parents de49baa + fab0cca commit 93e8a4b

File tree

8,653 files changed

+404048
-169099
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

8,653 files changed

+404048
-169099
lines changed

.asf.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,8 @@ github:
3131
merge: false
3232
squash: true
3333
rebase: true
34+
ghp_branch: master
35+
ghp_path: /docs
3436

3537
notifications:
3638
pullrequests: reviews@spark.apache.org

.github/PULL_REQUEST_TEMPLATE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ Please clarify why the changes are needed. For instance,
3333

3434
### Does this PR introduce _any_ user-facing change?
3535
<!--
36-
Note that it means *any* user-facing change including all aspects such as the documentation fix.
36+
Note that it means *any* user-facing change including all aspects such as new features, bug fixes, or other behavior changes. Documentation-only updates are not considered user-facing changes.
37+
3738
If yes, please clarify the previous behavior and the change this PR proposes - provide the console output, description and/or an example to show the behavior difference if possible.
3839
If possible, please also clarify if this is a user-facing change compared to the released Spark versions or within the unreleased branches such as master.
3940
If no, write 'No'.

.github/labeler.yml

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,14 @@ INFRA:
2626
'.asf.yaml',
2727
'.gitattributes',
2828
'.gitignore',
29-
'dev/merge_spark_pr.py',
30-
'dev/run-tests-jenkins*'
29+
'dev/merge_spark_pr.py'
3130
]
3231

3332
BUILD:
3433
- changed-files:
3534
- all-globs-to-any-file: [
3635
'dev/**/*',
37-
'!dev/merge_spark_pr.py',
38-
'!dev/run-tests-jenkins*'
36+
'!dev/merge_spark_pr.py'
3937
]
4038
- any-glob-to-any-file: [
4139
'build/**/*',
@@ -95,9 +93,9 @@ SQL:
9593
- changed-files:
9694
- all-globs-to-any-file: [
9795
'**/sql/**/*',
98-
'!python/pyspark/sql/avro/**/*',
99-
'!python/pyspark/sql/streaming/**/*',
100-
'!python/pyspark/sql/tests/streaming/test_streaming*.py'
96+
'!python/**/avro/**/*',
97+
'!python/**/protobuf/**/*',
98+
'!python/**/streaming/**/*'
10199
]
102100
- any-glob-to-any-file: [
103101
'common/unsafe/**/*',
@@ -121,7 +119,7 @@ AVRO:
121119
- changed-files:
122120
- any-glob-to-any-file: [
123121
'connector/avro/**/*',
124-
'python/pyspark/sql/avro/**/*'
122+
'python/**/avro/**/*'
125123
]
126124

127125
DSTREAM:
@@ -154,18 +152,16 @@ ML:
154152
MLLIB:
155153
- changed-files:
156154
- any-glob-to-any-file: [
157-
'**/spark/mllib/**/*',
158-
'mllib-local/**/*',
159-
'python/pyspark/mllib/**/*'
155+
'**/mllib/**/*',
156+
'mllib-local/**/*'
160157
]
161158

162159
STRUCTURED STREAMING:
163160
- changed-files:
164161
- any-glob-to-any-file: [
165162
'**/sql/**/streaming/**/*',
166163
'connector/kafka-0-10-sql/**/*',
167-
'python/pyspark/sql/streaming/**/*',
168-
'python/pyspark/sql/tests/streaming/test_streaming*.py',
164+
'python/pyspark/sql/**/streaming/**/*',
169165
'**/*streaming.R'
170166
]
171167

@@ -199,6 +195,7 @@ YARN:
199195
KUBERNETES:
200196
- changed-files:
201197
- any-glob-to-any-file: [
198+
'bin/docker-image-tool.sh',
202199
'resource-managers/kubernetes/**/*'
203200
]
204201

@@ -225,14 +222,14 @@ DEPLOY:
225222
CONNECT:
226223
- changed-files:
227224
- any-glob-to-any-file: [
225+
'sql/connect/**/*',
228226
'connector/connect/**/*',
229-
'python/pyspark/sql/**/connect/**/*',
230-
'python/pyspark/ml/**/connect/**/*'
227+
'python/**/connect/**/*'
231228
]
232229

233230
PROTOBUF:
234231
- changed-files:
235232
- any-glob-to-any-file: [
236233
'connector/protobuf/**/*',
237-
'python/pyspark/sql/protobuf/**/*'
234+
'python/**/protobuf/**/*'
238235
]

.github/workflows/benchmark.yml

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,25 @@ on:
2727
required: true
2828
default: '*'
2929
jdk:
30+
type: choice
3031
description: 'JDK version: 17 or 21'
3132
required: true
3233
default: '17'
34+
options:
35+
- '17'
36+
- '21'
3337
scala:
38+
type: choice
3439
description: 'Scala version: 2.13'
3540
required: true
3641
default: '2.13'
42+
options:
43+
- '2.13'
3744
failfast:
38-
description: 'Failfast: true or false'
45+
type: boolean
46+
description: 'Failfast'
3947
required: true
40-
default: 'true'
48+
default: true
4149
num-splits:
4250
description: 'Number of job splits'
4351
required: true
@@ -50,7 +58,7 @@ jobs:
5058
outputs:
5159
matrix: ${{ steps.set-matrix.outputs.matrix }}
5260
env:
53-
SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
61+
SPARK_BENCHMARK_NUM_SPLITS: ${{ inputs.num-splits }}
5462
steps:
5563
- name: Generate matrix
5664
id: set-matrix
@@ -59,7 +67,7 @@ jobs:
5967
# Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well
6068
tpcds-1g-gen:
6169
name: "Generate an input dataset for TPCDSQueryBenchmark with SF=1"
62-
if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*')
70+
if: contains(inputs.class, 'TPCDSQueryBenchmark') || contains(inputs.class, '*')
6371
runs-on: ubuntu-20.04
6472
env:
6573
SPARK_LOCAL_IP: localhost
@@ -83,9 +91,9 @@ jobs:
8391
uses: actions/cache@v4
8492
with:
8593
path: ~/.cache/coursier
86-
key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
94+
key: benchmark-coursier-${{ inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
8795
restore-keys: |
88-
benchmark-coursier-${{ github.event.inputs.jdk }}
96+
benchmark-coursier-${{ inputs.jdk }}
8997
- name: Cache TPC-DS generated data
9098
id: cache-tpcds-sf-1
9199
uses: actions/cache@v4
@@ -102,18 +110,18 @@ jobs:
102110
- name: Build tpcds-kit
103111
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
104112
run: cd tpcds-kit/tools && make OS=LINUX
105-
- name: Install Java ${{ github.event.inputs.jdk }}
113+
- name: Install Java ${{ inputs.jdk }}
106114
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
107115
uses: actions/setup-java@v4
108116
with:
109117
distribution: zulu
110-
java-version: ${{ github.event.inputs.jdk }}
118+
java-version: ${{ inputs.jdk }}
111119
- name: Generate TPC-DS (SF=1) table data
112120
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
113121
run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
114122

115123
benchmark:
116-
name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, Scala ${{ github.event.inputs.scala }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
124+
name: "Run benchmarks: ${{ inputs.class }} (JDK ${{ inputs.jdk }}, Scala ${{ inputs.scala }}, ${{ matrix.split }} out of ${{ inputs.num-splits }} splits)"
117125
if: always()
118126
needs: [matrix-gen, tpcds-1g-gen]
119127
runs-on: ubuntu-latest
@@ -122,8 +130,8 @@ jobs:
122130
matrix:
123131
split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
124132
env:
125-
SPARK_BENCHMARK_FAILFAST: ${{ github.event.inputs.failfast }}
126-
SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
133+
SPARK_BENCHMARK_FAILFAST: ${{ inputs.failfast }}
134+
SPARK_BENCHMARK_NUM_SPLITS: ${{ inputs.num-splits }}
127135
SPARK_BENCHMARK_CUR_SPLIT: ${{ matrix.split }}
128136
SPARK_GENERATE_BENCHMARK_FILES: 1
129137
SPARK_LOCAL_IP: localhost
@@ -150,24 +158,24 @@ jobs:
150158
uses: actions/cache@v4
151159
with:
152160
path: ~/.cache/coursier
153-
key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
161+
key: benchmark-coursier-${{ inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
154162
restore-keys: |
155-
benchmark-coursier-${{ github.event.inputs.jdk }}
156-
- name: Install Java ${{ github.event.inputs.jdk }}
163+
benchmark-coursier-${{ inputs.jdk }}
164+
- name: Install Java ${{ inputs.jdk }}
157165
uses: actions/setup-java@v4
158166
with:
159167
distribution: zulu
160-
java-version: ${{ github.event.inputs.jdk }}
168+
java-version: ${{ inputs.jdk }}
161169
- name: Cache TPC-DS generated data
162-
if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*')
170+
if: contains(inputs.class, 'TPCDSQueryBenchmark') || contains(inputs.class, '*')
163171
id: cache-tpcds-sf-1
164172
uses: actions/cache@v4
165173
with:
166174
path: ./tpcds-sf-1
167175
key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
168176
- name: Run benchmarks
169177
run: |
170-
./build/sbt -Pscala-${{ github.event.inputs.scala }} -Pyarn -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package
178+
./build/sbt -Pscala-${{ inputs.scala }} -Pyarn -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package
171179
# Make less noisy
172180
cp conf/log4j2.properties.template conf/log4j2.properties
173181
sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
@@ -176,14 +184,14 @@ jobs:
176184
--driver-memory 6g --class org.apache.spark.benchmark.Benchmarks \
177185
--jars "`find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -`,`find ~/.cache/coursier -name 'curator-test-*.jar'`" \
178186
"`find . -name 'spark-core*-SNAPSHOT-tests.jar'`" \
179-
"${{ github.event.inputs.class }}"
187+
"${{ inputs.class }}"
180188
# To keep the directory structure and file permissions, tar them
181189
# See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
182190
echo "Preparing the benchmark results:"
183-
tar -cvf benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude-standard`
191+
tar -cvf benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude-standard`
184192
- name: Upload benchmark results
185193
uses: actions/upload-artifact@v4
186194
with:
187-
name: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}-${{ matrix.split }}
188-
path: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar
195+
name: benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}-${{ matrix.split }}
196+
path: benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar
189197

0 commit comments

Comments
 (0)