
Commit 8549ab6

merge upstream/master

2 parents: 30ea61a + 0534f98

121 files changed: +3195 -1000 lines

.github/workflows/benchmark.yml

Lines changed: 1 addition & 0 deletions
@@ -136,6 +136,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
+      max-parallel: 20
       matrix:
         split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
     env:
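
This commit adds "max-parallel: 20" under "strategy" in several matrix jobs here and below. For reference, a minimal standalone sketch (a hypothetical workflow, not part of this commit) of what the key does: GitHub Actions caps how many matrix jobs run concurrently and queues the rest.

# Hypothetical demo workflow: at most 2 of the 5 "split" jobs run at once.
jobs:
  demo:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        split: [1, 2, 3, 4, 5]
    steps:
      - run: echo "running split ${{ matrix.split }}"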

.github/workflows/build_and_test.yml

Lines changed: 10 additions & 3 deletions
@@ -105,6 +105,8 @@ jobs:
           buf=true
           ui=true
           docs=true
+          java17=true
+          java25=true
         else
           pyspark_install=false
           pandas=false
@@ -116,6 +118,8 @@ jobs:
           buf=false
           ui=false
           docs=false
+          java17=false
+          java25=false
         fi
         build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,utils-java,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,profiler,protobuf,yarn,connect,sql,hive,pipelines"`
         precondition="
@@ -128,8 +132,8 @@ jobs:
           \"tpcds-1g\": \"$tpcds\",
           \"docker-integration-tests\": \"$docker\",
           \"lint\" : \"true\",
-          \"java17\" : \"$build\",
-          \"java25\" : \"$build\",
+          \"java17\" : \"$java17\",
+          \"java25\" : \"$java25\",
           \"docs\" : \"$docs\",
           \"yarn\" : \"$yarn\",
           \"k8s-integration-tests\" : \"$kubernetes\",
@@ -238,6 +242,7 @@ jobs:
     timeout-minutes: 150
     strategy:
       fail-fast: false
+      max-parallel: 20
       matrix:
         java:
           - ${{ inputs.java }}
@@ -522,6 +527,7 @@ jobs:
           --security-opt seccomp=unconfined
     strategy:
       fail-fast: false
+      max-parallel: 20
       matrix:
         java:
           - ${{ inputs.java }}
@@ -626,6 +632,7 @@ jobs:
         env: ${{ fromJSON(inputs.envs) }}
         shell: 'script -q -e -c "bash {0}"'
         run: |
+          uname -a
           for py in $(echo $PYTHON_TO_TEST | tr "," "\n")
           do
             $py --version
@@ -667,7 +674,7 @@ jobs:
           verbose: true
       - name: Upload test results to Codecov
         env: ${{ fromJSON(inputs.envs) }}
-        if: (!cancelled()) && github.repository == 'apache/spark'
+        if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true'
        uses: codecov/codecov-action@v5
         with:
           report_type: 'test_results'
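
The precondition change above gives java17 and java25 their own flags instead of reusing the general $build flag, so the Java version jobs can be toggled independently. A hedged sketch of how a JSON precondition like this is typically consumed downstream (the job and output names here are assumptions for illustration, not verbatim from this workflow):

# Hypothetical gating job: parse the JSON map emitted by the precondition
# step and run only when this job's own flag is 'true'.
jobs:
  java17:
    needs: precondition
    if: fromJSON(needs.precondition.outputs.required).java17 == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "Java 17 build enabled independently of the general build flag"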

.github/workflows/build_main.yml

Lines changed: 0 additions & 2 deletions
@@ -30,5 +30,3 @@ jobs:
       packages: write
     name: Run
     uses: ./.github/workflows/build_and_test.yml
-    secrets:
-      codecov_token: ${{ secrets.CODECOV_TOKEN }}

.github/workflows/build_python_3.13_nogil.yml

Lines changed: 0 additions & 48 deletions
This file was deleted.

.github/workflows/labeler.yml

Lines changed: 1 addition & 154 deletions
@@ -24,169 +24,16 @@
 # See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10

 name: "On pull requests"
-on:
-  pull_request_target:
-    types: [opened, edited, reopened]
+on: pull_request_target

 jobs:
   label:
     name: Label pull requests
     runs-on: ubuntu-latest
     permissions:
       contents: read
-      pull-requests: write
     steps:
       - uses: actions/labeler@v5
         with:
           repo-token: "${{ secrets.GITHUB_TOKEN }}"
           sync-labels: true
-
-  jira-info:
-    name: Comment JIRA information
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-    steps:
-      - name: Extract JIRA IDs and comment
-        uses: actions/github-script@v7
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            const prTitle = context.payload.pull_request.title;
-            const prNumber = context.payload.pull_request.number;
-
-            // Extract JIRA IDs from PR title
-            const jiraIdRegex = /\bSPARK-\d+\b/g;
-            const jiraIds = prTitle.match(jiraIdRegex);
-
-            // If no JIRA IDs found, check for [MINOR] tag
-            if (!jiraIds || jiraIds.length === 0) {
-              const minorRegex = /^\[MINOR\]/i;
-              if (minorRegex.test(prTitle)) {
-                console.log('PR title has [MINOR] tag, skipping');
-                return;
-              }
-
-              // Post reminder comment
-              const reminderComment = `## ⚠️ Pull Request Title Validation\n\nThis pull request title does not contain a JIRA issue ID.\n\nPlease update the title to either:\n- Include a JIRA ID: \`[SPARK-12345] Your description\`\n- Mark as minor change: \`[MINOR] Your description\`\n\nFor minor changes that don't require a JIRA ticket (e.g., typo fixes), please prefix the title with \`[MINOR]\`.\n\n---\n*This comment was automatically generated by GitHub Actions*`;
-
-              const comments = await github.rest.issues.listComments({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: prNumber
-              });
-
-              const botComment = comments.data.find(comment =>
-                comment.user.type === 'Bot' &&
-                (comment.body.includes('## JIRA Issue Information') || comment.body.includes('## ⚠️ Pull Request Title Validation'))
-              );
-
-              if (botComment) {
-                await github.rest.issues.updateComment({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  comment_id: botComment.id,
-                  body: reminderComment
-                });
-                console.log('Updated reminder comment');
-              } else {
-                await github.rest.issues.createComment({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  issue_number: prNumber,
-                  body: reminderComment
-                });
-                console.log('Created reminder comment');
-              }
-              return;
-            }
-
-            // Remove duplicates
-            const uniqueJiraIds = [...new Set(jiraIds)];
-            console.log(`Found JIRA IDs: ${uniqueJiraIds.join(', ')}`);
-
-            // Fetch JIRA information for each ID
-            const jiraBaseUrl = 'https://issues.apache.org/jira';
-            const jiraInfos = [];
-
-            for (const jiraId of uniqueJiraIds) {
-              try {
-                const response = await fetch(`${jiraBaseUrl}/rest/api/2/issue/${jiraId}`);
-
-                if (!response.ok) {
-                  jiraInfos.push({
-                    id: jiraId,
-                    type: 'Unknown',
-                    error: `Failed to fetch (HTTP ${response.status})`
-                  });
-                  continue;
-                }
-
-                const data = await response.json();
-                const fields = data.fields;
-
-                jiraInfos.push({
-                  id: jiraId,
-                  type: fields.issuetype?.name || 'Unknown',
-                  summary: fields.summary || 'N/A',
-                  assignee: fields.assignee ? fields.assignee.displayName : 'None',
-                  status: fields.status ? fields.status.name : 'Unknown',
-                  affected: fields.versions ? fields.versions.map(v => v.name) : []
-                });
-              } catch (error) {
-                console.error(`Error fetching ${jiraId}:`, error);
-                jiraInfos.push({
-                  id: jiraId,
-                  type: 'Unknown',
-                  error: error.message
-                });
-              }
-            }
-
-            // Format comment
-            let commentBody = '## JIRA Issue Information\n\n';
-
-            for (const info of jiraInfos) {
-              if (info.error) {
-                commentBody += `=== ${info.type} ${info.id} ===\n`;
-                commentBody += `Error: ${info.error}\n\n`;
-              } else {
-                commentBody += `=== ${info.type} ${info.id} ===\n`;
-                commentBody += `Summary: ${info.summary}\n`;
-                commentBody += `Assignee: ${info.assignee}\n`;
-                commentBody += `Status: ${info.status}\n`;
-                commentBody += `Affected: ${JSON.stringify(info.affected)}\n\n`;
-              }
-            }
-
-            commentBody += '---\n*This comment was automatically generated by GitHub Actions*';
-
-            // Check if there's an existing comment from this action
-            const comments = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: prNumber
-            });
-
-            const botComment = comments.data.find(comment =>
-              comment.user.type === 'Bot' &&
-              (comment.body.includes('## JIRA Issue Information') || comment.body.includes('## ⚠️ Pull Request Title Validation'))
-            );
-
-            if (botComment) {
-              await github.rest.issues.updateComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                comment_id: botComment.id,
-                body: commentBody
-              });
-              console.log('Updated existing comment');
-            } else {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: prNumber,
-                body: commentBody
-              });
-              console.log('Created new comment');
-            }

.github/workflows/maven_test.yml

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ jobs:
     # timeout-minutes: 150
     strategy:
       fail-fast: false
+      max-parallel: 20
       matrix:
         java:
           - ${{ inputs.java }}

.github/workflows/publish_snapshot.yml

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
+      max-parallel: 20
       matrix:
         # keep in sync with default value of workflow_dispatch input 'branch'
         branch: ${{ fromJSON( inputs.branch || '["master", "branch-4.1", "branch-4.0", "branch-3.5"]' ) }}

.github/workflows/python_hosted_runner_test.yml

Lines changed: 1 addition & 10 deletions
@@ -63,6 +63,7 @@ jobs:
     # timeout-minutes: 150
     strategy:
       fail-fast: false
+      max-parallel: 20
       matrix:
         java:
           - ${{ inputs.java }}
@@ -166,16 +167,6 @@ jobs:
             echo "Python Packaging Tests Enabled!"
           fi
           ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
-      - name: Upload test results to Codecov
-        env: ${{ fromJSON(inputs.envs) }}
-        if: (!cancelled()) && github.repository == 'apache/spark'
-        uses: codecov/codecov-action@v5
-        with:
-          report_type: 'test_results'
-          files: '**/target/test-reports/*.xml'
-          flags: ${{ env.PYTHON_TO_TEST }}-${{ inputs.branch }}-${{ inputs.os }}
-          name: PySpark-Test-Results
-          token: ${{ secrets.codecov_token }}
       - name: Upload test results to report
         env: ${{ fromJSON(inputs.envs) }}
         if: always()

BUG_TICKET.txt

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+Title: [PySpark] Consolidate per-eval-type logic in Python worker UDF execution
+Type: Improvement
+
+In the current Python worker UDF execution path, the logic for each {{PythonEvalType}} is spread across three layers:
+
+- *Wrapper functions* (e.g. {{wrap_arrow_batch_iter_udf}}, {{wrap_scalar_arrow_udf}}, {{wrap_grouped_map_arrow_udf}}, etc.) that pre-process UDF arguments and post-process results.
+- *Mapper blocks* in {{read_udfs}} that handle input column selection, UDF invocation, and output assembly — often shared across unrelated eval types.
+- *Serializers* (e.g. {{ArrowStreamUDFSerializer}}) that mix data transformation (struct flatten/wrap, pandas conversion) with I/O concerns.
+
+To understand the full data flow for a single eval type, one has to trace through all three layers and reconstruct the implicit contracts between them (e.g. the serializer yields {{[batch]}} lists, the mapper indexes into them, the wrapper attaches {{(result, type)}} tuples).
+
+The goal is to make each eval type self-contained: all input transformation, UDF invocation, result verification, and output transformation should be co-located in one place in {{read_udfs}}, with serializers reduced to pure I/O.
+
+This can be done incrementally, one eval type at a time, starting with the simpler Arrow-based ones.
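
The end state the ticket describes is easiest to see in code. A minimal sketch of one self-contained eval-type pipeline with the serializer reduced to pure I/O; run_arrow_batch_udf and verify_result are hypothetical names for illustration, not existing PySpark APIs:

# Sketch only: co-locates input column selection, UDF invocation, result
# verification, and output assembly for a single eval type, as the ticket
# proposes for read_udfs.
import pyarrow as pa

def verify_result(result):
    # Stand-in for the type/shape validation the wrapper layer does today.
    if not isinstance(result, pa.Array):
        raise TypeError(f"expected pyarrow.Array, got {type(result)}")

def run_arrow_batch_udf(udf, arg_offsets, batch_iter):
    # batch_iter yields plain RecordBatches from a serializer that only does I/O.
    for batch in batch_iter:
        args = [batch.column(o) for o in arg_offsets]   # input column selection
        result = udf(*args)                             # UDF invocation
        verify_result(result)                           # result verification
        yield pa.RecordBatch.from_arrays([result], ["_0"])  # output assembly

With each eval type expressed as one such function inside read_udfs, the implicit contracts between serializer, mapper, and wrapper disappear, which is the goal stated above.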

R/run-tests.sh

Lines changed: 2 additions & 2 deletions
@@ -30,9 +30,9 @@ if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then
 fi

 if [ -z "$SPARK_JARS" ]; then
-  SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+  SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Xss4M" --conf spark.executor.extraJavaOptions="-Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
 else
-  SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+  SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Xss4M" --conf spark.executor.extraJavaOptions="-Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
 fi

 FAILED=$((PIPESTATUS[0]||$FAILED))
