Skip to content

Commit b6362d0

Browse files
committed
GH-48582: [CI][GPU][C++][Python] Add new CUDA jobs using the new self-hosted runners (#48583)
### Rationale for this change

The CUDA jobs stopped working when the Voltron Data infrastructure went down. Together with ASF Infra, we have set up a [runs-on](https://runs-on.com/runners/gpu/) solution to provide CUDA runners.

### What changes are included in this PR?

Add the new workflow `cuda_extra.yml` with CI jobs that use the runs-on CUDA runners. Because the underlying instances have CUDA 12.9, the jobs to be run are:

- AMD64 Ubuntu 22 CUDA 11.7.1
- AMD64 Ubuntu 24 CUDA 12.9.0
- AMD64 Ubuntu 22 CUDA 11.7.1 Python
- AMD64 Ubuntu 24 CUDA 12.9.0 Python

A follow-up issue has been created to add jobs for CUDA 13, see: #48783

A new label `CI: Extra: CUDA` has also been created.

### Are these changes tested?

Yes, via CI.

### Are there any user-facing changes?

No.

* GitHub Issue: #48582

Authored-by: Raúl Cumplido <raulcumplido@gmail.com>
Signed-off-by: Raúl Cumplido <raulcumplido@gmail.com>
1 parent 8fb7a53 commit b6362d0

File tree

3 files changed

+136
-78
lines changed

3 files changed

+136
-78
lines changed

.github/workflows/cuda_extra.yml

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Workflow name.
name: CUDA Extra

# Triggers: any tag push, pull-request activity (including labelling), and a
# daily 06:00 schedule.
on:
  push:
    tags: ['**']
  pull_request:
    types: [labeled, opened, reopened, synchronize]
  schedule:
    - cron: |
        0 6 * * *

# One active run per repository + PR head branch (or commit SHA) + workflow;
# a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

# Token scopes are read-only.
permissions:
  actions: read
  contents: read
  pull-requests: read
42+
43+
jobs:
  # Calls the reusable check_labels workflow. The `docker` job below reads its
  # `force` and `ci-extra-labels` outputs to decide whether to run.
  check-labels:
    # Scheduled runs are restricted to apache/arrow; other events run anywhere.
    if: github.event_name != 'schedule' || github.repository == 'apache/arrow'
    uses: ./.github/workflows/check_labels.yml
    secrets: inherit
    with:
      parent-workflow: cuda_extra

  # Runs each matrix image through `archery docker run` on a GPU runner.
  docker:
    needs: check-labels
    name: ${{ matrix.title }}
    runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=capacity-optimized"
    # Run when check-labels forces it, or when the PR carries the
    # 'CI: Extra' or 'CI: Extra: CUDA' label.
    if: >-
      needs.check-labels.outputs.force == 'true' ||
      contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') ||
      contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: CUDA')
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        # Versions are quoted so YAML keeps them as strings: an unquoted
        # 24.04 is a float, and a value such as 24.10 would collapse to 24.1.
        include:
          - cuda: "12.9.0"
            ubuntu: "24.04"
            image: ubuntu-cuda-cpp
            title: AMD64 Ubuntu 24 CUDA 12.9.0
          - cuda: "11.7.1"
            ubuntu: "22.04"
            image: ubuntu-cuda-cpp
            title: AMD64 Ubuntu 22 CUDA 11.7.1
          - cuda: "12.9.0"
            ubuntu: "24.04"
            image: ubuntu-cuda-python
            title: AMD64 Ubuntu 24 CUDA 12.9.0 Python
          - cuda: "11.7.1"
            ubuntu: "22.04"
            image: ubuntu-cuda-python
            title: AMD64 Ubuntu 22 CUDA 11.7.1 Python
    env:
      ARCHERY_DEBUG: 1
      # Quoted so no YAML loader can read OFF as a boolean.
      ARROW_ENABLE_TIMING_TESTS: "OFF"
      DOCKER_VOLUME_PREFIX: ".docker/"
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Cache Docker Volumes
        uses: actions/cache@v5
        with:
          path: .docker
          # Exact hit only when cpp/** is unchanged; otherwise fall back to
          # the most recent cache for the same image.
          key: extra-${{ matrix.image }}-${{ hashFiles('cpp/**') }}
          restore-keys: extra-${{ matrix.image }}-
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3"
      - name: Setup Archery
        # The requirement is quoted so the shell never treats `[docker]` as a
        # glob character class.
        run: python3 -m pip install -e 'dev/archery[docker]'
      - name: Display NVIDIA SMI details
        run: |
          nvidia-smi
          nvidia-smi -L
          nvidia-smi -q -d Memory
      - name: Execute Docker Build
        # No matrix entry currently sets `continue-on-error` or `run-options`,
        # so these fall back to `false` and an empty string respectively.
        continue-on-error: ${{ matrix.continue-on-error || false }}
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
          CUDA: ${{ matrix.cuda }}
          UBUNTU: ${{ matrix.ubuntu }}
        run: |
          # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes
          sudo sysctl -w vm.mmap_rnd_bits=28
          source ci/scripts/util_enable_core_dumps.sh
          archery docker run ${{ matrix.run-options || '' }} ${{ matrix.image }}
      - name: Docker Push
        # NOTE(review): this workflow's `push` trigger only lists tags, so a
        # push event with `github.ref_name == 'main'` looks unreachable unless
        # a tag is named `main` - confirm whether images are meant to be
        # pushed from this workflow.
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        # A failed image push must not fail the job.
        continue-on-error: true
        run: archery docker push ${{ matrix.image }}

  # Calls the reusable report_ci workflow after scheduled runs.
  # NOTE(review): the job id says "cpp" although this is the CUDA workflow -
  # presumably carried over from another workflow; confirm before renaming,
  # since the id is part of the displayed check name.
  report-extra-cpp:
    # `always()` keeps this job from being skipped when `docker` fails.
    if: github.event_name == 'schedule' && always()
    needs:
      - docker
    uses: ./.github/workflows/report_ci.yml
    secrets: inherit

dev/tasks/docker-tests/github.cuda.yml

Lines changed: 0 additions & 52 deletions
This file was deleted.

dev/tasks/tasks.yml

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,6 @@ groups:
4141

4242
{############################# Testing tasks #################################}
4343

44-
cuda:
45-
- test-cuda-*
46-
4744
test:
4845
- test-*
4946

@@ -762,35 +759,12 @@ tasks:
762759
artifacts:
763760
- docs.tar.gz
764761

765-
############################## CUDA tests #################################
766-
767-
{% for ubuntu, cuda in [("22.04", "11.7.1"), ("24.04", "13.0.2")] %}
768-
test-cuda-cpp-ubuntu-{{ ubuntu }}-cuda-{{ cuda }}:
769-
ci: github
770-
template: docker-tests/github.cuda.yml
771-
params:
772-
env:
773-
CUDA: {{ cuda }}
774-
UBUNTU: {{ ubuntu }}
775-
image: ubuntu-cuda-cpp
776-
777-
test-cuda-python-ubuntu-{{ ubuntu }}-cuda-{{ cuda }}:
778-
ci: github
779-
template: docker-tests/github.cuda.yml
780-
params:
781-
env:
782-
CUDA: {{ cuda }}
783-
UBUNTU: {{ ubuntu }}
784-
image: ubuntu-cuda-python
785-
{% endfor %}
786-
787762
############################## Fuzz tests #################################
788763

789764
test-build-cpp-fuzz:
790765
ci: github
791766
template: fuzz-tests/github.oss-fuzz.yml
792767

793-
794768
############################## vcpkg tests ##################################
795769

796770
test-build-vcpkg-win:

0 commit comments

Comments
 (0)