
Commit a8d4d64

luraess, vchuravy, and simonbyrne authored
Add ROCm support (AMDGPU) (#572)
* Add ROCm (AMDGPU) support
* Add buildkite script

Co-authored-by: Valentin Churavy <[email protected]>
Co-authored-by: Simon Byrne <[email protected]>
1 parent d2b4be9 · commit a8d4d64

17 files changed (+171 −16 lines)

.buildkite/pipeline.yml

Lines changed: 91 additions & 3 deletions
@@ -7,8 +7,8 @@
           queue: "juliagpu"
           cuda: "11.0"
         env:
-          OPENMPI_VER: "4.0"
-          OPENMPI_VER_FULL: "4.0.3"
+          OPENMPI_VER: "4.1"
+          OPENMPI_VER_FULL: "4.1.4"
           UCX_VER: "1.12.1"
           CCACHE_DIR: "/root/ccache"
         commands: |
@@ -43,7 +43,7 @@
           - "mpi-prefix.tar.gz"
 
       - wait
-
+
       - label: "Tests -- Julia 1.6"
         plugins:
           - JuliaCI/julia#v1:
@@ -135,3 +135,91 @@
             import Pkg
             Pkg.test("MPI")
           '
+
+  - group: "ROCm"
+    key: "rocm"
+    steps:
+      - label: "Build OpenMPI"
+        key: "rocm-build-openmpi"
+        agents:
+          queue: "juliagpu"
+          rocm: "*" # todo fix ROCM version
+        env:
+          OPENMPI_VER: "4.1"
+          OPENMPI_VER_FULL: "4.1.4"
+          UCX_VER: "1.13-rc1"
+          CCACHE_DIR: "/root/ccache"
+        commands: |
+          echo "--- Install packages"
+          apt-get install --yes --no-install-recommends curl ccache
+          export PATH="/usr/lib/ccache/:$$PATH"
+          echo "--- Build UCX"
+          curl -L https://github.com/openucx/ucx/releases/download/v1.13.0-rc1/ucx-1.13.0.tar.gz --output ucx.tar.gz
+          tar -zxf ucx.tar.gz
+          pushd ucx-*
+          ./configure --with-rocm --enable-mt --prefix=$$(realpath ../mpi-prefix)
+          make -j
+          make install
+          popd
+          echo "--- Build OpenMPI"
+          curl -L https://download.open-mpi.org/release/open-mpi/v$${OPENMPI_VER}/openmpi-$${OPENMPI_VER_FULL}.tar.gz --output openmpi.tar.gz
+          tar -zxf openmpi.tar.gz
+          pushd openmpi-*
+          ./configure --with-ucx=$$(realpath ../mpi-prefix) --prefix=$$(realpath ../mpi-prefix)
+          make -j
+          make install
+          popd
+          echo "--- Package prefix"
+          tar -zcf mpi-prefix.tar.gz mpi-prefix/
+          echo "--- ccache stats"
+          ccache -s
+        artifact_paths:
+          - "mpi-prefix.tar.gz"
+
+      - wait
+
+      - label: "Tests -- Julia 1.7"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1.7"
+              persist_depot_dirs: packages,artifacts,compiled
+        agents:
+          queue: "juliagpu"
+          rocm: "*" # todo fix ROCM version
+        if: build.message !~ /\[skip tests\]/
+        timeout_in_minutes: 60
+        env:
+          JULIA_MPI_TEST_ARRAYTYPE: ROCArray
+          JULIA_MPI_TEST_NPROCS: 2
+          JULIA_MPI_PATH: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi"
+          OMPI_ALLOW_RUN_AS_ROOT: 1
+          OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
+          OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948
+          OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user?
+          JULIA_CUDA_MEMORY_POOL: "none"
+        commands: |
+          echo "--- Configure MPI"
+          buildkite-agent artifact download --step "rocm-build-openmpi" mpi-prefix.tar.gz .
+          mkdir -p $${JULIA_MPI_PATH}
+          tar -zxf mpi-prefix.tar.gz --strip-components 1 -C $${JULIA_MPI_PATH}
+          export PATH=$${JULIA_MPI_PATH}/bin:$${PATH}
+          export LD_LIBRARY_PATH=$${JULIA_MPI_PATH}/lib:$${LD_LIBRARY_PATH}
+
+          echo "--- Setup Julia packages"
+          julia --color=yes --project=. -e '
+            import Pkg
+            Pkg.develop(; path = joinpath(pwd(), "lib", "MPIPreferences"))
+          '
+          julia --color=yes --project=test -e '
+            using Pkg
+            Pkg.develop(path="lib/MPIPreferences")
+            using MPIPreferences
+            MPIPreferences.use_system_binary(export_prefs=true)
+            rm("test/Manifest.toml")
+          '
+
+          echo "+++ Run tests"
+          julia --color=yes --project=. -e '
+            import Pkg
+            Pkg.test("MPI")
+          '
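The "Setup Julia packages" commands in the new ROCm test step point MPI.jl at the Open MPI build produced by the build step via MPIPreferences. A minimal sketch of doing the same configuration by hand from an MPI.jl checkout (assuming the ROCm-aware MPI's binaries and libraries are already on `PATH`/`LD_LIBRARY_PATH`, as the step arranges above):

```julia
# Sketch of the pipeline's "Setup Julia packages" step, run from an MPI.jl checkout.
import Pkg

# Make the in-repo MPIPreferences package available to the active project.
Pkg.develop(path = joinpath(pwd(), "lib", "MPIPreferences"))

using MPIPreferences
# Record the system (here: ROCm-aware) MPI binary in LocalPreferences.toml.
MPIPreferences.use_system_binary(export_prefs = true)
```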

docs/src/configuration.md

Lines changed: 4 additions & 3 deletions
@@ -7,7 +7,7 @@ By default, MPI.jl will download and link against the following MPI implementati
 This is suitable for most single-node use cases, but for larger systems, such as HPC
 clusters or multi-GPU machines, you will probably want to configure against a
 system-provided MPI implementation in order to exploit features such as fast network
-interfaces and CUDA-aware MPI interfaces.
+interfaces and CUDA-aware or ROCm-aware MPI interfaces.
 
 The MPIPreferences.jl package allows the user to choose which MPI implementation to use in MPI.jl. It uses [Preferences.jl](https://github.com/JuliaPackaging/Preferences.jl) to
 configure the MPI backend for each project separately. This provides
@@ -134,8 +134,9 @@ julia> MPIPreferences.use_system_binary()
 The test suite can also be modified by the following variables:
 
 - `JULIA_MPI_TEST_NPROCS`: How many ranks to use within the tests
-- `JULIA_MPI_TEST_ARRAYTYPE`: Set to `CuArray` to test the CUDA-aware interface with
-  [`CUDA.CuArray](https://github.com/JuliaGPU/CUDA.jl) buffers.
+- `JULIA_MPI_TEST_ARRAYTYPE`: Set to `CuArray` or `ROCArray` to test the CUDA-aware interface with
+  [`CUDA.CuArray`](https://github.com/JuliaGPU/CUDA.jl) buffers or the ROCm-aware interface with
+  [`AMDGPU.ROCArray`](https://github.com/JuliaGPU/AMDGPU.jl) buffers.
 - `JULIA_MPI_TEST_BINARY`: Check that the specified MPI binary is used for the tests
 - `JULIA_MPI_TEST_ABI`: Check that the specified MPI ABI is used for the tests

docs/src/knownissues.md

Lines changed: 17 additions & 1 deletion
@@ -97,7 +97,7 @@ _More about CUDA.jl [memory environment-variables](https://cuda.juliagpu.org/sta
 
 Make sure to:
 - Have MPI and CUDA on path (or module loaded) that were used to build the CUDA-aware MPI
-- Make sure to have:
+- Set the following environment variables:
   ```
   export JULIA_CUDA_MEMORY_POOL=none
   export JULIA_CUDA_USE_BINARYBUILDER=false
@@ -114,6 +114,22 @@ Make sure to:
 
 After that, it may be preferred to run the Julia MPI script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/11)) launching it from a shell script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/4)).
 
+## ROCm-aware MPI
+
+### Hints to ensure ROCm-aware MPI to be functional
+
+Make sure to:
+- Have MPI and ROCm on path (or module loaded) that were used to build the ROCm-aware MPI
+- Add AMDGPU and MPI packages in Julia:
+  ```
+  julia -e 'using Pkg; pkg"add AMDGPU"; pkg"add MPI"; using MPI; MPI.use_system_binary()'
+  ```
+- Then in Julia, upon loading MPI and AMDGPU modules, you can check
+  - AMDGPU version: `AMDGPU.versioninfo()`
+  - If you are using the correct MPI implementation: `MPI.identify_implementation()`
+
+After that, [this script](https://gist.github.com/luraess/c228ec08629737888a18c6a1e397643c) can be used to verify if ROCm-aware MPI is functional (modified after the CUDA-aware version from [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/11)). It may be preferred to run the Julia ROCm-aware MPI script by launching it from a shell script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/4)).
+
 ## Microsoft MPI
 
 ### Custom operators on 32-bit Windows
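For reference, a minimal sketch of the kind of check the linked verification gist performs (this is not the gist itself; the buffer size and assertion are illustrative), launched on two or more ranks:

```julia
# Minimal ROCm-aware MPI smoke test (sketch).
# Launch with e.g.: mpiexecjl -n 2 julia --project=. rocm_check.jl
using MPI
using AMDGPU

MPI.Init()
comm   = MPI.COMM_WORLD
nprocs = MPI.Comm_size(comm)

# Device buffers: a ROCm-aware MPI can reduce these without staging through the host.
send_buf = AMDGPU.ROCArray(ones(Float64, 1024))
recv_buf = AMDGPU.ROCArray(zeros(Float64, 1024))

MPI.Allreduce!(send_buf, recv_buf, +, comm)

@assert all(Array(recv_buf) .== nprocs)
MPI.Comm_rank(comm) == 0 && println("ROCm-aware Allreduce over $nprocs ranks OK")
```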

docs/src/usage.md

Lines changed: 10 additions & 2 deletions
@@ -72,8 +72,16 @@ $ mpiexecjl --project=/path/to/project -n 20 julia script.jl
 
 If your MPI implementation has been compiled with CUDA support, then `CUDA.CuArray`s (from the
 [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package) can be passed directly as
-send and receive buffers for point-to-point and collective operations (they may also work
-with one-sided operations, but these are not often supported).
+send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported).
 
 If using Open MPI, the status of CUDA support can be checked via the
 [`MPI.has_cuda()`](@ref) function.
+
+## ROCm-aware MPI support
+
+If your MPI implementation has been compiled with ROCm support (AMDGPU), then `AMDGPU.ROCArray`s (from the
+[AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) package) can be passed directly as send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported).
+
+Successfully running [alltoall_test_rocm.jl](https://gist.github.com/luraess/c228ec08629737888a18c6a1e397643c) should confirm that your MPI implementation has ROCm (AMDGPU) support enabled. Moreover, successfully running [alltoall_test_rocm_mulitgpu.jl](https://gist.github.com/luraess/d478b3f98eae984931fd39a7158f4b9e) should confirm that your ROCm-aware MPI implementation can use multiple AMD GPUs (one GPU per rank).
+
+The status of ROCm (AMDGPU) support cannot currently be queried.
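As a minimal illustration of the statement above (assuming a ROCm-aware MPI and an AMD GPU visible per rank; the buffer size and root rank are arbitrary), a `ROCArray` can be handed straight to a collective:

```julia
# Broadcast a device buffer directly; no explicit host staging is needed with ROCm-aware MPI.
using MPI
using AMDGPU

MPI.Init()
comm = MPI.COMM_WORLD

# Each rank starts with its own value; rank 0's buffer overwrites the others.
A = AMDGPU.ROCArray(fill(Float64(MPI.Comm_rank(comm)), 4))
MPI.Bcast!(A, 0, comm)

@assert all(Array(A) .== 0.0)
```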

src/MPI.jl

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ function __init__()
 
     run_load_time_hooks()
 
+    @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" include("rocm.jl")
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
 end

src/buffers.jl

Lines changed: 4 additions & 2 deletions
@@ -44,6 +44,7 @@ Currently supported are:
 - `Array`
 - `SubArray`
 - `CUDA.CuArray` if CUDA.jl is loaded.
+- `AMDGPU.ROCArray` if AMDGPU.jl is loaded.
 
 Additionally, certain sentinel values can be used, e.g. `MPI_IN_PLACE` or `MPI_BOTTOM`.
 """
@@ -102,8 +103,9 @@ and `datatype`. Methods are provided for
 
 - `Ref`
 - `Array`
-- `CUDA.CuArray` if CUDA.jl is loaded
-- `SubArray`s of an `Array` or `CUDA.CuArray` where the layout is contiguous, sequential or
+- `CUDA.CuArray` if CUDA.jl is loaded.
+- `AMDGPU.ROCArray` if AMDGPU.jl is loaded.
+- `SubArray`s of an `Array`, `CUDA.CuArray` or `AMDGPU.ROCArray` where the layout is contiguous, sequential or
   blocked.
 
 # See also

src/rocm.jl

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import .AMDGPU
+
+function Base.cconvert(::Type{MPIPtr}, A::AMDGPU.ROCArray{T}) where T
+    A
+end
+
+function Base.unsafe_convert(::Type{MPIPtr}, X::AMDGPU.ROCArray{T}) where T
+    reinterpret(MPIPtr, Base.unsafe_convert(Ptr{T}, X.buf.ptr+X.offset))
+end
+
+# only need to define this for strided arrays: all others can be handled by generic machinery
+function Base.unsafe_convert(::Type{MPIPtr}, V::SubArray{T,N,P,I,true}) where {T,N,P<:AMDGPU.ROCArray,I}
+    X = parent(V)
+    pX = Base.unsafe_convert(Ptr{T}, X)
+    pV = pX + ((V.offset1 + V.stride1) - first(LinearIndices(X)))*sizeof(T)
+    return reinterpret(MPIPtr, pV)
+end
+
+function Buffer(arr::AMDGPU.ROCArray)
+    Buffer(arr, Cint(length(arr)), Datatype(eltype(arr)))
+end
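These conversions are what let a `ROCArray` flow through MPI.jl's generic buffer machinery. A hedged sketch of the effect (assuming AMDGPU.jl is loaded, so the `@require` hook in `src/MPI.jl` includes this file):

```julia
using MPI
using AMDGPU   # triggers inclusion of src/rocm.jl via the @require hook

MPI.Init()

A = AMDGPU.ROCArray(zeros(Float32, 8))

# The Buffer method defined above wraps the device array together with its
# element count and MPI datatype, so it can be passed to communication calls.
buf = MPI.Buffer(A)
@show buf.count      # Cint(8)
@show buf.datatype   # MPI datatype corresponding to Float32
```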

test/Project.toml

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

test/common.jl

Lines changed: 10 additions & 0 deletions
@@ -5,6 +5,16 @@ if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray"
     import CUDA
     ArrayType = CUDA.CuArray
    synchronize() = CUDA.synchronize()
+elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray"
+    import AMDGPU
+    ArrayType = AMDGPU.ROCArray
+    function synchronize()
+        # TODO: AMDGPU synchronization story is complicated. HSA does not provide a consistent notion of global queues. We need a mechanism for all GPUArrays.jl provided kernels to be synchronized.
+        queue = AMDGPU.get_default_queue()
+        barrier = AMDGPU.barrier_and!(queue, AMDGPU.active_kernels(queue))
+        AMDGPU.HIP.hipDeviceSynchronize() # Sync all HIP kernels e.g. BLAS. N.B. this is blocking Julia progress
+        wait(barrier)
+    end
 else
     ArrayType = Array
     synchronize() = nothing
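A hedged sketch of how the test suite is meant to use these two definitions (the call and buffer size here are illustrative, not taken from a specific test file):

```julia
# Fill a device (or host) buffer, wait for queued GPU work, then hand it to MPI.
using MPI
MPI.Init()

A = ArrayType(ones(Float64, 16))   # AMDGPU.ROCArray when JULIA_MPI_TEST_ARRAYTYPE=ROCArray
synchronize()                      # backend-specific barrier defined above
MPI.Allreduce!(A, +, MPI.COMM_WORLD)
```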

test/runtests.jl

Lines changed: 5 additions & 0 deletions
@@ -9,6 +9,11 @@ if get(ENV, "JULIA_MPI_TEST_ARRAYTYPE", "") == "CuArray"
     CUDA.version()
     CUDA.precompile_runtime()
     ArrayType = CUDA.CuArray
+elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray"
+    import AMDGPU
+    AMDGPU.versioninfo()
+    # DEBUG: currently no `precompile_runtime()` functionality is implemented in AMDGPU.jl. If needed, it could be added by analogy with CUDA; no use of caps in AMDGPU.jl, but https://github.com/JuliaGPU/AMDGPU.jl/blob/cfaade146977594bf18e14b285ee3a9c84fbc7f2/src/execution.jl#L351-L357 shows how to construct a CompilerJob for a given agent.
+    ArrayType = AMDGPU.ROCArray
 else
     ArrayType = Array
 end
