
Commit a8d4d64

luraess, vchuravy, and simonbyrne authored
Add ROCm support (AMDGPU) (#572)
* Add ROCm (AMDGPU) support
* Add buildkite script

Co-authored-by: Valentin Churavy <[email protected]>
Co-authored-by: Simon Byrne <[email protected]>
1 parent d2b4be9 · commit a8d4d64

17 files changed (+171 −16 lines)

.buildkite/pipeline.yml

Lines changed: 91 additions & 3 deletions
@@ -7,8 +7,8 @@
           queue: "juliagpu"
           cuda: "11.0"
         env:
-          OPENMPI_VER: "4.0"
-          OPENMPI_VER_FULL: "4.0.3"
+          OPENMPI_VER: "4.1"
+          OPENMPI_VER_FULL: "4.1.4"
           UCX_VER: "1.12.1"
           CCACHE_DIR: "/root/ccache"
         commands: |
@@ -43,7 +43,7 @@
           - "mpi-prefix.tar.gz"
 
       - wait
-
+
       - label: "Tests -- Julia 1.6"
         plugins:
           - JuliaCI/julia#v1:
@@ -135,3 +135,91 @@
             import Pkg
             Pkg.test("MPI")
           '
+
+  - group: "ROCm"
+    key: "rocm"
+    steps:
+      - label: "Build OpenMPI"
+        key: "rocm-build-openmpi"
+        agents:
+          queue: "juliagpu"
+          rocm: "*" # todo fix ROCM version
+        env:
+          OPENMPI_VER: "4.1"
+          OPENMPI_VER_FULL: "4.1.4"
+          UCX_VER: "1.13-rc1"
+          CCACHE_DIR: "/root/ccache"
+        commands: |
+          echo "--- Install packages"
+          apt-get install --yes --no-install-recommends curl ccache
+          export PATH="/usr/lib/ccache/:$$PATH"
+          echo "--- Build UCX"
+          curl -L https://github.com/openucx/ucx/releases/download/v1.13.0-rc1/ucx-1.13.0.tar.gz --output ucx.tar.gz
+          tar -zxf ucx.tar.gz
+          pushd ucx-*
+          ./configure --with-rocm --enable-mt --prefix=$$(realpath ../mpi-prefix)
+          make -j
+          make install
+          popd
+          echo "--- Build OpenMPI"
+          curl -L https://download.open-mpi.org/release/open-mpi/v$${OPENMPI_VER}/openmpi-$${OPENMPI_VER_FULL}.tar.gz --output openmpi.tar.gz
+          tar -zxf openmpi.tar.gz
+          pushd openmpi-*
+          ./configure --with-ucx=$$(realpath ../mpi-prefix) --prefix=$$(realpath ../mpi-prefix)
+          make -j
+          make install
+          popd
+          echo "--- Package prefix"
+          tar -zcf mpi-prefix.tar.gz mpi-prefix/
+          echo "--- ccache stats"
+          ccache -s
+        artifact_paths:
+          - "mpi-prefix.tar.gz"
+
+      - wait
+
+      - label: "Tests -- Julia 1.7"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1.7"
+              persist_depot_dirs: packages,artifacts,compiled
+        agents:
+          queue: "juliagpu"
+          rocm: "*" # todo fix ROCM version
+        if: build.message !~ /\[skip tests\]/
+        timeout_in_minutes: 60
+        env:
+          JULIA_MPI_TEST_ARRAYTYPE: ROCArray
+          JULIA_MPI_TEST_NPROCS: 2
+          JULIA_MPI_PATH: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi"
+          OMPI_ALLOW_RUN_AS_ROOT: 1
+          OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
+          OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948
+          OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user?
+          JULIA_CUDA_MEMORY_POOL: "none"
+        commands: |
+          echo "--- Configure MPI"
+          buildkite-agent artifact download --step "rocm-build-openmpi" mpi-prefix.tar.gz .
+          mkdir -p $${JULIA_MPI_PATH}
+          tar -zxf mpi-prefix.tar.gz --strip-components 1 -C $${JULIA_MPI_PATH}
+          export PATH=$${JULIA_MPI_PATH}/bin:$${PATH}
+          export LD_LIBRARY_PATH=$${JULIA_MPI_PATH}/lib:$${LD_LIBRARY_PATH}
+
+          echo "--- Setup Julia packages"
+          julia --color=yes --project=. -e '
+            import Pkg
+            Pkg.develop(; path = joinpath(pwd(), "lib", "MPIPreferences"))
+          '
+          julia --color=yes --project=test -e '
+            using Pkg
+            Pkg.develop(path="lib/MPIPreferences")
+            using MPIPreferences
+            MPIPreferences.use_system_binary(export_prefs=true)
+            rm("test/Manifest.toml")
+          '
+
+          echo "+++ Run tests"
+          julia --color=yes --project=. -e '
+            import Pkg
+            Pkg.test("MPI")
+          '
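The "Setup Julia packages" commands in the new ROCm test step point MPI.jl at the Open MPI build produced by the build step via MPIPreferences. A minimal sketch of doing the same configuration by hand from an MPI.jl checkout (assuming the ROCm-aware MPI's binaries and libraries are already on `PATH`/`LD_LIBRARY_PATH`, as the step arranges above):

```julia
# Sketch of the pipeline's "Setup Julia packages" step, run from an MPI.jl checkout.
import Pkg

# Make the in-repo MPIPreferences package available to the active project.
Pkg.develop(path = joinpath(pwd(), "lib", "MPIPreferences"))

using MPIPreferences
# Record the system (here: ROCm-aware) MPI binary in LocalPreferences.toml.
MPIPreferences.use_system_binary(export_prefs = true)
```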

docs/src/configuration.md

Lines changed: 4 additions & 3 deletions
@@ -7,7 +7,7 @@ By default, MPI.jl will download and link against the following MPI implementati
 This is suitable for most single-node use cases, but for larger systems, such as HPC
 clusters or multi-GPU machines, you will probably want to configure against a
 system-provided MPI implementation in order to exploit features such as fast network
-interfaces and CUDA-aware MPI interfaces.
+interfaces and CUDA-aware or ROCm-aware MPI interfaces.
 
 The MPIPreferences.jl package allows the user to choose which MPI implementation to use in MPI.jl. It uses [Preferences.jl](https://github.com/JuliaPackaging/Preferences.jl) to
 configure the MPI backend for each project separately. This provides
@@ -134,8 +134,9 @@ julia> MPIPreferences.use_system_binary()
 The test suite can also be modified by the following variables:
 
 - `JULIA_MPI_TEST_NPROCS`: How many ranks to use within the tests
-- `JULIA_MPI_TEST_ARRAYTYPE`: Set to `CuArray` to test the CUDA-aware interface with
-  [`CUDA.CuArray](https://github.com/JuliaGPU/CUDA.jl) buffers.
+- `JULIA_MPI_TEST_ARRAYTYPE`: Set to `CuArray` or `ROCArray` to test the CUDA-aware interface with
+  [`CUDA.CuArray`](https://github.com/JuliaGPU/CUDA.jl) buffers or the ROCm-aware interface with
+  [`AMDGPU.ROCArray`](https://github.com/JuliaGPU/AMDGPU.jl) buffers.
 - `JULIA_MPI_TEST_BINARY`: Check that the specified MPI binary is used for the tests
 - `JULIA_MPI_TEST_ABI`: Check that the specified MPI ABI is used for the tests

docs/src/knownissues.md

Lines changed: 17 additions & 1 deletion
@@ -97,7 +97,7 @@ _More about CUDA.jl [memory environment-variables](https://cuda.juliagpu.org/sta
 
 Make sure to:
 - Have MPI and CUDA on path (or module loaded) that were used to build the CUDA-aware MPI
-- Make sure to have:
+- Set the following environment variables:
   ```
   export JULIA_CUDA_MEMORY_POOL=none
   export JULIA_CUDA_USE_BINARYBUILDER=false
@@ -114,6 +114,22 @@ Make sure to:
 
 After that, it may be preferred to run the Julia MPI script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/11)) launching it from a shell script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/4)).
 
+## ROCm-aware MPI
+
+### Hints to ensure ROCm-aware MPI to be functional
+
+Make sure to:
+- Have MPI and ROCm on path (or module loaded) that were used to build the ROCm-aware MPI
+- Add AMDGPU and MPI packages in Julia:
+  ```
+  julia -e 'using Pkg; pkg"add AMDGPU"; pkg"add MPI"; using MPI; MPI.use_system_binary()'
+  ```
+- Then in Julia, upon loading MPI and AMDGPU modules, you can check
+  - AMDGPU version: `AMDGPU.versioninfo()`
+  - If you are using the correct MPI implementation: `MPI.identify_implementation()`
+
+After that, [this script](https://gist.github.com/luraess/c228ec08629737888a18c6a1e397643c) can be used to verify if ROCm-aware MPI is functional (modified after the CUDA-aware version from [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/11)). It may be preferred to run the Julia ROCm-aware MPI script by launching it from a shell script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/4)).
+
 ## Microsoft MPI
 
 ### Custom operators on 32-bit Windows
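For reference, a minimal sketch of the kind of check the linked verification gist performs (this is not the gist itself; the buffer size and assertion are illustrative), launched on two or more ranks:

```julia
# Minimal ROCm-aware MPI smoke test (sketch).
# Launch with e.g.: mpiexecjl -n 2 julia --project=. rocm_check.jl
using MPI
using AMDGPU

MPI.Init()
comm   = MPI.COMM_WORLD
nprocs = MPI.Comm_size(comm)

# Device buffers: a ROCm-aware MPI can reduce these without staging through the host.
send_buf = AMDGPU.ROCArray(ones(Float64, 1024))
recv_buf = AMDGPU.ROCArray(zeros(Float64, 1024))

MPI.Allreduce!(send_buf, recv_buf, +, comm)

@assert all(Array(recv_buf) .== nprocs)
MPI.Comm_rank(comm) == 0 && println("ROCm-aware Allreduce over $nprocs ranks OK")
```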

docs/src/usage.md

Lines changed: 10 additions & 2 deletions
@@ -72,8 +72,16 @@ $ mpiexecjl --project=/path/to/project -n 20 julia script.jl
 
 If your MPI implementation has been compiled with CUDA support, then `CUDA.CuArray`s (from the
 [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package) can be passed directly as
-send and receive buffers for point-to-point and collective operations (they may also work
-with one-sided operations, but these are not often supported).
+send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported).
 
 If using Open MPI, the status of CUDA support can be checked via the
 [`MPI.has_cuda()`](@ref) function.
+
+## ROCm-aware MPI support
+
+If your MPI implementation has been compiled with ROCm support (AMDGPU), then `AMDGPU.ROCArray`s (from the
+[AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) package) can be passed directly as send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported).
+
+Successfully running [alltoall_test_rocm.jl](https://gist.github.com/luraess/c228ec08629737888a18c6a1e397643c) should confirm that your MPI implementation has ROCm (AMDGPU) support enabled. Moreover, successfully running [alltoall_test_rocm_mulitgpu.jl](https://gist.github.com/luraess/d478b3f98eae984931fd39a7158f4b9e) should confirm that your ROCm-aware MPI implementation can use multiple AMD GPUs (one GPU per rank).
+
+The status of ROCm (AMDGPU) support cannot currently be queried.
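As a minimal illustration of the statement above (assuming a ROCm-aware MPI and an AMD GPU visible per rank; the buffer size and root rank are arbitrary), a `ROCArray` can be handed straight to a collective:

```julia
# Broadcast a device buffer directly; no explicit host staging is needed with ROCm-aware MPI.
using MPI
using AMDGPU

MPI.Init()
comm = MPI.COMM_WORLD

# Each rank starts with its own value; rank 0's buffer overwrites the others.
A = AMDGPU.ROCArray(fill(Float64(MPI.Comm_rank(comm)), 4))
MPI.Bcast!(A, 0, comm)

@assert all(Array(A) .== 0.0)
```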

src/MPI.jl

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ function __init__()
 
     run_load_time_hooks()
 
+    @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" include("rocm.jl")
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl")
 end

src/buffers.jl

Lines changed: 4 additions & 2 deletions
@@ -44,6 +44,7 @@ Currently supported are:
 - `Array`
 - `SubArray`
 - `CUDA.CuArray` if CUDA.jl is loaded.
+- `AMDGPU.ROCArray` if AMDGPU.jl is loaded.
 
 Additionally, certain sentinel values can be used, e.g. `MPI_IN_PLACE` or `MPI_BOTTOM`.
 """
@@ -102,8 +103,9 @@ and `datatype`. Methods are provided for
 
 - `Ref`
 - `Array`
-- `CUDA.CuArray` if CUDA.jl is loaded
-- `SubArray`s of an `Array` or `CUDA.CuArray` where the layout is contiguous, sequential or
+- `CUDA.CuArray` if CUDA.jl is loaded.
+- `AMDGPU.ROCArray` if AMDGPU.jl is loaded.
+- `SubArray`s of an `Array`, `CUDA.CuArray` or `AMDGPU.ROCArray` where the layout is contiguous, sequential or
   blocked.
 
 # See also

src/rocm.jl

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import .AMDGPU
+
+function Base.cconvert(::Type{MPIPtr}, A::AMDGPU.ROCArray{T}) where T
+    A
+end
+
+function Base.unsafe_convert(::Type{MPIPtr}, X::AMDGPU.ROCArray{T}) where T
+    reinterpret(MPIPtr, Base.unsafe_convert(Ptr{T}, X.buf.ptr+X.offset))
+end
+
+# only need to define this for strided arrays: all others can be handled by generic machinery
+function Base.unsafe_convert(::Type{MPIPtr}, V::SubArray{T,N,P,I,true}) where {T,N,P<:AMDGPU.ROCArray,I}
+    X = parent(V)
+    pX = Base.unsafe_convert(Ptr{T}, X)
+    pV = pX + ((V.offset1 + V.stride1) - first(LinearIndices(X)))*sizeof(T)
+    return reinterpret(MPIPtr, pV)
+end
+
+function Buffer(arr::AMDGPU.ROCArray)
+    Buffer(arr, Cint(length(arr)), Datatype(eltype(arr)))
+end
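These conversions are what let a `ROCArray` flow through MPI.jl's generic buffer machinery. A hedged sketch of the effect (assuming AMDGPU.jl is loaded, so the `@require` hook in `src/MPI.jl` includes this file):

```julia
using MPI
using AMDGPU   # triggers inclusion of src/rocm.jl via the @require hook

MPI.Init()

A = AMDGPU.ROCArray(zeros(Float32, 8))

# The Buffer method defined above wraps the device array together with its
# element count and MPI datatype, so it can be passed to communication calls.
buf = MPI.Buffer(A)
@show buf.count      # Cint(8)
@show buf.datatype   # MPI datatype corresponding to Float32
```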

test/Project.toml

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

test/common.jl

Lines changed: 10 additions & 0 deletions
@@ -5,6 +5,16 @@ if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray"
     import CUDA
     ArrayType = CUDA.CuArray
    synchronize() = CUDA.synchronize()
+elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray"
+    import AMDGPU
+    ArrayType = AMDGPU.ROCArray
+    function synchronize()
+        # TODO: AMDGPU synchronization story is complicated. HSA does not provide a consistent notion of global queues. We need a mechanism for all GPUArrays.jl provided kernels to be synchronized.
+        queue = AMDGPU.get_default_queue()
+        barrier = AMDGPU.barrier_and!(queue, AMDGPU.active_kernels(queue))
+        AMDGPU.HIP.hipDeviceSynchronize() # Sync all HIP kernels e.g. BLAS. N.B. this is blocking Julia progress
+        wait(barrier)
+    end
 else
     ArrayType = Array
     synchronize() = nothing
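A hedged sketch of how the test suite is meant to use these two definitions (the call and buffer size here are illustrative, not taken from a specific test file):

```julia
# Fill a device (or host) buffer, wait for queued GPU work, then hand it to MPI.
using MPI
MPI.Init()

A = ArrayType(ones(Float64, 16))   # AMDGPU.ROCArray when JULIA_MPI_TEST_ARRAYTYPE=ROCArray
synchronize()                      # backend-specific barrier defined above
MPI.Allreduce!(A, +, MPI.COMM_WORLD)
```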

test/runtests.jl

Lines changed: 5 additions & 0 deletions
@@ -9,6 +9,11 @@ if get(ENV, "JULIA_MPI_TEST_ARRAYTYPE", "") == "CuArray"
     CUDA.version()
     CUDA.precompile_runtime()
     ArrayType = CUDA.CuArray
+elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray"
+    import AMDGPU
+    AMDGPU.versioninfo()
+    # DEBUG: currently no `precompile_runtime()` functionality is implemented in AMDGPU.jl. If needed, it could be added by analogy with CUDA; no use of caps in AMDGPU.jl, but https://github.com/JuliaGPU/AMDGPU.jl/blob/cfaade146977594bf18e14b285ee3a9c84fbc7f2/src/execution.jl#L351-L357 shows how to construct a CompilerJob for a given agent.
+    ArrayType = AMDGPU.ROCArray
 else
     ArrayType = Array
 end
