|
2 | 2 | Post Stack Inversion - 3D with CUDA-Aware MPI |
3 | 3 | ============================================= |
4 | 4 | This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` |
5 | | -tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via |
6 | | -CUDA-Aware MPI. |
| 5 | +tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs communicating
| 6 | +via MPI. |
7 | 7 | """ |
8 | 8 |
|
9 | 9 | import numpy as np |
|
18 | 18 |
|
19 | 19 | import pylops_mpi |
20 | 20 |
|
| 21 | +############################################################################### |
| 22 | +# The standard MPI communicator is used in this example, so there is no need |
| 23 | +# for any initialization. However, we need to assign our GPU resources to the
| 24 | +# different ranks. Here we decide to assign a unique GPU to each process if |
| 25 | +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we |
| 26 | +# start assigning more than one GPU to the available ranks. Note that this |
| 27 | +# approach will work equally well if we have a multi-node multi-GPU setup, where |
| 28 | +# each node has one or more GPUs. |
| 29 | + |
21 | 30 | plt.close("all") |
22 | 31 | rank = MPI.COMM_WORLD.Get_rank() |
23 | 32 | device_count = cp.cuda.runtime.getDeviceCount() |
24 | | -cp.cuda.Device(rank % device_count).use() |
| 33 | +cp.cuda.Device(rank % device_count).use()
25 | 34 |
|
26 | 35 | ############################################################################### |
27 | 36 | # Let's start by defining all the parameters required by the |
28 | 37 | # :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator. |
29 | | -# Note that this section is exactly the same as the one in the MPI example as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) |
| 38 | +# Note that this section is exactly the same as the one in the MPI example as |
| 39 | +# we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.)
30 | 40 |
|
31 | 41 | # Model |
32 | 42 | model = np.load("../testdata/avo/poststack_model.npz") |
|
58 | 68 | mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i)) |
59 | 69 |
|
60 | 70 | ############################################################################### |
61 | | -# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` objects. |
62 | | -# Compared to the MPI tutorial, we need to make sure that we set CuPy as the engine and |
63 | | -# use CuPy arrays |
| 71 | +# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` |
| 72 | +# objects. Compared to the MPI tutorial, we need to make sure that we set ``cupy`` |
| 73 | +# as the engine and fill the distributed arrays with CuPy arrays. |
64 | 74 |
|
65 | | -m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy") |
| 75 | +m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, |
| 76 | + engine="cupy") |
66 | 77 | m3d_dist[:] = cp.asarray(m3d_i.flatten()) |
67 | 78 |
|
68 | 79 | # Do the same thing for smooth model |
69 | | -mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy") |
| 80 | +mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, |
| 81 | + engine="cupy") |
70 | 82 | mback3d_dist[:] = cp.asarray(mback3d_i.flatten()) |
71 | 83 |
|
72 | 84 | ############################################################################### |
73 | | -# For PostStackLinearModelling, there is no change needed to have it run with CUDA-Aware MPI. |
74 | | -# This PyLops operator has GPU-support (https://pylops.readthedocs.io/en/stable/gpu.html) |
75 | | -# so it can run with DistributedArray whose engine is Cupy |
| 85 | +# For PostStackLinearModelling, there is no change needed to have it run with |
| 86 | +# MPI. This PyLops operator has GPU-support |
| 87 | +# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on
| 88 | +# distributed arrays with engine set to CuPy. |
76 | 89 |
|
77 | 90 | PPop = PoststackLinearModelling(cp.asarray(wav.astype(np.float32)), nt0=nz, |
78 | 91 | spatdims=(ny_i, nx)) |
79 | 92 | Top = Transpose((ny_i, nx, nz), (2, 0, 1)) |
80 | 93 | BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ]) |
81 | 94 |
|
82 | 95 | ############################################################################### |
83 | | -# This computation will be done in GPU. The call :code:`asarray()` triggers the CUDA-aware |
84 | | -# MPI communication (gather result from each GPU). |
85 | | -# But array :code:`d` and :code:`d_0` still live in GPU memory |
| 96 | +# This computation will be done on the GPU(s). The call :code:`asarray()` |
| 97 | +# triggers the MPI communication (gather results from each GPU). |
| 98 | +# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.
86 | 99 |
|
87 | 100 | d_dist = BDiag @ m3d_dist |
88 | 101 | d_local = d_dist.local_array.reshape((ny_i, nx, nz)) |
|
91 | 104 | d_0 = d_dist.asarray().reshape((ny, nx, nz)) |
92 | 105 |
|
93 | 106 | ############################################################################### |
94 | | -# Inversion using CGLS solver - There is no code change to run on CUDA-aware |
95 | | -# MPI (this is handled through MPI operator and DistributedArray) |
| 107 | +# Inversion using CGLS solver - no code change is required to run the solver |
| 108 | +# with CUDA-aware MPI (this is handled by the MPI operator and DistributedArray) |
96 | 109 | # In this particular case, the local computation will be done in GPU. |
97 | 110 | # Collective communication calls will be carried through MPI GPU-to-GPU. |
98 | 111 |
|
99 | 112 | # Inversion using CGLS solver |
100 | | -minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, x0=mback3d_dist, niter=100, show=True)[0] |
| 113 | +minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, |
| 114 | + x0=mback3d_dist, |
| 115 | + niter=100, show=True)[0] |
101 | 116 | minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz)) |
102 | 117 |
|
103 | 118 | ############################################################################### |
104 | 119 |
|
105 | 120 | # Regularized inversion with normal equations |
106 | 121 | epsR = 1e2 |
107 | | -LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), weights=(1, 1, 1), |
108 | | - sampling=(1, 1, 1), dtype=BDiag.dtype) |
| 122 | +LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), |
| 123 | + weights=(1, 1, 1), |
| 124 | + sampling=(1, 1, 1), |
| 125 | + dtype=BDiag.dtype) |
109 | 126 | NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp |
110 | 127 | dnorm_dist = BDiag.H @ d_dist |
111 | | -minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, x0=mback3d_dist, niter=100, show=True)[0] |
| 128 | +minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, |
| 129 | + x0=mback3d_dist, |
| 130 | + niter=100, show=True)[0] |
112 | 131 | minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz)) |
113 | 132 |
|
114 | 133 | ############################################################################### |
|
120 | 139 | dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist]) |
121 | 140 |
|
122 | 141 | dnorm_dist = BDiag.H @ d_dist |
123 | | -minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, x0=mback3d_dist, niter=100, show=True)[0] |
| 142 | +minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, |
| 143 | + x0=mback3d_dist, |
| 144 | + niter=100, show=True)[0] |
124 | 145 | minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz)) |
125 | 146 |
|
126 | 147 | ############################################################################### |
127 | | -# To plot the inversion results, the array must be copied back to cpu via :code:`get()` |
| 148 | +# Finally we visualize the results. Note that the array must be copied back |
| 149 | +# to the CPU by calling the :code:`get()` method on the CuPy arrays. |
128 | 150 |
|
129 | 151 | if rank == 0: |
130 | 152 | # Check the distributed implementation gives the same result |
|
0 commit comments