|
1 | 1 | r""" |
2 | | -Post Stack Inversion - 3D with Nvidia's NCCL |
3 | | -========================= |
4 | | -This tutorial is similar to poststack.py but it shows how to run PyLops-MPI in multi-GPU environment |
5 | | -and have those GPUs communicate via NCCL |
| 2 | +Post Stack Inversion - 3D with NCCL |
| 3 | +============================================ |
| 4 | +This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` tutorial, where PyLops-MPI is run in a multi-GPU setting with GPUs communicating via NCCL. |
6 | 5 | """ |
7 | 6 |
|
8 | 7 | import numpy as np |
|
21 | 20 |
|
22 | 21 | plt.close("all") |
23 | 22 | rank = MPI.COMM_WORLD.Get_rank() |
24 | | -# size = MPI.COMM_WORLD.Get_size() |
25 | 23 |
|
26 | 24 | ############################################################################### |
27 | | -# This section is exactly the same as MPI version. We will keep using MPI for |
28 | | -# transfering meta data i.e., shapes, dims, etc. |
| 25 | +# Let's start by defining all the parameters required by the |
| 26 | +# :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator. |
| 27 | +# Note that this section is exactly the same as the one in the MPI example, as we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.). |
29 | 28 |
|
30 | 29 | # Model |
31 | 30 | model = np.load("../testdata/avo/poststack_model.npz") |
|
59 | 58 | ############################################################################### |
60 | 59 | # NCCL communication can be easily initialized with the |
61 | 60 | # :py:func:`pylops_mpi.utils._nccl.initialize_nccl_comm` function. |
62 | | -# One can think of this as GPU-counterpart of MPI.COMM_WORLD |
| 61 | +# One can think of this as the GPU counterpart of :code:`MPI.COMM_WORLD`. |
| 62 | + |
63 | 63 | nccl_comm = pylops_mpi.utils._nccl.initialize_nccl_comm() |
64 | 64 |
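###############################################################################
# As a hypothetical check (assuming one GPU per MPI rank), one could verify
# which device each rank is using once the NCCL communicator has been created

print(f"Rank {MPI.COMM_WORLD.Get_rank()} is using GPU {cp.cuda.runtime.getDevice()}")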
|
| 65 | +############################################################################### |
| 66 | +# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` objects. |
| 67 | +# Compared to the MPI tutorial, we must make sure to pass :code:`base_comm_nccl=nccl_comm` and set CuPy as the engine. |
65 | 68 |
|
66 | | -# Initialize DistributedArray with `base_comm_nccl = nccl_comm` and CuPy engine |
67 | | -# This DistributedArray internally have both MPI.COMM_WORLD (by default) and `cupy.cuda.nccl.NcclCommunicator` to operate on |
68 | 69 | m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy") |
69 | 70 | m3d_dist[:] = cp.asarray(m3d_i.flatten()) |
70 | 71 |
|
71 | 72 | # Do the same thing for smooth model |
72 | 73 | mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy") |
73 | 74 | mback3d_dist[:] = cp.asarray(mback3d_i.flatten()) |
74 | 75 |
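###############################################################################
# A minimal sanity check (purely illustrative, not required by the tutorial):
# the local portion of a DistributedArray created with :code:`engine="cupy"`
# is a CuPy array living on this rank's GPU

assert isinstance(m3d_dist.local_array, cp.ndarray)
assert isinstance(mback3d_dist.local_array, cp.ndarray)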
|
| 76 | +############################################################################### |
75 | 77 | # For PostStackLinearModelling, no change is needed to have it run with NCCL. |
76 | 78 | # This PyLops operator has GPU support (https://pylops.readthedocs.io/en/stable/gpu.html) |
77 | 79 | # so it can run with a DistributedArray whose engine is CuPy. |
| 80 | + |
78 | 81 | PPop = PoststackLinearModelling(wav, nt0=nz, spatdims=(ny_i, nx)) |
79 | 82 | Top = Transpose((ny_i, nx, nz), (2, 0, 1)) |
80 | 83 | BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ]) |
81 | 84 |
|
82 | | -# This computation will be done in GPU. The call asarray() trigger the NCCL communication (gather result from each GPU). |
83 | | -# But array `d` and `d_0` still live in GPU memory |
| 85 | +############################################################################### |
| 86 | +# This computation will be done on the GPU. The call to :code:`asarray()` triggers the NCCL communication (gathering the result from each GPU). |
| 87 | +# However, the arrays :code:`d` and :code:`d_0` still live in GPU memory. |
| 88 | + |
84 | 89 | d_dist = BDiag @ m3d_dist |
85 | 90 | d_local = d_dist.local_array.reshape((ny_i, nx, nz)) |
86 | 91 | d = d_dist.asarray().reshape((ny, nx, nz)) |
87 | 92 | d_0_dist = BDiag @ mback3d_dist |
88 | 93 | d_0 = d_0_dist.asarray().reshape((ny, nx, nz)) |
89 | 94 |
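###############################################################################
# As a hypothetical illustration, if the modelled data were needed on the host
# (e.g., to save it to disk), it could be copied back explicitly with CuPy;
# otherwise it simply stays on the GPU

d_host = cp.asnumpy(d)  # equivalent to d.get()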
|
90 | | -# ############################################################################### |
91 | | - |
| 95 | +############################################################################### |
92 | 96 | # Inversion using CGLS solver - no code change is needed to run with NCCL (this is handled through the MPI operator and the DistributedArray). |
93 | 97 | # In this particular case, the local computation will be done on the GPU, while collective communication calls |
94 | 98 | # will be carried out by NCCL GPU-to-GPU. |
| 99 | + |
| 100 | +# Inversion using CGLS solver |
95 | 101 | minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, x0=mback3d_dist, niter=1, show=True)[0] |
96 | 102 | minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz)) |
97 | 103 |
|
98 | | -# ############################################################################### |
| 104 | +############################################################################### |
99 | 105 |
|
100 | 106 | # Regularized inversion with normal equations |
101 | 107 | epsR = 1e2 |
|
106 | 112 | minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, x0=mback3d_dist, niter=10, show=True)[0] |
107 | 113 | minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz)) |
108 | 114 |
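###############################################################################
# The construction of :code:`LapOp`, :code:`NormEqOp` and :code:`dnorm_dist`
# is not shown in this diff hunk. A hedged sketch, assuming the same approach
# as the CPU/MPI poststack tutorial (names and parameters here are
# illustrative, not the verbatim elided lines):

LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2),
                                weights=(1, 1, 1), sampling=(1, 1, 1),
                                dtype=BDiag.dtype)
NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp
dnorm_dist = BDiag.H @ d_dist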
|
109 | | -# ############################################################################### |
| 115 | +############################################################################### |
110 | 116 |
|
111 | 117 | # Regularized inversion with regularized equations |
112 | 118 | StackOp = pylops_mpi.MPIStackedVStack([BDiag, np.sqrt(epsR) * LapOp]) |
|
118 | 124 | minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, x0=mback3d_dist, niter=10, show=True)[0] |
119 | 125 | minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz)) |
120 | 126 |
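###############################################################################
# The stacked data vector :code:`dstack_dist` used above is also not shown in
# this diff hunk. A hedged sketch, following the same pattern as the CPU/MPI
# tutorial: the observed data is stacked with a zero DistributedArray matching
# the Laplacian output (the name :code:`dlap_dist` is illustrative)

dlap_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
                                        base_comm_nccl=nccl_comm, engine="cupy")
dlap_dist[:] = cp.zeros(dlap_dist.local_array.shape)
dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, dlap_dist])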
|
121 | | -# ############################################################################### |
122 | | -# To plot the inversion result, the array must be copied back to cpu via `get()` |
| 127 | +############################################################################### |
| 128 | +# To plot the inversion results, the arrays must be copied back to the CPU via :code:`get()`. |
123 | 129 |
|
124 | 130 | if rank == 0: |
125 | 131 | # Check that the distributed implementation gives the same result |
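###############################################################################
# A hypothetical example of copying one of the inverted models back to the
# host with :code:`get()` before plotting it, as described above

if rank == 0:
    minv3d_iter_cpu = minv3d_iter.get()  # cupy.ndarray -> numpy.ndarray
    plt.figure(figsize=(6, 4))
    plt.imshow(minv3d_iter_cpu[:, :, nz // 2].T, cmap="gist_rainbow")
    plt.title("Inverted model (CGLS), depth slice")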
|