
Commit 6f68c0b

minor: finalized cupy tutorials
1 parent 8c680b5 commit 6f68c0b

File tree

4 files changed: +140 -64 lines changed


tutorials_cupy/lsm_cupy.py

Lines changed: 23 additions & 9 deletions
@@ -2,8 +2,8 @@
 Least-squares Migration with CUDA-Aware MPI
 ===========================================
 This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
-tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
-CUDA-Aware MPI.
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs
+communicating via MPI.
 """

 import warnings
@@ -19,16 +19,25 @@

 import pylops_mpi

+###############################################################################
+# The standard MPI communicator is used in this example, so there is no need
+# for any initialization. However, we need to assign our GPU resources to the
+# different ranks. Here we decide to assign a unique GPU to each process if
+# the number of ranks is equal or smaller than that of the GPUs. Otherwise we
+# start assigning more than one GPU to the available ranks. Note that this
+# approach will work equally well if we have a multi-node multi-GPU setup, where
+# each node has one or more GPUs.
+
 np.random.seed(42)
 plt.close("all")
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
 device_count = cp.cuda.runtime.getDeviceCount()
-cp.cuda.Device(rank % device_count).use()
+cp.cuda.Device(rank % device_count).use();

 ###############################################################################
-# Let's start with a simple model with two interfaces, where sources are
-# distributed across different ranks.
+# Let's start by defining all the parameters required by the
+# :py:class:`pylops.waveeqprocessing.LSM` operator.
 # Note that this section is exactly the same as the one in the MPI example
 # as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.)

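All three tutorials touched by this commit rely on the same round-robin device selection introduced in the hunk above. A minimal standalone sketch of the pattern, assuming mpi4py and CuPy are installed and at least one GPU is visible:

    from mpi4py import MPI
    import cupy as cp

    rank = MPI.COMM_WORLD.Get_rank()
    device_count = cp.cuda.runtime.getDeviceCount()
    # Round-robin assignment: rank r uses GPU (r % device_count), so GPUs are
    # shared whenever there are more ranks than devices.
    cp.cuda.Device(rank % device_count).use()
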
@@ -66,7 +75,8 @@
 plt.figure(figsize=(10, 5))
 im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0]))
 plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
-plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
+plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
+            edgecolors="k")
 cb = plt.colorbar(im)
 cb.set_label("[m/s]")
 plt.axis("tight")
@@ -78,7 +88,8 @@
 plt.figure(figsize=(10, 5))
 im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0]))
 plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
-plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
+plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
+            edgecolors="k")
 plt.colorbar(im)
 plt.axis("tight")
 plt.xlabel("x [m]"), plt.ylabel("z [m]")
@@ -90,7 +101,7 @@
 # We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM`
 # operator and initialize the :py:class:`pylops_mpi.DistributedArray`
 # reflecitivity object. Compared to the MPI tutorial, we need to make sure that
-# we set CuPy as the engine and use CuPy arrays
+# we set ``cupy`` as the engine and fill the distributed arrays with CuPy arrays.

 # Wavelet
 nt = 651
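The comment above refers to the pattern, used throughout these tutorials, of creating a pylops_mpi.DistributedArray with the ``cupy`` engine and filling its local portion with a CuPy array. A minimal sketch of that pattern (the global size and fill values are illustrative, not taken from the tutorial):

    import cupy as cp
    import pylops_mpi

    n = 100
    x_dist = pylops_mpi.DistributedArray(global_shape=n, engine="cupy")
    # each rank fills the portion it owns with a CuPy array of matching shape
    x_dist[:] = cp.ones(x_dist.local_array.shape)
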
@@ -126,7 +137,7 @@
 # We calculate now the adjoint and the inverse using the
 # :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
 # is required to run on CUDA-aware
-# MPI (this is handled through MPI operator and DistributedArray)
+# MPI (this is handled by the MPI operator and DistributedArray)
 # In this particular case, the local computation will be done in GPU.
 # Collective communication calls will be carried through MPI GPU-to-GPU.

@@ -148,6 +159,9 @@
 d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)

 ###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
+
 if rank == 0:
     # Visualize
     fig1, axs = plt.subplots(1, 3, figsize=(10, 3))
tutorials_cupy/mdd_cupy.py

Lines changed: 29 additions & 10 deletions
@@ -2,8 +2,8 @@
 Multi-Dimensional Deconvolution with CUDA-Aware MPI
 ===================================================
 This tutorial is an extension of the :ref:`sphx_glr_tutorials_mdd.py`
-tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
-CUDA-Aware MPI.
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs communicating
+via MPI.
 """

 import numpy as np
@@ -18,17 +18,28 @@
 import pylops_mpi
 from pylops_mpi.DistributedArray import local_split, Partition

+###############################################################################
+# The standard MPI communicator is used in this example, so there is no need
+# for any initialization. However, we need to assign our GPU resources to the
+# different ranks. Here we decide to assign a unique GPU to each process if
+# the number of ranks is equal or smaller than that of the GPUs. Otherwise we
+# start assigning more than one GPU to the available ranks. Note that this
+# approach will work equally well if we have a multi-node multi-GPU setup, where
+# each node has one or more GPUs.
+
 plt.close("all")
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
 dtype = np.float32
 cdtype = np.complex64
 device_count = cp.cuda.runtime.getDeviceCount()
-cp.cuda.Device(rank % device_count).use()
+cp.cuda.Device(rank % device_count).use();

 ###############################################################################
-# Let's start by creating a set of hyperbolic events to be used as
-# our MDC kernel as well as the model
+# Let's start by defining all the parameters required by the
+# :py:func:`pylops.waveeqprocessing.MPIMDC` operator.
+# Note that this section is exactly the same as the one in the MPI example as
+# we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.)

 # Input parameters
 par = {
@@ -93,13 +104,15 @@
 ifin_rank = np.insert(np.cumsum(nf_ranks)[:-1], 0, 0)[rank]
 ifend_rank = np.cumsum(nf_ranks)[rank]

-# Extract batch of frequency slices (in practice, this will be directly read from input file)
+# Extract batch of frequency slices (in practice, this will be directly
+# read from input file)
 G = Gwav_fft[ifin_rank:ifend_rank].astype(cdtype)

 ###############################################################################
-# Let's now define the distributed operator and model as well as compute the
-# data. Compared to the MPI tutorial, we need to make sure that we set CuPy
-# as the engine and use CuPy arrays
+# For MPIMDCOperator, there is no change needed to have it run with
+# MPI. This PyLops operator has GPU-support
+# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on
+# distributed arrays with engine set to CuPy.

 # Move operator kernel to GPU
 G = cp.asarray(G)
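The hunk above keeps the frequency-axis splitting of the kernel from the CPU tutorial and simply moves each rank's local batch to its GPU with cp.asarray. A rough standalone sketch of that idea (np.array_split stands in for the tutorial's own splitting logic, and the kernel shape and values are made up for illustration):

    import numpy as np
    import cupy as cp
    from mpi4py import MPI

    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    nfmax = 128                                  # illustrative number of frequencies
    nf_rank = np.array_split(np.arange(nfmax), size)[rank]
    G_local = np.random.randn(len(nf_rank), 4, 4).astype(np.complex64)
    G_local = cp.asarray(G_local)                # move this rank's kernel batch to its GPU
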
@@ -182,7 +195,9 @@
 # We are now ready to compute the adjoint (i.e., cross-correlation) and invert
 # back for our input model. This computation will be done in GPU. The call
 # :code:`asarray()` triggers CUDA-aware MPI communication (gather result
-# from each GPU). But array :code:`madjloc` still live in GPU memory
+# from each GPU). Note that the arrays :code:`madjloc` and :code:`minvloc`
+# still live in GPU memory.
+

 # Adjoint
 madj = MDCop.H @ d
@@ -196,6 +211,10 @@
 minv = pylops_mpi.cgls(MDCop, d, x0=m0, niter=50, show=True if rank == 0 else False)[0]
 minvloc = minv.asarray().real.reshape(2 * par["nt"] - 1, par["nx"])

+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
+
 if rank == 0:
     fig = plt.figure(figsize=(8, 6))
     ax1 = plt.subplot2grid((1, 5), (0, 0), colspan=2)

tutorials_cupy/poststack_cupy.py

Lines changed: 45 additions & 23 deletions
@@ -2,8 +2,8 @@
 Post Stack Inversion - 3D with CUDA-Aware MPI
 =============================================
 This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py`
-tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
-CUDA-Aware MPI.
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs communicating
+via MPI.
 """

 import numpy as np
@@ -18,15 +18,25 @@

 import pylops_mpi

+###############################################################################
+# The standard MPI communicator is used in this example, so there is no need
+# for any initialization. However, we need to assign our GPU resources to the
+# different ranks. Here we decide to assign a unique GPU to each process if
+# the number of ranks is equal or smaller than that of the GPUs. Otherwise we
+# start assigning more than one GPU to the available ranks. Note that this
+# approach will work equally well if we have a multi-node multi-GPU setup, where
+# each node has one or more GPUs.
+
 plt.close("all")
 rank = MPI.COMM_WORLD.Get_rank()
 device_count = cp.cuda.runtime.getDeviceCount()
-cp.cuda.Device(rank % device_count).use()
+cp.cuda.Device(rank % device_count).use();

 ###############################################################################
 # Let's start by defining all the parameters required by the
 # :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator.
-# Note that this section is exactly the same as the one in the MPI example as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.)
+# Note that this section is exactly the same as the one in the MPI example as
+# we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.)

 # Model
 model = np.load("../testdata/avo/poststack_model.npz")
@@ -58,31 +68,34 @@
 mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i))

 ###############################################################################
-# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` objects.
-# Compared to the MPI tutorial, we need to make sure that we set CuPy as the engine and
-# use CuPy arrays
+# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray`
+# objects. Compared to the MPI tutorial, we need to make sure that we set ``cupy``
+# as the engine and fill the distributed arrays with CuPy arrays.

-m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                       engine="cupy")
 m3d_dist[:] = cp.asarray(m3d_i.flatten())

 # Do the same thing for smooth model
-mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                           engine="cupy")
 mback3d_dist[:] = cp.asarray(mback3d_i.flatten())

 ###############################################################################
-# For PostStackLinearModelling, there is no change needed to have it run with CUDA-Aware MPI.
-# This PyLops operator has GPU-support (https://pylops.readthedocs.io/en/stable/gpu.html)
-# so it can run with DistributedArray whose engine is Cupy
+# For PostStackLinearModelling, there is no change needed to have it run with
+# MPI. This PyLops operator has GPU-support
+# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on
+# distributed arrays with engine set to CuPy.

 PPop = PoststackLinearModelling(cp.asarray(wav.astype(np.float32)), nt0=nz,
                                 spatdims=(ny_i, nx))
 Top = Transpose((ny_i, nx, nz), (2, 0, 1))
 BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ])

 ###############################################################################
-# This computation will be done in GPU. The call :code:`asarray()` triggers the CUDA-aware
-# MPI communication (gather result from each GPU).
-# But array :code:`d` and :code:`d_0` still live in GPU memory
+# This computation will be done on the GPU(s). The call :code:`asarray()`
+# triggers the MPI communication (gather results from each GPU).
+# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.

 d_dist = BDiag @ m3d_dist
 d_local = d_dist.local_array.reshape((ny_i, nx, nz))
@@ -91,24 +104,30 @@
 d_0 = d_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################
-# Inversion using CGLS solver - There is no code change to run on CUDA-aware
-# MPI (this is handled through MPI operator and DistributedArray)
+# Inversion using CGLS solver - no code change is required to run the solver
+# with CUDA-aware MPI (this is handled by the MPI operator and DistributedArray)
 # In this particular case, the local computation will be done in GPU.
 # Collective communication calls will be carried through MPI GPU-to-GPU.

 # Inversion using CGLS solver
-minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, x0=mback3d_dist, niter=100, show=True)[0]
+minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist,
+                                                      x0=mback3d_dist,
+                                                      niter=100, show=True)[0]
 minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################

 # Regularized inversion with normal equations
 epsR = 1e2
-LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), weights=(1, 1, 1),
-                                sampling=(1, 1, 1), dtype=BDiag.dtype)
+LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2),
+                                weights=(1, 1, 1),
+                                sampling=(1, 1, 1),
+                                dtype=BDiag.dtype)
 NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp
 dnorm_dist = BDiag.H @ d_dist
-minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, x0=mback3d_dist, niter=100, show=True)[0]
+minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist,
+                                                  x0=mback3d_dist,
+                                                  niter=100, show=True)[0]
 minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################
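For reference, the cg call in the hunk above solves the regularized normal equations assembled just before it; in the tutorial's own variable names this amounts to solving

    (BDiag.H @ BDiag + epsR * LapOp.H @ LapOp) x = BDiag.H @ d_dist

i.e. a least-squares inversion with Laplacian smoothing of weight epsR applied to the model.
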
@@ -120,11 +139,14 @@
 dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist])

 dnorm_dist = BDiag.H @ d_dist
-minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, x0=mback3d_dist, niter=100, show=True)[0]
+minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist,
+                                                     x0=mback3d_dist,
+                                                     niter=100, show=True)[0]
 minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################
-# To plot the inversion results, the array must be copied back to cpu via :code:`get()`
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.

 if rank == 0:
     # Check the distributed implementation gives the same result
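As with their CPU counterparts, the three scripts above are meant to be launched with an MPI launcher (for example something like mpiexec -n 2 python poststack_cupy.py; the exact launcher and rank count depend on the local MPI installation and on how many GPUs are available), with the additional requirement that the MPI build is CUDA-aware so that the GPU-to-GPU communication described in the comments can actually take place.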
