Commit da326a0

Merge pull request #146 from mrava87/doc-cupy
doc: added tutorials with cupy+mpi
2 parents 83c336a + 6f68c0b commit da326a0

7 files changed: +749 -36 lines changed


Makefile

Lines changed: 17 additions & 3 deletions
@@ -2,7 +2,10 @@ PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null)
 PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null)
 NUM_PROCESSES = 3
 
-.PHONY: install dev-install dev-install_nccl install_conda install_conda_nccl dev-install_conda dev-install_conda_nccl tests tests_nccl doc docupdate run_examples run_tutorials
+.PHONY: install dev-install dev-install_nccl install_ \
+conda install_conda_nccl dev-install_conda dev-install_conda_nccl \
+tests tests_nccl doc doc_cupy doc_nccl docupdate \
+run_examples run_tutorials run_tutorials_cupy run_tutorials_nccl
 
 pipcheck:
 ifndef PIP
@@ -55,12 +58,19 @@ doc:
 	rm -rf source/tutorials && rm -rf build &&\
 	cd .. && sphinx-build -b html docs/source docs/build
 
+doc_cupy:
+	cp tutorials_cupy/* tutorials/
+	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
+	rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
+	cd .. && sphinx-build -b html docs/source docs/build
+	rm tutorials/*_cupy.py
+
 doc_nccl:
-	cp tutorials_nccl/* tutorials/
+	cp tutorials_cupy/* tutorials_nccl/* tutorials/
 	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
 	rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
 	cd .. && sphinx-build -b html docs/source docs/build
-	rm tutorials/*_nccl.py
+	rm tutorials/*_cupy.py tutorials/*_nccl.py
 
 docupdate:
 	cd docs && make html && cd ..
@@ -76,6 +86,10 @@ run_examples:
 run_tutorials:
 	sh mpi_examples.sh tutorials $(NUM_PROCESSES)
 
+# Run tutorials using mpi with cupy arrays
+run_tutorials_cupy:
+	sh mpi_examples.sh tutorials_cupy $(NUM_PROCESSES)
+
 # Run tutorials using nccl
 run_tutorials_nccl:
 	sh mpi_examples.sh tutorials_nccl $(NUM_PROCESSES)

tutorials/lsm.py

Lines changed: 2 additions & 2 deletions
@@ -152,8 +152,8 @@
 d = d_dist.asarray().reshape((nstot, nr, nt))
 
 ###############################################################################
-
-# Adjoint
+# We now calculate the adjoint and model the data using the adjoint reflectivity
+# as input.
 madj_dist = VStack.H @ d_dist
 madj = madj_dist.asarray().reshape((nx, nz))
 d_adj_dist = VStack @ madj_dist

tutorials/mdd.py

Lines changed: 7 additions & 7 deletions
@@ -37,16 +37,16 @@
 # Input parameters
 par = {
     "ox": -300,
-    "dx": 10,
-    "nx": 61,
+    "dx": 5,
+    "nx": 121,
     "oy": -500,
-    "dy": 10,
-    "ny": 101,
+    "dy": 5,
+    "ny": 201,
     "ot": 0,
-    "dt": 0.004,
-    "nt": 400,
+    "dt": 0.002,
+    "nt": 800,
     "f0": 20,
-    "nfmax": 200,
+    "nfmax": 400,
 }
 
 t0_m = 0.2

tutorials_cupy/lsm_cupy.py

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@ (new file; full contents shown below)
r"""
Least-squares Migration with CUDA-Aware MPI
===========================================
This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs
communicating via MPI.
"""

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import cupy as cp
from matplotlib import pyplot as plt
from mpi4py import MPI

from pylops.utils.wavelets import ricker
from pylops.waveeqprocessing.lsm import LSM

import pylops_mpi

###############################################################################
# The standard MPI communicator is used in this example, so there is no need
# for any initialization. However, we need to assign our GPU resources to the
# different ranks. Here we decide to assign a unique GPU to each process if
# the number of ranks is equal to or smaller than the number of GPUs;
# otherwise, multiple ranks will share the available GPUs. Note that this
# approach will work equally well if we have a multi-node multi-GPU setup,
# where each node has one or more GPUs.

np.random.seed(42)
plt.close("all")
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
device_count = cp.cuda.runtime.getDeviceCount()
cp.cuda.Device(rank % device_count).use()

###############################################################################
# Let's start by defining all the parameters required by the
# :py:class:`pylops.waveeqprocessing.LSM` operator.
# Note that this section is exactly the same as the one in the MPI example
# as we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.).

# Velocity Model
nx, nz = 81, 60
dx, dz = 4, 4
x, z = np.arange(nx) * dx, np.arange(nz) * dz
v0 = 1000  # initial velocity
kv = 0.0  # gradient
vel = np.outer(np.ones(nx), v0 + kv * z)

# Reflectivity Model
refl = np.zeros((nx, nz), dtype=np.float32)
refl[:, 30] = -1
refl[:, 50] = 0.5

# Receivers
nr = 11
rx = np.linspace(10 * dx, (nx - 10) * dx, nr)
rz = 20 * np.ones(nr)
recs = np.vstack((rx, rz))

# Sources
ns = 10
# Total number of sources at all ranks
nstot = MPI.COMM_WORLD.allreduce(ns, op=MPI.SUM)
sxtot = np.linspace(dx * 10, (nx - 10) * dx, nstot)
sx = sxtot[rank * ns: (rank + 1) * ns]
sztot = 10 * np.ones(nstot)
sz = 10 * np.ones(ns)
sources = np.vstack((sx, sz))
sources_tot = np.vstack((sxtot, sztot))

if rank == 0:
    plt.figure(figsize=(10, 5))
    im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0]))
    plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
    plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
                edgecolors="k")
    cb = plt.colorbar(im)
    cb.set_label("[m/s]")
    plt.axis("tight")
    plt.xlabel("x [m]"), plt.ylabel("z [m]")
    plt.title("Velocity")
    plt.xlim(x[0], x[-1])
    plt.tight_layout()

    plt.figure(figsize=(10, 5))
    im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0]))
    plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
    plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
                edgecolors="k")
    plt.colorbar(im)
    plt.axis("tight")
    plt.xlabel("x [m]"), plt.ylabel("z [m]")
    plt.title("Reflectivity")
    plt.xlim(x[0], x[-1])
    plt.tight_layout()

###############################################################################
# We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM`
# operator and initialize the :py:class:`pylops_mpi.DistributedArray`
# reflectivity object. Compared to the MPI tutorial, we need to make sure that
# we set ``cupy`` as the engine and fill the distributed arrays with CuPy arrays.

# Wavelet
nt = 651
dt = 0.004
t = np.arange(nt) * dt
wav, wavt, wavc = ricker(t[:41], f0=20)

lsm = LSM(
    z,
    x,
    t,
    sources,
    recs,
    v0,
    cp.asarray(wav.astype(np.float32)),
    wavc,
    mode="analytic",
    engine="cuda",
    dtype=np.float32
)
lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs.astype(np.float32))
lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs.astype(np.float32))

VStack = pylops_mpi.MPIVStack(ops=[lsm.Demop, ])
refl_dist = pylops_mpi.DistributedArray(global_shape=nx * nz,
                                        partition=pylops_mpi.Partition.BROADCAST,
                                        engine="cupy")
refl_dist[:] = cp.asarray(refl.flatten())
d_dist = VStack @ refl_dist
d = d_dist.asarray().reshape((nstot, nr, nt))

###############################################################################
# We now calculate the adjoint and the inverse using the
# :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
# is required to run on CUDA-aware MPI (this is handled by the MPI operator
# and DistributedArray). In this particular case, the local computation will
# be done on the GPU, and collective communication calls will be carried out
# through MPI GPU-to-GPU.

# Adjoint
madj_dist = VStack.H @ d_dist
madj = madj_dist.asarray().reshape((nx, nz))
d_adj_dist = VStack @ madj_dist
d_adj = d_adj_dist.asarray().reshape((nstot, nr, nt))

# Inverse
# Initializing x0 to zeroes
x0 = pylops_mpi.DistributedArray(VStack.shape[1],
                                 partition=pylops_mpi.Partition.BROADCAST,
                                 engine="cupy")
x0[:] = 0
minv_dist = pylops_mpi.cgls(VStack, d_dist, x0=x0, niter=100, show=True)[0]
minv = minv_dist.asarray().reshape((nx, nz))
d_inv_dist = VStack @ minv_dist
d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)

###############################################################################
# Finally we visualize the results. Note that the arrays must be copied back
# to the CPU by calling the :code:`get()` method on the CuPy arrays.

if rank == 0:
    # Visualize
    fig1, axs = plt.subplots(1, 3, figsize=(10, 3))
    axs[0].imshow(refl.T, cmap="gray", vmin=-1, vmax=1)
    axs[0].axis("tight")
    axs[0].set_title(r"$m$")
    axs[1].imshow(madj.T.get(), cmap="gray", vmin=-madj.max(), vmax=madj.max())
    axs[1].set_title(r"$m_{adj}$")
    axs[1].axis("tight")
    axs[2].imshow(minv.T.get(), cmap="gray", vmin=-1, vmax=1)
    axs[2].axis("tight")
    axs[2].set_title(r"$m_{inv}$")
    plt.tight_layout()

    fig2, axs = plt.subplots(1, 3, figsize=(10, 3))
    axs[0].imshow(d[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[0].set_title(r"$d$")
    axs[0].axis("tight")
    axs[1].imshow(d_adj[0, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
    axs[1].set_title(r"$d_{adj}$")
    axs[1].axis("tight")
    axs[2].imshow(d_inv[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[2].set_title(r"$d_{inv}$")
    axs[2].axis("tight")

    fig3, axs = plt.subplots(1, 3, figsize=(10, 3))
    axs[0].imshow(d[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[0].set_title(r"$d$")
    axs[0].axis("tight")
    axs[1].imshow(d_adj[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
    axs[1].set_title(r"$d_{adj}$")
    axs[1].axis("tight")
    axs[2].imshow(d_inv[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[2].set_title(r"$d_{inv}$")
    axs[2].axis("tight")
    plt.tight_layout()
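
The tutorial above states that, with CUDA-aware MPI, local computations stay on the GPU and the collective communication triggered by the MPI operator and DistributedArray moves data GPU-to-GPU. As a point of reference (not part of this commit), the following standalone sketch illustrates that mechanism with a plain mpi4py Allreduce on CuPy buffers; it assumes a CUDA-aware MPI build, mpi4py >= 3.1 and CuPy installed, and the script name in the comment is hypothetical.

# cuda_aware_allreduce_sketch.py -- illustrative only, not part of the commit.
# Run with e.g.: mpiexec -n 2 python cuda_aware_allreduce_sketch.py
import cupy as cp
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Same rank-to-GPU mapping used at the top of lsm_cupy.py: ranks cycle over
# the visible devices, so multiple ranks share a GPU when there are more
# ranks than devices.
cp.cuda.Device(rank % cp.cuda.runtime.getDeviceCount()).use()

# Each rank fills a buffer directly on its GPU ...
sendbuf = cp.full(5, rank, dtype=cp.float32)
recvbuf = cp.empty_like(sendbuf)

# ... and the buffer-based (uppercase) mpi4py call reduces the CuPy arrays
# without an explicit device-to-host copy: with a CUDA-aware MPI library the
# transfer happens GPU-to-GPU, which is what the DistributedArray collectives
# in the tutorial rely on.
comm.Allreduce(sendbuf, recvbuf, op=MPI.SUM)

if rank == 0:
    print(recvbuf)  # each entry holds the sum of all rank ids

Because the mapping only depends on the locally visible device count, the same pattern carries over to the multi-node multi-GPU setups mentioned in the tutorial.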
