20 changes: 17 additions & 3 deletions Makefile
@@ -2,7 +2,10 @@ PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null)
PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null)
NUM_PROCESSES = 3

.PHONY: install dev-install dev-install_nccl install_conda install_conda_nccl dev-install_conda dev-install_conda_nccl tests tests_nccl doc docupdate run_examples run_tutorials
.PHONY: install dev-install dev-install_nccl install_conda install_conda_nccl dev-install_conda dev-install_conda_nccl tests tests_nccl doc doc_cupy doc_nccl docupdate run_examples run_tutorials run_tutorials_cupy run_tutorials_nccl

pipcheck:
ifndef PIP
@@ -55,12 +58,19 @@ doc:
rm -rf source/tutorials && rm -rf build &&\
cd .. && sphinx-build -b html docs/source docs/build

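# Build the documentation including the CuPy tutorials: copy them into
# tutorials/ for the build and remove the copies afterwards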
doc_cupy:
cp tutorials_cupy/* tutorials/
cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
rm -rf source/tutorials && rm -rf build &&\
cd .. && sphinx-build -b html docs/source docs/build
rm tutorials/*_cupy.py

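# Build the documentation including the CuPy and NCCL tutorials (same
# copy/build/clean-up pattern as doc_cupy)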
doc_nccl:
cp tutorials_nccl/* tutorials/
cp tutorials_cupy/* tutorials_nccl/* tutorials/
cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
rm -rf source/tutorials && rm -rf build &&\
cd .. && sphinx-build -b html docs/source docs/build
rm tutorials/*_nccl.py
rm tutorials/*_cupy.py tutorials/*_nccl.py

docupdate:
cd docs && make html && cd ..
@@ -76,6 +86,10 @@ run_examples:
run_tutorials:
sh mpi_examples.sh tutorials $(NUM_PROCESSES)

# Run tutorials using mpi with cupy arrays
run_tutorials_cupy:
sh mpi_examples.sh tutorials_cupy $(NUM_PROCESSES)
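# NUM_PROCESSES can be overridden on the command line, e.g.:
#   make run_tutorials_cupy NUM_PROCESSES=4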

# Run tutorials using nccl
run_tutorials_nccl:
sh mpi_examples.sh tutorials_nccl $(NUM_PROCESSES)
4 changes: 2 additions & 2 deletions tutorials/lsm.py
@@ -152,8 +152,8 @@
d = d_dist.asarray().reshape((nstot, nr, nt))

###############################################################################

# Adjoint
# We now calculate the adjoint and model the data using the adjoint reflectivity
# as input.
madj_dist = VStack.H @ d_dist
madj = madj_dist.asarray().reshape((nx, nz))
d_adj_dist = VStack @ madj_dist
14 changes: 7 additions & 7 deletions tutorials/mdd.py
@@ -37,16 +37,16 @@
# Input parameters
par = {
"ox": -300,
"dx": 10,
"nx": 61,
"dx": 5,
"nx": 121,
"oy": -500,
"dy": 10,
"ny": 101,
"dy": 5,
"ny": 201,
"ot": 0,
"dt": 0.004,
"nt": 400,
"dt": 0.002,
"nt": 800,
"f0": 20,
"nfmax": 200,
"nfmax": 400,
}

t0_m = 0.2
200 changes: 200 additions & 0 deletions tutorials_cupy/lsm_cupy.py
@@ -0,0 +1,200 @@
r"""
Least-squares Migration with CUDA-Aware MPI
===========================================
This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
tutorial, where PyLops-MPI is run in a multi-GPU setting with the GPUs
communicating via MPI.
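
Run it, for example, as ``mpirun -n 2 python lsm_cupy.py``; this assumes a
CUDA-aware MPI installation (so that device buffers can be passed directly
to MPI calls) and at least one CUDA-capable GPU visible to the ranks.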
"""

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import cupy as cp
from matplotlib import pyplot as plt
from mpi4py import MPI

from pylops.utils.wavelets import ricker
from pylops.waveeqprocessing.lsm import LSM

import pylops_mpi

###############################################################################
# The standard MPI communicator is used in this example, so there is no need
# for any initialization. However, we need to assign our GPU resources to the
# different ranks. Here we assign a unique GPU to each process if the number
# of ranks is equal to or smaller than the number of available GPUs; otherwise,
# multiple ranks share the available GPUs in a round-robin fashion. Note that
# this approach works equally well in a multi-node multi-GPU setup, where each
# node has one or more GPUs.

np.random.seed(42)
plt.close("all")
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
device_count = cp.cuda.runtime.getDeviceCount()
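# Assign one GPU per rank; if there are more ranks than GPUs, ranks share
# devices in a round-robin fashion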
cp.cuda.Device(rank % device_count).use();

###############################################################################
# Let's start by defining all the parameters required by the
# :py:class:`pylops.waveeqprocessing.LSM` operator.
# Note that this section is exactly the same as the one in the MPI example,
# as we keep using MPI for transferring metadata (i.e., shapes, dims, etc.).

# Velocity Model
nx, nz = 81, 60
dx, dz = 4, 4
x, z = np.arange(nx) * dx, np.arange(nz) * dz
v0 = 1000 # initial velocity
kv = 0.0 # gradient
vel = np.outer(np.ones(nx), v0 + kv * z)

# Reflectivity Model
refl = np.zeros((nx, nz), dtype=np.float32)
refl[:, 30] = -1
refl[:, 50] = 0.5

# Receivers
nr = 11
rx = np.linspace(10 * dx, (nx - 10) * dx, nr)
rz = 20 * np.ones(nr)
recs = np.vstack((rx, rz))

# Sources
ns = 10
# Total number of sources at all ranks
nstot = MPI.COMM_WORLD.allreduce(ns, op=MPI.SUM)
sxtot = np.linspace(dx * 10, (nx - 10) * dx, nstot)
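# Select the subset of sources handled by this rank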
sx = sxtot[rank * ns: (rank + 1) * ns]
sztot = 10 * np.ones(nstot)
sz = 10 * np.ones(ns)
sources = np.vstack((sx, sz))
sources_tot = np.vstack((sxtot, sztot))

if rank == 0:
plt.figure(figsize=(10, 5))
im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0]))
plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
edgecolors="k")
cb = plt.colorbar(im)
cb.set_label("[m/s]")
plt.axis("tight")
plt.xlabel("x [m]"), plt.ylabel("z [m]")
plt.title("Velocity")
plt.xlim(x[0], x[-1])
plt.tight_layout()

plt.figure(figsize=(10, 5))
im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0]))
plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
edgecolors="k")
plt.colorbar(im)
plt.axis("tight")
plt.xlabel("x [m]"), plt.ylabel("z [m]")
plt.title("Reflectivity")
plt.xlim(x[0], x[-1])
plt.tight_layout()

###############################################################################
# We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM`
# operator and initialize the :py:class:`pylops_mpi.DistributedArray`
# reflectivity object. Compared to the MPI tutorial, we need to make sure that
# we set ``cupy`` as the engine and fill the distributed arrays with CuPy arrays.

# Wavelet
nt = 651
dt = 0.004
t = np.arange(nt) * dt
wav, wavt, wavc = ricker(t[:41], f0=20)

lsm = LSM(
z,
x,
t,
sources,
recs,
v0,
cp.asarray(wav.astype(np.float32)),
wavc,
mode="analytic",
engine="cuda",
dtype=np.float32
)
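# Move the traveltime tables used by the demigration operator onto the GPU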
lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs.astype(np.float32))
lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs.astype(np.float32))

VStack = pylops_mpi.MPIVStack(ops=[lsm.Demop, ])
refl_dist = pylops_mpi.DistributedArray(global_shape=nx * nz,
partition=pylops_mpi.Partition.BROADCAST,
engine="cupy")
refl_dist[:] = cp.asarray(refl.flatten())
d_dist = VStack @ refl_dist
d = d_dist.asarray().reshape((nstot, nr, nt))

###############################################################################
# We now calculate the adjoint and the inverse using the
# :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
# is required to run with CUDA-aware MPI (this is handled by the MPI
# operators and the DistributedArray). In this particular case, the local
# computations are performed on the GPU, while collective communication
# calls are carried out directly between GPUs by MPI.

# Adjoint
madj_dist = VStack.H @ d_dist
madj = madj_dist.asarray().reshape((nx, nz))
d_adj_dist = VStack @ madj_dist
d_adj = d_adj_dist.asarray().reshape((nstot, nr, nt))

# Inverse
# Initializing x0 to zeroes
x0 = pylops_mpi.DistributedArray(VStack.shape[1],
partition=pylops_mpi.Partition.BROADCAST,
engine="cupy")
x0[:] = 0
minv_dist = pylops_mpi.cgls(VStack, d_dist, x0=x0, niter=100, show=True)[0]
minv = minv_dist.asarray().reshape((nx, nz))
d_inv_dist = VStack @ minv_dist
d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)

###############################################################################
# Finally, we visualize the results. Note that the arrays must be copied back
# to the CPU by calling the :code:`get()` method on the CuPy arrays.

if rank == 0:
# Visualize
fig1, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].imshow(refl.T, cmap="gray", vmin=-1, vmax=1)
axs[0].axis("tight")
axs[0].set_title(r"$m$")
axs[1].imshow(madj.T.get(), cmap="gray", vmin=-madj.max(), vmax=madj.max())
axs[1].set_title(r"$m_{adj}$")
axs[1].axis("tight")
axs[2].imshow(minv.T.get(), cmap="gray", vmin=-1, vmax=1)
axs[2].axis("tight")
axs[2].set_title(r"$m_{inv}$")
plt.tight_layout()

fig2, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].imshow(d[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[0].set_title(r"$d$")
axs[0].axis("tight")
axs[1].imshow(d_adj[0, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
axs[1].set_title(r"$d_{adj}$")
axs[1].axis("tight")
axs[2].imshow(d_inv[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[2].set_title(r"$d_{inv}$")
axs[2].axis("tight")

fig3, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].imshow(d[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[0].set_title(r"$d$")
axs[0].axis("tight")
axs[1].imshow(d_adj[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
axs[1].set_title(r"$d_{adj}$")
axs[1].axis("tight")
axs[2].imshow(d_inv[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[2].set_title(r"$d_{inv}$")
axs[2].axis("tight")
plt.tight_layout()