Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null)
PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null)
NUM_PROCESSES = 3

.PHONY: install dev-install dev-install_nccl install_conda install_conda_nccl dev-install_conda dev-install_conda_nccl tests tests_nccl doc docupdate run_examples run_tutorials
.PHONY: install dev-install dev-install_nccl install_ \
conda install_conda_nccl dev-install_conda dev-install_conda_nccl \
tests tests_nccl doc doc_cupy doc_nccl docupdate \
run_examples run_tutorials run_tutorials_cupy run_tutorials_nccl

pipcheck:
ifndef PIP
Expand Down Expand Up @@ -55,12 +58,19 @@ doc:
rm -rf source/tutorials && rm -rf build &&\
cd .. && sphinx-build -b html docs/source docs/build

doc_cupy:
cp tutorials_cupy/* tutorials/
cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
cd .. && sphinx-build -b html docs/source docs/build
rm tutorials/*_cupy.py

doc_nccl:
cp tutorials_nccl/* tutorials/
cp tutorials_cupy/* tutorials_nccl/* tutorials/
cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
cd .. && sphinx-build -b html docs/source docs/build
rm tutorials/*_nccl.py
rm tutorials/*_cupy.py tutorials/*_nccl.py

docupdate:
cd docs && make html && cd ..
Expand All @@ -76,6 +86,10 @@ run_examples:
run_tutorials:
sh mpi_examples.sh tutorials $(NUM_PROCESSES)

# Run tutorials using mpi with cupy arrays
run_tutorials_cupy:
sh mpi_examples.sh tutorials_cupy $(NUM_PROCESSES)

# Run tutorials using nccl
run_tutorials_nccl:
sh mpi_examples.sh tutorials_nccl $(NUM_PROCESSES)
4 changes: 2 additions & 2 deletions tutorials/lsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,8 @@
d = d_dist.asarray().reshape((nstot, nr, nt))

###############################################################################

# Adjoint
# We calculate now the adjoint and model the data using the adjoint reflectivity
# as input.
madj_dist = VStack.H @ d_dist
madj = madj_dist.asarray().reshape((nx, nz))
d_adj_dist = VStack @ madj_dist
Expand Down
14 changes: 7 additions & 7 deletions tutorials/mdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,16 @@
# Input parameters
par = {
"ox": -300,
"dx": 10,
"nx": 61,
"dx": 5,
"nx": 121,
"oy": -500,
"dy": 10,
"ny": 101,
"dy": 5,
"ny": 201,
"ot": 0,
"dt": 0.004,
"nt": 400,
"dt": 0.002,
"nt": 800,
"f0": 20,
"nfmax": 200,
"nfmax": 400,
}

t0_m = 0.2
Expand Down
186 changes: 186 additions & 0 deletions tutorials_cupy/lsm_cupy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
r"""
Least-squares Migration with CUDA-Aware MPI
===========================================
This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
CUDA-Aware MPI.
"""

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import cupy as cp
from matplotlib import pyplot as plt
from mpi4py import MPI

from pylops.utils.wavelets import ricker
from pylops.waveeqprocessing.lsm import LSM

import pylops_mpi

np.random.seed(42)
plt.close("all")
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
device_count = cp.cuda.runtime.getDeviceCount()
cp.cuda.Device(rank % device_count).use()

###############################################################################
# Let's start with a simple model with two interfaces, where sources are
# distributed across different ranks.
# Note that this section is exactly the same as the one in the MPI example
# as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.)

# Velocity Model
nx, nz = 81, 60
dx, dz = 4, 4
x, z = np.arange(nx) * dx, np.arange(nz) * dz
v0 = 1000 # initial velocity
kv = 0.0 # gradient
vel = np.outer(np.ones(nx), v0 + kv * z)

# Reflectivity Model
refl = np.zeros((nx, nz), dtype=np.float32)
refl[:, 30] = -1
refl[:, 50] = 0.5

# Receivers
nr = 11
rx = np.linspace(10 * dx, (nx - 10) * dx, nr)
rz = 20 * np.ones(nr)
recs = np.vstack((rx, rz))

# Sources
ns = 10
# Total number of sources at all ranks
nstot = MPI.COMM_WORLD.allreduce(ns, op=MPI.SUM)
sxtot = np.linspace(dx * 10, (nx - 10) * dx, nstot)
sx = sxtot[rank * ns: (rank + 1) * ns]
sztot = 10 * np.ones(nstot)
sz = 10 * np.ones(ns)
sources = np.vstack((sx, sz))
sources_tot = np.vstack((sxtot, sztot))

if rank == 0:
plt.figure(figsize=(10, 5))
im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0]))
plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
cb = plt.colorbar(im)
cb.set_label("[m/s]")
plt.axis("tight")
plt.xlabel("x [m]"), plt.ylabel("z [m]")
plt.title("Velocity")
plt.xlim(x[0], x[-1])
plt.tight_layout()

plt.figure(figsize=(10, 5))
im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0]))
plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
plt.colorbar(im)
plt.axis("tight")
plt.xlabel("x [m]"), plt.ylabel("z [m]")
plt.title("Reflectivity")
plt.xlim(x[0], x[-1])
plt.tight_layout()

###############################################################################
# We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM`
# operator and initialize the :py:class:`pylops_mpi.DistributedArray`
# reflecitivity object. Compared to the MPI tutorial, we need to make sure that
# we set CuPy as the engine and use CuPy arrays

# Wavelet
nt = 651
dt = 0.004
t = np.arange(nt) * dt
wav, wavt, wavc = ricker(t[:41], f0=20)

lsm = LSM(
z,
x,
t,
sources,
recs,
v0,
cp.asarray(wav.astype(np.float32)),
wavc,
mode="analytic",
engine="cuda",
dtype=np.float32
)
lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs.astype(np.float32))
lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs.astype(np.float32))

VStack = pylops_mpi.MPIVStack(ops=[lsm.Demop, ])
refl_dist = pylops_mpi.DistributedArray(global_shape=nx * nz,
partition=pylops_mpi.Partition.BROADCAST,
engine="cupy")
refl_dist[:] = cp.asarray(refl.flatten())
d_dist = VStack @ refl_dist
d = d_dist.asarray().reshape((nstot, nr, nt))

###############################################################################
# We calculate now the adjoint and the inverse using the
# :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
# is required to run on CUDA-aware
# MPI (this is handled through MPI operator and DistributedArray)
# In this particular case, the local computation will be done in GPU.
# Collective communication calls will be carried through MPI GPU-to-GPU.

# Adjoint
madj_dist = VStack.H @ d_dist
madj = madj_dist.asarray().reshape((nx, nz))
d_adj_dist = VStack @ madj_dist
d_adj = d_adj_dist.asarray().reshape((nstot, nr, nt))

# Inverse
# Initializing x0 to zeroes
x0 = pylops_mpi.DistributedArray(VStack.shape[1],
partition=pylops_mpi.Partition.BROADCAST,
engine="cupy")
x0[:] = 0
minv_dist = pylops_mpi.cgls(VStack, d_dist, x0=x0, niter=100, show=True)[0]
minv = minv_dist.asarray().reshape((nx, nz))
d_inv_dist = VStack @ minv_dist
d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)

###############################################################################
if rank == 0:
# Visualize
fig1, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].imshow(refl.T, cmap="gray", vmin=-1, vmax=1)
axs[0].axis("tight")
axs[0].set_title(r"$m$")
axs[1].imshow(madj.T.get(), cmap="gray", vmin=-madj.max(), vmax=madj.max())
axs[1].set_title(r"$m_{adj}$")
axs[1].axis("tight")
axs[2].imshow(minv.T.get(), cmap="gray", vmin=-1, vmax=1)
axs[2].axis("tight")
axs[2].set_title(r"$m_{inv}$")
plt.tight_layout()

fig2, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].imshow(d[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[0].set_title(r"$d$")
axs[0].axis("tight")
axs[1].imshow(d_adj[0, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
axs[1].set_title(r"$d_{adj}$")
axs[1].axis("tight")
axs[2].imshow(d_inv[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[2].set_title(r"$d_{inv}$")
axs[2].axis("tight")

fig3, axs = plt.subplots(1, 3, figsize=(10, 3))
axs[0].imshow(d[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[0].set_title(r"$d$")
axs[0].axis("tight")
axs[1].imshow(d_adj[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
axs[1].set_title(r"$d_{adj}$")
axs[1].axis("tight")
axs[2].imshow(d_inv[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
axs[2].set_title(r"$d_{inv}$")
axs[2].axis("tight")
plt.tight_layout()
Loading