diff --git a/Makefile b/Makefile
index 6623e5be..7d866764 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,10 @@
 PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null)
 PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null)
 NUM_PROCESSES = 3
-.PHONY: install dev-install dev-install_nccl install_conda install_conda_nccl dev-install_conda dev-install_conda_nccl tests tests_nccl doc docupdate run_examples run_tutorials
+.PHONY: install dev-install dev-install_nccl install_conda \
+	install_conda_nccl dev-install_conda dev-install_conda_nccl \
+	tests tests_nccl doc doc_cupy doc_nccl docupdate \
+	run_examples run_tutorials run_tutorials_cupy run_tutorials_nccl
 
 pipcheck:
 ifndef PIP
@@ -55,12 +58,19 @@ doc:
 	rm -rf source/tutorials && rm -rf build &&\
 	cd .. && sphinx-build -b html docs/source docs/build
 
+doc_cupy:
+	cp tutorials_cupy/* tutorials/
+	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
+	rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
+	cd .. && sphinx-build -b html docs/source docs/build
+	rm tutorials/*_cupy.py
+
 doc_nccl:
-	cp tutorials_nccl/* tutorials/
+	cp tutorials_cupy/* tutorials_nccl/* tutorials/
 	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
 	rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
 	cd .. && sphinx-build -b html docs/source docs/build
-	rm tutorials/*_nccl.py
+	rm tutorials/*_cupy.py tutorials/*_nccl.py
 
 docupdate:
 	cd docs && make html && cd ..
@@ -76,6 +86,10 @@ run_examples:
 run_tutorials:
 	sh mpi_examples.sh tutorials $(NUM_PROCESSES)
 
+# Run tutorials using mpi with cupy arrays
+run_tutorials_cupy:
+	sh mpi_examples.sh tutorials_cupy $(NUM_PROCESSES)
+
 # Run tutorials using nccl
 run_tutorials_nccl:
 	sh mpi_examples.sh tutorials_nccl $(NUM_PROCESSES)
diff --git a/tutorials/lsm.py b/tutorials/lsm.py
index 9db25795..bee504bd 100644
--- a/tutorials/lsm.py
+++ b/tutorials/lsm.py
@@ -152,8 +152,8 @@
 d = d_dist.asarray().reshape((nstot, nr, nt))
 
 ###############################################################################
-
-# Adjoint
+# We now compute the adjoint and model the data using the adjoint
+# reflectivity as input.
 madj_dist = VStack.H @ d_dist
 madj = madj_dist.asarray().reshape((nx, nz))
 d_adj_dist = VStack @ madj_dist
diff --git a/tutorials/mdd.py b/tutorials/mdd.py
index 5946fc05..7913f3f5 100644
--- a/tutorials/mdd.py
+++ b/tutorials/mdd.py
@@ -37,16 +37,16 @@
 # Input parameters
 par = {
     "ox": -300,
-    "dx": 10,
-    "nx": 61,
+    "dx": 5,
+    "nx": 121,
     "oy": -500,
-    "dy": 10,
-    "ny": 101,
+    "dy": 5,
+    "ny": 201,
     "ot": 0,
-    "dt": 0.004,
-    "nt": 400,
+    "dt": 0.002,
+    "nt": 800,
     "f0": 20,
-    "nfmax": 200,
+    "nfmax": 400,
 }
 
 t0_m = 0.2
diff --git a/tutorials_cupy/lsm_cupy.py b/tutorials_cupy/lsm_cupy.py
new file mode 100644
index 00000000..ba1267c7
--- /dev/null
+++ b/tutorials_cupy/lsm_cupy.py
@@ -0,0 +1,200 @@
+r"""
+Least-squares Migration with CUDA-Aware MPI
+===========================================
+This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs
+communicating via MPI.
+""" + +import warnings +warnings.filterwarnings('ignore') + +import numpy as np +import cupy as cp +from matplotlib import pyplot as plt +from mpi4py import MPI + +from pylops.utils.wavelets import ricker +from pylops.waveeqprocessing.lsm import LSM + +import pylops_mpi + +############################################################################### +# The standard MPI communicator is used in this example, so there is no need +# for any initalization. However, we need to assign our GPU resources to the +# different ranks. Here we decide to assign a unique GPU to each process if +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we +# start assigning more than one GPU to the available ranks. Note that this +# approach will work equally well if we have a multi-node multi-GPU setup, where +# each node has one or more GPUs. + +np.random.seed(42) +plt.close("all") +rank = MPI.COMM_WORLD.Get_rank() +size = MPI.COMM_WORLD.Get_size() +device_count = cp.cuda.runtime.getDeviceCount() +cp.cuda.Device(rank % device_count).use(); + +############################################################################### +# Let's start by defining all the parameters required by the +# :py:class:`pylops.waveeqprocessing.LSM` operator. +# Note that this section is exactly the same as the one in the MPI example +# as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) + +# Velocity Model +nx, nz = 81, 60 +dx, dz = 4, 4 +x, z = np.arange(nx) * dx, np.arange(nz) * dz +v0 = 1000 # initial velocity +kv = 0.0 # gradient +vel = np.outer(np.ones(nx), v0 + kv * z) + +# Reflectivity Model +refl = np.zeros((nx, nz), dtype=np.float32) +refl[:, 30] = -1 +refl[:, 50] = 0.5 + +# Receivers +nr = 11 +rx = np.linspace(10 * dx, (nx - 10) * dx, nr) +rz = 20 * np.ones(nr) +recs = np.vstack((rx, rz)) + +# Sources +ns = 10 +# Total number of sources at all ranks +nstot = MPI.COMM_WORLD.allreduce(ns, op=MPI.SUM) +sxtot = np.linspace(dx * 10, (nx - 10) * dx, nstot) +sx = sxtot[rank * ns: (rank + 1) * ns] +sztot = 10 * np.ones(nstot) +sz = 10 * np.ones(ns) +sources = np.vstack((sx, sz)) +sources_tot = np.vstack((sxtot, sztot)) + +if rank == 0: + plt.figure(figsize=(10, 5)) + im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0])) + plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k") + plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", + edgecolors="k") + cb = plt.colorbar(im) + cb.set_label("[m/s]") + plt.axis("tight") + plt.xlabel("x [m]"), plt.ylabel("z [m]") + plt.title("Velocity") + plt.xlim(x[0], x[-1]) + plt.tight_layout() + + plt.figure(figsize=(10, 5)) + im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0])) + plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k") + plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", + edgecolors="k") + plt.colorbar(im) + plt.axis("tight") + plt.xlabel("x [m]"), plt.ylabel("z [m]") + plt.title("Reflectivity") + plt.xlim(x[0], x[-1]) + plt.tight_layout() + +############################################################################### +# We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM` +# operator and initialize the :py:class:`pylops_mpi.DistributedArray` +# reflecitivity object. Compared to the MPI tutorial, we need to make sure that +# we set ``cupy`` as the engine and fill the distributed arrays with CuPy arrays. 
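+###############################################################################
+# Optional sanity check (a sketch, not part of the original tutorial): before
+# building the operator it can be useful to verify that the MPI installation
+# is actually CUDA-aware. A buffer-based collective on small CuPy arrays only
+# succeeds when GPU buffers can be passed directly to MPI; otherwise this
+# typically fails with an MPI error (or a crash).
+
+send_check = cp.ones(8, dtype=np.float32)
+recv_check = cp.empty_like(send_check)
+cp.cuda.get_current_stream().synchronize()  # make sure the buffer is ready
+MPI.COMM_WORLD.Allreduce(send_check, recv_check)
+if rank == 0:
+    print("CUDA-aware Allreduce ok:", bool((recv_check == size).all()))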
+
+# Wavelet
+nt = 651
+dt = 0.004
+t = np.arange(nt) * dt
+wav, wavt, wavc = ricker(t[:41], f0=20)
+
+lsm = LSM(
+    z,
+    x,
+    t,
+    sources,
+    recs,
+    v0,
+    cp.asarray(wav.astype(np.float32)),
+    wavc,
+    mode="analytic",
+    engine="cuda",
+    dtype=np.float32
+)
+lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs.astype(np.float32))
+lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs.astype(np.float32))
+
+VStack = pylops_mpi.MPIVStack(ops=[lsm.Demop, ])
+refl_dist = pylops_mpi.DistributedArray(global_shape=nx * nz,
+                                        partition=pylops_mpi.Partition.BROADCAST,
+                                        engine="cupy")
+refl_dist[:] = cp.asarray(refl.flatten())
+d_dist = VStack @ refl_dist
+d = d_dist.asarray().reshape((nstot, nr, nt))
+
+###############################################################################
+# We now compute the adjoint and the inverse using the
+# :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
+# is required to run with CUDA-aware MPI (this is handled by the MPI
+# operator and DistributedArray). In this particular case, the local
+# computation will be done on the GPUs, whilst collective communication
+# calls will be carried out via GPU-to-GPU MPI.
+
+# Adjoint
+madj_dist = VStack.H @ d_dist
+madj = madj_dist.asarray().reshape((nx, nz))
+d_adj_dist = VStack @ madj_dist
+d_adj = d_adj_dist.asarray().reshape((nstot, nr, nt))
+
+# Inverse
+# Initializing x0 to zeroes
+x0 = pylops_mpi.DistributedArray(VStack.shape[1],
+                                 partition=pylops_mpi.Partition.BROADCAST,
+                                 engine="cupy")
+x0[:] = 0
+minv_dist = pylops_mpi.cgls(VStack, d_dist, x0=x0, niter=100, show=True)[0]
+minv = minv_dist.asarray().reshape((nx, nz))
+d_inv_dist = VStack @ minv_dist
+d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)
+
+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
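+###############################################################################
+# As a small aside (a sketch, not part of the original tutorial): either
+# :code:`get()` or :func:`cupy.asnumpy` can be used to move a result to host
+# memory; the latter is convenient in code that should also accept NumPy
+# arrays, as it returns them unchanged.
+
+madj_cpu = cp.asnumpy(madj)   # equivalent to madj.get() for CuPy arrays
+minv_cpu = cp.asnumpy(minv)
+assert isinstance(madj_cpu, np.ndarray) and isinstance(minv_cpu, np.ndarray)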
+ +if rank == 0: + # Visualize + fig1, axs = plt.subplots(1, 3, figsize=(10, 3)) + axs[0].imshow(refl.T, cmap="gray", vmin=-1, vmax=1) + axs[0].axis("tight") + axs[0].set_title(r"$m$") + axs[1].imshow(madj.T.get(), cmap="gray", vmin=-madj.max(), vmax=madj.max()) + axs[1].set_title(r"$m_{adj}$") + axs[1].axis("tight") + axs[2].imshow(minv.T.get(), cmap="gray", vmin=-1, vmax=1) + axs[2].axis("tight") + axs[2].set_title(r"$m_{inv}$") + plt.tight_layout() + + fig2, axs = plt.subplots(1, 3, figsize=(10, 3)) + axs[0].imshow(d[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[0].set_title(r"$d$") + axs[0].axis("tight") + axs[1].imshow(d_adj[0, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max()) + axs[1].set_title(r"$d_{adj}$") + axs[1].axis("tight") + axs[2].imshow(d_inv[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[2].set_title(r"$d_{inv}$") + axs[2].axis("tight") + + fig3, axs = plt.subplots(1, 3, figsize=(10, 3)) + axs[0].imshow(d[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[0].set_title(r"$d$") + axs[0].axis("tight") + axs[1].imshow(d_adj[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max()) + axs[1].set_title(r"$d_{adj}$") + axs[1].axis("tight") + axs[2].imshow(d_inv[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[2].set_title(r"$d_{inv}$") + axs[2].axis("tight") + plt.tight_layout() diff --git a/tutorials_cupy/mdd_cupy.py b/tutorials_cupy/mdd_cupy.py new file mode 100644 index 00000000..400742c4 --- /dev/null +++ b/tutorials_cupy/mdd_cupy.py @@ -0,0 +1,254 @@ +""" +Multi-Dimensional Deconvolution with CUDA-Aware MPI +=================================================== +This tutorial is an extension of the :ref:`sphx_glr_tutorials_mdd.py` +tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating +via MPI. +""" + +import numpy as np +import cupy as cp +from matplotlib import pyplot as plt +from mpi4py import MPI + +from pylops.utils.seismicevents import hyperbolic2d, makeaxis +from pylops.utils.tapers import taper3d +from pylops.utils.wavelets import ricker + +import pylops_mpi +from pylops_mpi.DistributedArray import local_split, Partition + +############################################################################### +# The standard MPI communicator is used in this example, so there is no need +# for any initalization. However, we need to assign our GPU resources to the +# different ranks. Here we decide to assign a unique GPU to each process if +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we +# start assigning more than one GPU to the available ranks. Note that this +# approach will work equally well if we have a multi-node multi-GPU setup, where +# each node has one or more GPUs. + +plt.close("all") +rank = MPI.COMM_WORLD.Get_rank() +size = MPI.COMM_WORLD.Get_size() +dtype = np.float32 +cdtype = np.complex64 +device_count = cp.cuda.runtime.getDeviceCount() +cp.cuda.Device(rank % device_count).use(); + +############################################################################### +# Let's start by defining all the parameters required by the +# :py:func:`pylops.waveeqprocessing.MPIMDC` operator. +# Note that this section is exactly the same as the one in the MPI example as +# we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) 
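+###############################################################################
+# Optional sanity check (a sketch, not part of the original tutorial): each
+# rank can report which GPU it has been assigned; the mapping simply follows
+# the ``rank % device_count`` rule used above.
+
+assigned = MPI.COMM_WORLD.gather((rank, cp.cuda.Device().id), root=0)
+if rank == 0:
+    for r, dev in assigned:
+        print(f"rank {r} -> GPU {dev}")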
+ +# Input parameters +par = { + "ox": -300, + "dx": 5, + "nx": 121, + "oy": -500, + "dy": 5, + "ny": 201, + "ot": 0, + "dt": 0.002, + "nt": 800, + "f0": 20, + "nfmax": 400, +} + +t0_m = 0.2 +vrms_m = 1100.0 +amp_m = 1.0 + +t0_G = (0.2, 0.5, 0.7) +vrms_G = (1200.0, 1500.0, 2000.0) +amp_G = (1.0, 0.6, 0.5) + +# Taper +tap = taper3d(par["nt"], (par["ny"], par["nx"]), (5, 5), tapertype="hanning") + +# Create axis +t, t2, x, y = makeaxis(par) + +# Create wavelet +wav = ricker(t[:41], f0=par["f0"])[0] + +# Generate model +mrefl, mwav = hyperbolic2d(x, t, t0_m, vrms_m, amp_m, wav) + +# Generate operator +G, Gwav = np.zeros((par["ny"], par["nx"], par["nt"])), np.zeros( + (par["ny"], par["nx"], par["nt"]) +) +for iy, y0 in enumerate(y): + G[iy], Gwav[iy] = hyperbolic2d(x - y0, t, t0_G, vrms_G, amp_G, wav) +G, Gwav = G * tap, Gwav * tap + +# Add negative part to data and model +mrefl = np.concatenate((np.zeros((par["nx"], par["nt"] - 1)), mrefl), axis=-1) +mwav = np.concatenate((np.zeros((par["nx"], par["nt"] - 1)), mwav), axis=-1) +Gwav2 = np.concatenate((np.zeros((par["ny"], par["nx"], par["nt"] - 1)), Gwav), axis=-1) + +# Move to frequency +Gwav_fft = np.fft.rfft(Gwav2, 2 * par["nt"] - 1, axis=-1) +Gwav_fft = (Gwav_fft[..., : par["nfmax"]]) + +# Move frequency/time to first axis +mrefl, mwav = mrefl.T, mwav.T +Gwav_fft = Gwav_fft.transpose(2, 0, 1) + +# Choose how to split frequencies to ranks +nf = par["nfmax"] +nf_rank = local_split((nf,), MPI.COMM_WORLD, Partition.SCATTER, 0) +nf_ranks = np.concatenate(MPI.COMM_WORLD.allgather(nf_rank)) +ifin_rank = np.insert(np.cumsum(nf_ranks)[:-1], 0, 0)[rank] +ifend_rank = np.cumsum(nf_ranks)[rank] + +# Extract batch of frequency slices (in practice, this will be directly +# read from input file) +G = Gwav_fft[ifin_rank:ifend_rank].astype(cdtype) + +############################################################################### +# For MPIMDCOperator, there is no change needed to have it run with +# MPI. This PyLops operator has GPU-support +# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on a +# distributed arrays with engine set to CuPy. 
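+###############################################################################
+# Optional check (a sketch, not part of the original tutorial, using only
+# quantities defined above): before wrapping the kernel into the MPI operator,
+# verify that the per-rank frequency slices tile the full frequency axis.
+
+nf_local = ifend_rank - ifin_rank
+assert G.shape[0] == nf_local
+assert int(np.sum(nf_ranks)) == par["nfmax"]
+if rank == 0:
+    print("frequencies per rank:", nf_ranks)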
+
+# Move operator kernel to GPU
+G = cp.asarray(G)
+
+# Define operator
+MDCop = pylops_mpi.waveeqprocessing.MPIMDC((1.0 * par["dt"] * np.sqrt(par["nt"])) * G,
+                                           nt=2 * par["nt"] - 1, nv=1, nfreq=nf,
+                                           dt=par["dt"], dr=1.0, twosided=True,
+                                           fftengine="numpy", prescaled=True)
+
+# Create model
+m = pylops_mpi.DistributedArray(global_shape=(2 * par["nt"] - 1) * par["nx"] * 1,
+                                partition=Partition.BROADCAST, engine="cupy",
+                                dtype=dtype)
+m[:] = cp.asarray(mrefl.astype(dtype).ravel())
+
+# Create data
+d = MDCop @ m
+dloc = d.asarray().real.reshape(2 * par["nt"] - 1, par["ny"])
+
+###############################################################################
+# Let's display what we have so far: operator, input model, and data
+
+if rank == 0:
+    fig, axs = plt.subplots(1, 2, figsize=(8, 6))
+    axs[0].imshow(
+        Gwav2[int(par["ny"] / 2)].T,
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(Gwav2.max()),
+        vmax=np.abs(Gwav2.max()),
+        extent=(x.min(), x.max(), t2.max(), t2.min()),
+    )
+    axs[0].set_title("G - inline view", fontsize=15)
+    axs[0].set_xlabel(r"$x_R$")
+    axs[0].set_ylabel(r"$t$")
+    axs[1].imshow(
+        Gwav2[:, int(par["nx"] / 2)].T,
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(Gwav2.max()),
+        vmax=np.abs(Gwav2.max()),
+        extent=(y.min(), y.max(), t2.max(), t2.min()),
+    )
+    axs[1].set_title("G - crossline view", fontsize=15)
+    axs[1].set_xlabel(r"$x_S$")
+    axs[1].set_ylabel(r"$t$")
+    fig.tight_layout()
+
+    fig, axs = plt.subplots(1, 2, figsize=(8, 6))
+    axs[0].imshow(
+        mwav,
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(mwav.max()),
+        vmax=np.abs(mwav.max()),
+        extent=(x.min(), x.max(), t2.max(), t2.min()),
+    )
+    axs[0].set_title(r"$m$", fontsize=15)
+    axs[0].set_xlabel(r"$x_R$")
+    axs[0].set_ylabel(r"$t$")
+    axs[1].imshow(
+        dloc.get(),
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(dloc.max()),
+        vmax=np.abs(dloc.max()),
+        extent=(x.min(), x.max(), t2.max(), t2.min()),
+    )
+    axs[1].set_title(r"$d$", fontsize=15)
+    axs[1].set_xlabel(r"$x_S$")
+    axs[1].set_ylabel(r"$t$")
+    fig.tight_layout()
+
+###############################################################################
+# We are now ready to compute the adjoint (i.e., cross-correlation) and invert
+# back for our input model. This computation will be done on the GPUs. The
+# call to :code:`asarray()` triggers CUDA-aware MPI communication (gathering
+# the result from each GPU). Note that the arrays :code:`madjloc` and
+# :code:`minvloc` still live in GPU memory.
+
+
+# Adjoint
+madj = MDCop.H @ d
+madjloc = madj.asarray().real.reshape(2 * par["nt"] - 1, par["nx"])
+
+# Inverse
+m0 = pylops_mpi.DistributedArray(global_shape=(2 * par["nt"] - 1) * par["nx"] * 1,
+                                 partition=Partition.BROADCAST, engine="cupy",
+                                 dtype=cdtype)
+m0[:] = 0
+minv = pylops_mpi.cgls(MDCop, d, x0=m0, niter=50, show=True if rank == 0 else False)[0]
+minvloc = minv.asarray().real.reshape(2 * par["nt"] - 1, par["nx"])
+
+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
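+###############################################################################
+# Optional aside before plotting (a sketch, not part of the original
+# tutorial): since :code:`madjloc` and :code:`minvloc` still live on the GPU,
+# one can inspect how much device memory CuPy's pool is holding and, once the
+# results have been copied to the host, release unused blocks.
+
+pool = cp.get_default_memory_pool()
+if rank == 0:
+    print(f"GPU memory held by CuPy's pool: {pool.used_bytes() / 1e6:.1f} MB")
+# pool.free_all_blocks()  # uncomment to return unused memory to the device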
+ +if rank == 0: + fig = plt.figure(figsize=(8, 6)) + ax1 = plt.subplot2grid((1, 5), (0, 0), colspan=2) + ax2 = plt.subplot2grid((1, 5), (0, 2), colspan=2) + ax3 = plt.subplot2grid((1, 5), (0, 4)) + ax1.imshow( + madjloc.get(), + aspect="auto", + interpolation="nearest", + cmap="gray", + vmin=-np.abs(madjloc.max()), + vmax=np.abs(madjloc.max()), + extent=(x.min(), x.max(), t2.max(), t2.min()), + ) + ax1.set_title("Adjoint m", fontsize=15) + ax1.set_xlabel(r"$x_V$") + ax1.set_ylabel(r"$t$") + ax2.imshow( + minvloc.get(), + aspect="auto", + interpolation="nearest", + cmap="gray", + vmin=-np.abs(minvloc.max()), + vmax=np.abs(minvloc.max()), + extent=(x.min(), x.max(), t2.max(), t2.min()), + ) + ax2.set_title("Inverted m", fontsize=15) + ax2.set_xlabel(r"$x_V$") + ax2.set_ylabel(r"$t$") + ax3.plot( + madjloc[:, int(par["nx"] / 2)].get() / np.abs(madjloc[:, int(par["nx"] / 2)].get()).max(), t2, "r", lw=5 + ) + ax3.plot( + minvloc[:, int(par["nx"] / 2)].get() / np.abs(minvloc[:, int(par["nx"] / 2)].get()).max(), t2, "k", lw=3 + ) + ax3.set_ylim([t2[-1], t2[0]]) + fig.tight_layout() diff --git a/tutorials_cupy/poststack_cupy.py b/tutorials_cupy/poststack_cupy.py new file mode 100644 index 00000000..3b35cb76 --- /dev/null +++ b/tutorials_cupy/poststack_cupy.py @@ -0,0 +1,222 @@ +r""" +Post Stack Inversion - 3D with CUDA-Aware MPI +============================================= +This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` +tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating +via MPI. +""" + +import numpy as np +import cupy as cp +from scipy.signal import filtfilt +from matplotlib import pyplot as plt +from mpi4py import MPI + +from pylops.utils.wavelets import ricker +from pylops.basicoperators import Transpose +from pylops.avo.poststack import PoststackLinearModelling + +import pylops_mpi + +############################################################################### +# The standard MPI communicator is used in this example, so there is no need +# for any initalization. However, we need to assign our GPU resources to the +# different ranks. Here we decide to assign a unique GPU to each process if +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we +# start assigning more than one GPU to the available ranks. Note that this +# approach will work equally well if we have a multi-node multi-GPU setup, where +# each node has one or more GPUs. + +plt.close("all") +rank = MPI.COMM_WORLD.Get_rank() +device_count = cp.cuda.runtime.getDeviceCount() +cp.cuda.Device(rank % device_count).use(); + +############################################################################### +# Let's start by defining all the parameters required by the +# :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator. +# Note that this section is exactly the same as the one in the MPI example as +# we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) 
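+###############################################################################
+# A short aside (a sketch, not part of the original tutorial): metadata such
+# as sizes and shapes are plain Python objects exchanged with the pickle-based
+# lowercase mpi4py API on the host, whereas the large model and data arrays
+# handled by PyLops-MPI stay on the GPU and only move through buffer-based
+# collectives.
+
+ny_example = 20                                   # per-rank size (metadata)
+ny_total = MPI.COMM_WORLD.allreduce(ny_example)   # pickled, host-side exchange
+payload_example = cp.zeros(ny_example, dtype=np.float32)  # device-side payload
+if rank == 0:
+    print("total size gathered from metadata:", ny_total)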
+
+# Model
+model = np.load("../testdata/avo/poststack_model.npz")
+x, z, m = model['x'][::3], model['z'], np.log(model['model'])[:, ::3]
+
+# Making m a 3D model
+ny_i = 20  # size of model in y direction for rank i
+y = np.arange(ny_i)
+m3d_i = np.tile(m[:, :, np.newaxis], (1, 1, ny_i)).transpose((2, 1, 0))
+ny_i, nx, nz = m3d_i.shape
+
+# Size of y at all ranks
+ny = MPI.COMM_WORLD.allreduce(ny_i)
+
+# Smooth model
+nsmoothy, nsmoothx, nsmoothz = 5, 30, 20
+mback3d_i = filtfilt(np.ones(nsmoothy) / float(nsmoothy), 1, m3d_i, axis=0)
+mback3d_i = filtfilt(np.ones(nsmoothx) / float(nsmoothx), 1, mback3d_i, axis=1)
+mback3d_i = filtfilt(np.ones(nsmoothz) / float(nsmoothz), 1, mback3d_i, axis=2)
+
+# Wavelet
+dt = 0.004
+t0 = np.arange(nz) * dt
+ntwav = 41
+wav = ricker(t0[:ntwav // 2 + 1], 15)[0]
+
+# Collecting all the m3d and mback3d at all ranks
+m3d = np.concatenate(MPI.COMM_WORLD.allgather(m3d_i))
+mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i))
+
+###############################################################################
+# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray`
+# objects. Compared to the MPI tutorial, we need to make sure that we set ``cupy``
+# as the engine and fill the distributed arrays with CuPy arrays.
+
+m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                       engine="cupy")
+m3d_dist[:] = cp.asarray(m3d_i.flatten())
+
+# Do the same thing for smooth model
+mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                           engine="cupy")
+mback3d_dist[:] = cp.asarray(mback3d_i.flatten())
+
+###############################################################################
+# For PostStackLinearModelling, there is no change needed to have it run with
+# CUDA-aware MPI. This PyLops operator has GPU support
+# (https://pylops.readthedocs.io/en/stable/gpu.html), so it can operate on
+# distributed arrays whose engine is set to CuPy.
+
+PPop = PoststackLinearModelling(cp.asarray(wav.astype(np.float32)), nt0=nz,
+                                spatdims=(ny_i, nx))
+Top = Transpose((ny_i, nx, nz), (2, 0, 1))
+BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ])
+
+###############################################################################
+# This computation will be done on the GPU(s). The call :code:`asarray()`
+# triggers the MPI communication (gather results from each GPU).
+# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.
+
+d_dist = BDiag @ m3d_dist
+d_local = d_dist.local_array.reshape((ny_i, nx, nz))
+d = d_dist.asarray().reshape((ny, nx, nz))
+d_0_dist = BDiag @ mback3d_dist
+d_0 = d_0_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+# Inversion using CGLS solver - no code change is required to run the solver
+# with CUDA-aware MPI (this is handled by the MPI operator and DistributedArray).
+# In this particular case, the local computation will be done on the GPU.
+# Collective communication calls will be carried through MPI GPU-to-GPU.
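+###############################################################################
+# Optional aside (a sketch, not part of the original tutorial): GPU kernel
+# launches are asynchronous, so when timing the runs below the device should
+# be synchronized before reading the clock.
+
+import time
+
+cp.cuda.Device().synchronize()
+t_start = time.perf_counter()
+_ = BDiag @ m3d_dist
+cp.cuda.Device().synchronize()
+if rank == 0:
+    print(f"one forward modelling pass: {time.perf_counter() - t_start:.3f} s")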
+
+# Inversion using CGLS solver
+minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist,
+                                                      x0=mback3d_dist,
+                                                      niter=100, show=True)[0]
+minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+
+# Regularized inversion with normal equations
+epsR = 1e2
+LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2),
+                                weights=(1, 1, 1),
+                                sampling=(1, 1, 1),
+                                dtype=BDiag.dtype)
+NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp
+dnorm_dist = BDiag.H @ d_dist
+minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist,
+                                                  x0=mback3d_dist,
+                                                  niter=100, show=True)[0]
+minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+
+# Regularized inversion with regularized equations
+StackOp = pylops_mpi.MPIStackedVStack([BDiag, np.sqrt(epsR) * LapOp])
+d0_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+d0_dist[:] = 0.
+dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist])
+
+dnorm_dist = BDiag.H @ d_dist
+minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist,
+                                                     x0=mback3d_dist,
+                                                     niter=100, show=True)[0]
+minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
+
+if rank == 0:
+    # Check the distributed implementation gives the same result
+    # as the one running only on rank0
+    PPop0 = PoststackLinearModelling(wav, nt0=nz, spatdims=(ny, nx))
+    d0 = (PPop0 @ m3d.transpose(2, 0, 1)).transpose(1, 2, 0)
+    d0_0 = (PPop0 @ mback3d.transpose(2, 0, 1)).transpose(1, 2, 0)
+
+    # Check that the distributed and local implementations give the same modelling results
+    print('Distr == Local', np.allclose(cp.asnumpy(d), d0, atol=1e-6))
+    print('Smooth Distr == Local', np.allclose(cp.asnumpy(d_0), d0_0, atol=1e-6))
+
+    # Visualize
+    fig, axs = plt.subplots(nrows=6, ncols=3, figsize=(9, 14), constrained_layout=True)
+    axs[0][0].imshow(m3d[5, :, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[0][0].set_title("Model x-z")
+    axs[0][0].axis("tight")
+    axs[0][1].imshow(m3d[:, 200, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[0][1].set_title("Model y-z")
+    axs[0][1].axis("tight")
+    axs[0][2].imshow(m3d[:, :, 220].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[0][2].set_title("Model x-y")
+    axs[0][2].axis("tight")
+
+    axs[1][0].imshow(mback3d[5, :, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[1][0].set_title("Smooth Model x-z")
+    axs[1][0].axis("tight")
+    axs[1][1].imshow(mback3d[:, 200, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[1][1].set_title("Smooth Model y-z")
+    axs[1][1].axis("tight")
+    axs[1][2].imshow(mback3d[:, :, 220].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[1][2].set_title("Smooth Model x-y")
+    axs[1][2].axis("tight")
+
+    axs[2][0].imshow(d[5, :, :].T.get(), cmap="gray", vmin=-1, vmax=1)
+    axs[2][0].set_title("Data x-z")
+    axs[2][0].axis("tight")
+    axs[2][1].imshow(d[:, 200, :].T.get(), cmap='gray', vmin=-1, vmax=1)
+    axs[2][1].set_title('Data y-z')
+    axs[2][1].axis('tight')
+    axs[2][2].imshow(d[:, :, 220].T.get(), cmap='gray', vmin=-1, vmax=1)
+    axs[2][2].set_title('Data x-y')
+    axs[2][2].axis('tight')
+
+    axs[3][0].imshow(minv3d_iter[5, :, 
:].T.get(), cmap="gist_rainbow", vmin=m.min(), vmax=m.max()) + axs[3][0].set_title("Inverted Model iter x-z") + axs[3][0].axis("tight") + axs[3][1].imshow(minv3d_iter[:, 200, :].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[3][1].set_title('Inverted Model iter y-z') + axs[3][1].axis('tight') + axs[3][2].imshow(minv3d_iter[:, :, 220].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[3][2].set_title('Inverted Model iter x-y') + axs[3][2].axis('tight') + + axs[4][0].imshow(minv3d_ne[5, :, :].T.get(), cmap="gist_rainbow", vmin=m.min(), vmax=m.max()) + axs[4][0].set_title("Normal Equations Inverted Model iter x-z") + axs[4][0].axis("tight") + axs[4][1].imshow(minv3d_ne[:, 200, :].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[4][1].set_title('Normal Equations Inverted Model iter y-z') + axs[4][1].axis('tight') + axs[4][2].imshow(minv3d_ne[:, :, 220].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[4][2].set_title('Normal Equations Inverted Model iter x-y') + axs[4][2].axis('tight') + + axs[5][0].imshow(minv3d_reg[5, :, :].T.get(), cmap="gist_rainbow", vmin=m.min(), vmax=m.max()) + axs[5][0].set_title("Regularized Inverted Model iter x-z") + axs[5][0].axis("tight") + axs[5][1].imshow(minv3d_reg[:, 200, :].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[5][1].set_title('Regularized Inverted Model iter y-z') + axs[5][1].axis('tight') + axs[5][2].imshow(minv3d_reg[:, :, 220].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[5][2].set_title('Regularized Inverted Model iter x-y') + axs[5][2].axis('tight') diff --git a/tutorials_nccl/poststack_nccl.py b/tutorials_nccl/poststack_nccl.py index e6d94374..e81d9323 100644 --- a/tutorials_nccl/poststack_nccl.py +++ b/tutorials_nccl/poststack_nccl.py @@ -1,7 +1,9 @@ r""" Post Stack Inversion - 3D with NCCL -============================================ -This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via NCCL. +=================================== +This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` +tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating +via NCCL. """ import numpy as np @@ -28,7 +30,8 @@ ############################################################################### # Let's start by defining all the parameters required by the # :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator. -# Note that this section is exactly the same as the one in the MPI example as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) +# Note that this section is exactly the same as the one in the MPI example as +# we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) # Model model = np.load("../testdata/avo/poststack_model.npz") @@ -60,28 +63,37 @@ mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i)) ############################################################################### -# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` objects. -# Compared to the MPI tutorial, we need to make sure that we pass :code:`base_comm_nccl = nccl_comm` and set CuPy as the engine. - -m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy") +# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` +# objects. 
Compared to the MPI tutorial, we need to make sure that we pass
+# :code:`base_comm_nccl = nccl_comm`, set ``cupy`` as the engine, and fill the
+# distributed arrays with CuPy arrays.
+
+m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                       base_comm_nccl=nccl_comm,
+                                       engine="cupy")
 m3d_dist[:] = cp.asarray(m3d_i.flatten())
 
 # Do the same thing for smooth model
-mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy")
+mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                           base_comm_nccl=nccl_comm,
+                                           engine="cupy")
 mback3d_dist[:] = cp.asarray(mback3d_i.flatten())
 
 ###############################################################################
-# For PostStackLinearModelling, there is no change needed to have it run with NCCL.
-# This PyLops operator has GPU-support (https://pylops.readthedocs.io/en/stable/gpu.html)
-# so it can run with DistributedArray whose engine is Cupy
+# For PostStackLinearModelling, there is no change needed to have it run
+# with NCCL. This PyLops operator has GPU support
+# (https://pylops.readthedocs.io/en/stable/gpu.html), so it can operate on
+# distributed arrays whose engine is set to CuPy.
 
-PPop = PoststackLinearModelling(wav=cp.asarray(wav), nt0=nz, spatdims=(ny_i, nx))
+PPop = PoststackLinearModelling(wav=cp.asarray(wav), nt0=nz,
+                                spatdims=(ny_i, nx))
 Top = Transpose((ny_i, nx, nz), (2, 0, 1))
 BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ])
 
 ###############################################################################
-# This computation will be done in GPU. The call :code:`asarray()` triggers the NCCL communication (gather result from each GPU).
-# But array :code:`d` and :code:`d_0` still live in GPU memory
+# This computation will be done on the GPU(s). The call :code:`asarray()`
+# triggers the NCCL communication (gather results from each GPU).
+# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.
 
 d_dist = BDiag @ m3d_dist
 d_local = d_dist.local_array.reshape((ny_i, nx, nz))
@@ -90,39 +102,50 @@
 d_0 = d_dist.asarray().reshape((ny, nx, nz))
 
 ###############################################################################
-# Inversion using CGLS solver - There is no code change to have run on NCCL (it handles though MPI operator and DistributedArray)
-# In this particular case, the local computation will be done in GPU. Collective communication calls
-# will be carried through NCCL GPU-to-GPU.
+# Inversion using CGLS solver - no code change is required to run the solver
+# with NCCL (this is handled by the MPI operator and DistributedArray).
+# In this particular case, the local computation will be done on the GPU.
+# Collective communication calls will be carried through NCCL GPU-to-GPU.
 
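+###############################################################################
+# As a side-by-side reminder (a sketch, not part of the original change): the
+# only user-facing difference from the CUDA-aware MPI version of this tutorial
+# is the extra ``base_comm_nccl`` argument of the DistributedArray; operators
+# and solvers are used in exactly the same way.
+
+# CUDA-aware MPI flavour (as in tutorials_cupy/poststack_cupy.py)
+x_mpi = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+# NCCL flavour (this tutorial); ``nccl_comm`` is the communicator created
+# earlier in this script
+x_nccl = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                     base_comm_nccl=nccl_comm,
+                                     engine="cupy")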
# Inversion using CGLS solver -minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, x0=mback3d_dist, niter=100, show=True)[0] +minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, + x0=mback3d_dist, + niter=100, show=True)[0] minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz)) ############################################################################### # Regularized inversion with normal equations epsR = 1e2 -LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), weights=(1, 1, 1), - sampling=(1, 1, 1), dtype=BDiag.dtype) +LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), + weights=(1, 1, 1), + sampling=(1, 1, 1), + dtype=BDiag.dtype) NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp dnorm_dist = BDiag.H @ d_dist -minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, x0=mback3d_dist, niter=100, show=True)[0] +minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, + x0=mback3d_dist, + niter=100, show=True)[0] minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz)) ############################################################################### # Regularized inversion with regularized equations StackOp = pylops_mpi.MPIStackedVStack([BDiag, np.sqrt(epsR) * LapOp]) -d0_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy") +d0_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, + base_comm_nccl=nccl_comm, engine="cupy") d0_dist[:] = 0. dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist]) dnorm_dist = BDiag.H @ d_dist -minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, x0=mback3d_dist, niter=100, show=True)[0] +minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, + x0=mback3d_dist, + niter=100, show=True)[0] minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz)) ############################################################################### -# To plot the inversion results, the array must be copied back to cpu via :code:`get()` +# Finally we visualize the results. Note that the array must be copied back +# to the CPU by calling the :code:`get()` method on the CuPy arrays. if rank == 0: # Check the distributed implementation gives the same result