r"""
Least-squares Migration with CUDA-Aware MPI
===========================================
This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
tutorial, where PyLops-MPI is run in a multi-GPU setting with GPUs
communicating via CUDA-Aware MPI.
"""

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import cupy as cp
from matplotlib import pyplot as plt
from mpi4py import MPI

from pylops.utils.wavelets import ricker
from pylops.waveeqprocessing.lsm import LSM

import pylops_mpi

np.random.seed(42)
plt.close("all")
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
cp.cuda.Device(device=rank).use()
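# The line above assigns one GPU per MPI rank. As an optional safeguard (an
# assumption of this write-up, not part of the original tutorial), the rank
# can be mapped to a device modulo the number of visible GPUs, so the script
# also runs when there are more ranks than GPUs on the node:
#
#   cp.cuda.Device(rank % cp.cuda.runtime.getDeviceCount()).use()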

###############################################################################
# Let's start with a simple model with two interfaces, where sources are
# distributed across different ranks.
# Note that this section is exactly the same as the one in the MPI example,
# as we will keep using MPI to transfer metadata (i.e., shapes, dims, etc.)

# Velocity Model
nx, nz = 81, 60
dx, dz = 4, 4
x, z = np.arange(nx) * dx, np.arange(nz) * dz
v0 = 1000  # initial velocity
kv = 0.0  # gradient
vel = np.outer(np.ones(nx), v0 + kv * z)

# Reflectivity Model
refl = np.zeros((nx, nz), dtype=np.float32)
refl[:, 30] = -1
refl[:, 50] = 0.5

# Receivers
nr = 11
rx = np.linspace(10 * dx, (nx - 10) * dx, nr)
rz = 20 * np.ones(nr)
recs = np.vstack((rx, rz))

# Sources
ns = 10
# Total number of sources at all ranks
nstot = MPI.COMM_WORLD.allreduce(ns, op=MPI.SUM)
sxtot = np.linspace(dx * 10, (nx - 10) * dx, nstot)
sx = sxtot[rank * ns: (rank + 1) * ns]
sztot = 10 * np.ones(nstot)
sz = 10 * np.ones(ns)
sources = np.vstack((sx, sz))
sources_tot = np.vstack((sxtot, sztot))

if rank == 0:
    plt.figure(figsize=(10, 5))
    im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0]))
    plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
    plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
    cb = plt.colorbar(im)
    cb.set_label("[m/s]")
    plt.axis("tight")
    plt.xlabel("x [m]"), plt.ylabel("z [m]")
    plt.title("Velocity")
    plt.xlim(x[0], x[-1])
    plt.tight_layout()

    plt.figure(figsize=(10, 5))
    im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0]))
    plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
    plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
    plt.colorbar(im)
    plt.axis("tight")
    plt.xlabel("x [m]"), plt.ylabel("z [m]")
    plt.title("Reflectivity")
    plt.xlim(x[0], x[-1])
    plt.tight_layout()

###############################################################################
# We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM`
# operator and initialize the :py:class:`pylops_mpi.DistributedArray`
# reflectivity object. Compared to the MPI tutorial, we need to make sure that
# we set CuPy as the engine and use CuPy arrays.

# Wavelet
nt = 651
dt = 0.004
t = np.arange(nt) * dt
wav, wavt, wavc = ricker(t[:41], f0=20)

lsm = LSM(
    z,
    x,
    t,
    sources,
    recs,
    v0,
    cp.asarray(wav.astype(np.float32)),
    wavc,
    mode="analytic",
    engine="cuda",
    dtype=np.float32
)
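# The traveltime tables computed by LSM (lsm.Demop.trav_srcs / trav_recs) are
# NumPy arrays; since the modelling engine will run on the GPU, we cast them
# to float32 and move them to the device.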
lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs.astype(np.float32))
lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs.astype(np.float32))

VStack = pylops_mpi.MPIVStack(ops=[lsm.Demop, ])
refl_dist = pylops_mpi.DistributedArray(global_shape=nx * nz,
                                        partition=pylops_mpi.Partition.BROADCAST,
                                        engine="cupy")
refl_dist[:] = cp.asarray(refl.flatten())
d_dist = VStack @ refl_dist
d = d_dist.asarray().reshape((nstot, nr, nt))
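# Note: each rank only models its own shots; d_dist.local_array holds the
# rank-local portion of the data, while asarray() gathers the full dataset
# (here assumed to fit in each GPU's memory) on every rank.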

###############################################################################
# We now compute the adjoint and the inverse using the
# :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change is
# required to run with CUDA-aware MPI: this is handled transparently by the
# MPI operators and the DistributedArray. In this particular case, the local
# computations are performed on the GPU, while collective communication calls
# move data directly from GPU to GPU.
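# For instance, the adjoint of MPIVStack reduces the per-rank contributions
# (an allreduce under the hood), and with CUDA-aware MPI this reduction can
# operate directly on GPU buffers without staging through host memory.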

# Adjoint
madj_dist = VStack.H @ d_dist
madj = madj_dist.asarray().reshape((nx, nz))
d_adj_dist = VStack @ madj_dist
d_adj = d_adj_dist.asarray().reshape((nstot, nr, nt))

# Inverse
# Initialize x0 to zero
x0 = pylops_mpi.DistributedArray(VStack.shape[1],
                                 partition=pylops_mpi.Partition.BROADCAST,
                                 engine="cupy")
x0[:] = 0
minv_dist = pylops_mpi.cgls(VStack, d_dist, x0=x0, niter=100, show=True)[0]
minv = minv_dist.asarray().reshape((nx, nz))
d_inv_dist = VStack @ minv_dist
d_inv = d_inv_dist.asarray().reshape((nstot, nr, nt))
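
# As a quick, illustrative check (not part of the original tutorial), we can
# quantify the data misfit of the inverted model; the norms are computed on
# the GPU and only the resulting scalar is brought back to the host.
res = float(cp.linalg.norm(d_inv - d) / cp.linalg.norm(d))
if rank == 0:
    print(f"Relative data residual after inversion: {res:.4f}")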

###############################################################################
if rank == 0:
    # Visualize
    fig1, axs = plt.subplots(1, 3, figsize=(10, 3))
    axs[0].imshow(refl.T, cmap="gray", vmin=-1, vmax=1)
    axs[0].axis("tight")
    axs[0].set_title(r"$m$")
    axs[1].imshow(madj.T.get(), cmap="gray", vmin=-madj.max(), vmax=madj.max())
    axs[1].set_title(r"$m_{adj}$")
    axs[1].axis("tight")
    axs[2].imshow(minv.T.get(), cmap="gray", vmin=-1, vmax=1)
    axs[2].axis("tight")
    axs[2].set_title(r"$m_{inv}$")
    plt.tight_layout()

    fig2, axs = plt.subplots(1, 3, figsize=(10, 3))
    axs[0].imshow(d[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[0].set_title(r"$d$")
    axs[0].axis("tight")
    axs[1].imshow(d_adj[0, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
    axs[1].set_title(r"$d_{adj}$")
    axs[1].axis("tight")
    axs[2].imshow(d_inv[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[2].set_title(r"$d_{inv}$")
    axs[2].axis("tight")

    fig3, axs = plt.subplots(1, 3, figsize=(10, 3))
    axs[0].imshow(d[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[0].set_title(r"$d$")
    axs[0].axis("tight")
    axs[1].imshow(d_adj[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max())
    axs[1].set_title(r"$d_{adj}$")
    axs[1].axis("tight")
    axs[2].imshow(d_inv[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max())
    axs[2].set_title(r"$d_{inv}$")
    axs[2].axis("tight")
    plt.tight_layout()