|
1 | 1 | """ |
2 | 2 | Distributed Matrix Multiplication |
3 | 3 | ================================= |
4 | | -This example shows how to use the :py:class:`pylops_mpi.basicoperators.MatrixMult.MPIMatrixMult`. |
5 | | -This class provides a way to distribute arrays across multiple processes in |
6 | | -a parallel computing environment. |
| 4 | +This example shows how to use the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult` |
| 5 | +operator to perform matrix-matrix multiplication between a matrix :math:`\mathbf{A}` |
| 6 | +blocked over rows (i.e., blocks of rows are stored over different ranks) and a |
| 7 | +matrix :math:`\mathbf{X}` blocked over columns (i.e., blocks of columns are |
| 8 | +stored over different ranks), with an equal number of row and column blocks. |
| 9 | +Similarly, the adjoint operation can be performed with a matrix :math:`\mathbf{Y}` |
| 10 | +blocked in the same fashion as matrix :math:`\mathbf{X}`. |
| 11 | +
|
| 12 | +Note that whilst the different blocks of the matrix :math:`\mathbf{A}` are directly |
| 13 | +stored in the operator on different ranks, the matrix :math:`\mathbf{X}` is |
| 14 | +effectively represented by a 1-D :py:class:`pylops_mpi.DistributedArray` where |
| 15 | +the different blocks are flattened and stored on different ranks. Moreover, to |
| 16 | +optimize communications, the ranks are organized in a 2D grid and some of the |
| 17 | +row blocks of :math:`\mathbf{A}` and column blocks of :math:`\mathbf{X}` are |
| 18 | +replicated across different ranks - see below for details. |
| 19 | +
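| | +To run this example one would typically use an MPI launcher with a square |
| | +number of processes, for instance ``mpiexec -n 4 python <this_script>.py`` |
| | +(the exact launcher and script name depend on your installation and setup). |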
|
7 | 20 | """ |
8 | 21 | from matplotlib import pyplot as plt |
9 | 22 | import math |
|
14 | 27 | from pylops_mpi.basicoperators.MatrixMult import MPIMatrixMult |
15 | 28 |
|
16 | 29 | plt.close("all") |
| 30 | + |
17 | 31 | ############################################################################### |
18 | | -# We set the seed such that all processes initially start out with the same initial matrix. |
19 | | -# Ideally this data would be loaded in a manner appropriate to the use-case. |
| 32 | +# We set the seed such that all processes create the input matrices filled |
| 33 | +# with the same random numbers. In practical applications, such matrices will |
| 34 | +# be filled with data that is appropriate to the use-case. |
20 | 35 | np.random.seed(42) |
21 | 36 |
|
22 | | -# MPI parameters |
| 37 | +############################################################################### |
| 38 | +# Next we obtain the MPI parameters for each rank and check that the number |
| 39 | +# of processes (``size``) is a square number. |
23 | 40 | comm = MPI.COMM_WORLD |
24 | 41 | rank = comm.Get_rank() # rank of current process |
25 | 42 | size = comm.Get_size() # number of processes |
26 | 43 |
|
27 | 44 | p_prime = int(math.ceil(math.sqrt(size))) |
28 | | -C = int(math.ceil(size / p_prime)) |
| 45 | +repl_factor = int(math.ceil(size / p_prime)) |
29 | 46 |
|
30 | | -if (p_prime * C) != size: |
31 | | - print("No. of procs has to be a square number") |
| 47 | +if (p_prime * repl_factor) != size: |
| 48 | + print(f"Number of processes must be a square number, provided {size} instead...") |
32 | 49 | exit(-1) |
33 | 50 |
|
34 | | -# matrix dims |
| 51 | +############################################################################### |
| 52 | +# We are now ready to create the input matrix :math:`\mathbf{A}` of size |
| 53 | +# :math:`M \times K` and the input matrix :math:`\mathbf{X}` of size |
| 54 | +# :math:`K \times N`. |
35 | 55 | M, K, N = 4, 4, 4 |
36 | 56 | A = np.random.rand(M * K).astype(dtype=np.float32).reshape(M, K) |
37 | 57 | X = np.random.rand(K * N).astype(dtype=np.float32).reshape(K, N) |
| 58 | + |
38 | 59 | ################################################################################ |
39 | | -#Process Grid Organization |
40 | | -#************************* |
41 | | -# |
42 | | -#The processes are arranged in a :math:`\sqrt{P} \times \sqrt{P}` grid, where :math:`P` is the total number of processes. |
| 60 | +# The processes are now arranged in a :math:`\sqrt{P} \times \sqrt{P}` grid, |
| 61 | +# where :math:`P` is the total number of processes. |
43 | 62 | # |
44 | | -#Define |
| 63 | +# We define |
45 | 64 | # |
46 | | -#.. math:: |
47 | | -# P' = \bigl \lceil \sqrt{P} \bigr \rceil |
| 65 | +# .. math:: |
| 66 | +# P' = \bigl \lceil \sqrt{P} \bigr \rceil |
48 | 67 | # |
49 | | -#and the replication factor |
| 68 | +# and the replication factor |
50 | 69 | # |
51 | | -#.. math:: |
52 | | -# C = \bigl\lceil \tfrac{P}{P'} \bigr\rceil. |
| 70 | +# .. math:: |
| 71 | +# R = \bigl\lceil \tfrac{P}{P'} \bigr\rceil. |
53 | 72 | # |
54 | | -#Each process is assigned a pair of coordinates :math:`(g, l)` within this grid: |
| 73 | +# Each process is therefore assigned a pair of coordinates |
| 74 | +# :math:`(g, l)` within this grid: |
55 | 75 | # |
56 | | -#.. math:: |
57 | | -# g = \mathrm{rank} \bmod P', |
58 | | -# \quad |
59 | | -# l = \left\lfloor \frac{\mathrm{rank}}{P'} \right\rfloor. |
| 76 | +# .. math:: |
| 77 | +# g = \mathrm{rank} \bmod P', |
| 78 | +# \quad |
| 79 | +# l = \left\lfloor \frac{\mathrm{rank}}{P'} \right\rfloor. |
60 | 80 | # |
61 | 81 | #For example, when :math:`P = 4` we have :math:`P' = 2`, giving a 2×2 layout: |
62 | 82 | # |
|
75 | 95 | my_group = rank % p_prime |
76 | 96 | my_layer = rank // p_prime |
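| | +# For example, with P = 4 processes (and thus P' = 2), the (group, layer) |
| | +# coordinates are: rank 0 -> (0, 0), rank 1 -> (1, 0), rank 2 -> (0, 1) |
| | +# and rank 3 -> (1, 1). |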
77 | 97 |
|
78 | | -# Create the sub‐communicators |
| 98 | +# Create sub‐communicators |
79 | 99 | layer_comm = comm.Split(color=my_layer, key=my_group) # all procs in same layer |
80 | 100 | group_comm = comm.Split(color=my_group, key=my_layer) # all procs in same group |
81 | 101 |
|
82 | | -blk_rows = int(math.ceil(M / p_prime)) |
83 | | -blk_cols = int(math.ceil(N / p_prime)) |
84 | | - |
85 | | -rs = my_group * blk_rows |
86 | | -re = min(M, rs + blk_rows) |
87 | | -my_own_rows = re - rs |
88 | | - |
89 | | -cs = my_layer * blk_cols |
90 | | -ce = min(N, cs + blk_cols) |
91 | | -my_own_cols = ce - cs |
92 | | - |
93 | 102 | ################################################################################ |
94 | | -#Each rank will end up with: |
95 | | -# - :math:`A_{p} \in \mathbb{R}^{\text{my_own_rows}\times K}` |
96 | | -# - :math:`X_{p} \in \mathbb{R}^{K\times \text{my_own_cols}}` |
97 | | -#as follows: |
98 | | -A_p, X_p = A[rs:re, :].copy(), X[:, cs:ce].copy() |
99 | | - |
100 | | -################################################################################ |
101 | | -#.. raw:: html |
| 103 | +# At this point we divide the rows and columns of :math:`\mathbf{A}` and |
| 104 | +# :math:`\mathbf{X}`, respectively, such that each rank ends up with: |
| 105 | +# |
| 106 | +# - :math:`A_{p} \in \mathbb{R}^{\text{my_own_rows}\times K}` |
| 107 | +# - :math:`X_{p} \in \mathbb{R}^{K\times \text{my_own_cols}}` |
| 108 | +# |
| 109 | +# .. raw:: html |
102 | 110 | # |
103 | 111 | # <div style="text-align: left; font-family: monospace; white-space: pre;"> |
104 | 112 | # <b>Matrix A (4 x 4):</b> |
|
111 | 119 | # └─────────────────┘ |
112 | 120 | # </div> |
113 | 121 | # |
114 | | -#.. raw:: html |
| 122 | +# .. raw:: html |
115 | 123 | # |
116 | 124 | # <div style="text-align: left; font-family: monospace; white-space: pre;"> |
117 | | -# <b>Matrix B (4 x 4):</b> |
| 125 | +# <b>Matrix X (4 x 4):</b> |
118 | 126 | # ┌─────────┬─────────┐ |
119 | 127 | # │ b11 b12 │ b13 b14 │ <- Cols 0–1 (Layer 0), Cols 2–3 (Layer 1) |
120 | 128 | # │ b21 b22 │ b23 b24 │ |
|
125 | 133 | # </div> |
126 | 134 | # |
127 | 135 |
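| | +# Compute the extent of the block of rows of A (determined by the group index) |
| | +# and of the block of columns of X (determined by the layer index) owned by |
| | +# this rank |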
|
| 136 | +blk_rows = int(math.ceil(M / p_prime)) |
| 137 | +blk_cols = int(math.ceil(N / p_prime)) |
| 138 | + |
| 139 | +rs = my_group * blk_rows |
| 140 | +re = min(M, rs + blk_rows) |
| 141 | +my_own_rows = re - rs |
| 142 | + |
| 143 | +cs = my_layer * blk_cols |
| 144 | +ce = min(N, cs + blk_cols) |
| 145 | +my_own_cols = ce - cs |
| 146 | + |
| 147 | +A_p, X_p = A[rs:re, :].copy(), X[:, cs:ce].copy() |
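| | +# Note that ranks in the same group hold the same row block of A (replicated |
| | +# across layers), whilst ranks in the same layer hold the same column block |
| | +# of X (replicated across groups). |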
| 148 | + |
128 | 149 | ################################################################################ |
129 | | -#Forward Operation |
130 | | -#***************** |
131 | | -#To perform our distributed matrix-matrix multiplication :math:`Y = \text{Aop} \times X` we need to create our distributed operator :math:`\text{Aop}` and distributed operand :math:`X` from :math:`A_p` and |
132 | | -#:math:`X_p` respectively |
| 150 | +# We are now ready to create the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult` |
| 151 | +# operator and the input matrix :math:`\mathbf{X}`. |
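| | +# Each rank instantiates the operator with its own row block A_p and with the |
| | +# total number of columns N of X (and, therefore, of the output Y). |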
133 | 152 | Aop = MPIMatrixMult(A_p, N, dtype="float32") |
134 | | -################################################################################ |
135 | | -# While as well passing the appropriate values. |
| 153 | + |
136 | 154 | col_lens = comm.allgather(my_own_cols) |
137 | | -total_cols = np.sum(col_lens) |
138 | | -x = DistributedArray(global_shape=K * total_cols, |
| 155 | +x = DistributedArray(global_shape=K * N, |
139 | 156 | local_shapes=[K * col_len for col_len in col_lens], |
140 | 157 | partition=Partition.SCATTER, |
141 | | - mask=[i // p_prime for i in range(comm.Get_size())], |
| 158 | + mask=[i % p_prime for i in range(comm.Get_size())], |
142 | 159 | base_comm=comm, |
143 | 160 | dtype="float32") |
144 | 161 | x[:] = X_p.flatten() |
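| | +# Ranks sharing the same mask value (i.e., belonging to the same group) hold, |
| | +# taken together, one full copy of X; the mask informs the DistributedArray of |
| | +# this replication when performing global operations such as dot products and |
| | +# norms. |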
| 162 | + |
145 | 163 | ################################################################################ |
146 | | -#When we perform the matrix-matrix multiplication we shall then obtain a distributed :math:`Y` in the same way our :math:`X` was distributed. |
| 164 | +# We can now apply the forward pass :math:`\mathbf{y} = \mathbf{Ax}` (which effectively |
| 165 | +# implements a distributed matrix-matrix multiplication :math:`\mathbf{Y} = \mathbf{AX}`). |
| 166 | +# Note that :math:`\mathbf{Y}` is distributed in the same way as the input |
| 167 | +# :math:`\mathbf{X}`. |
147 | 168 | y = Aop @ x |
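| | +# On each rank, ``y.local_array`` contains the flattened block Y[:, cs:ce], |
| | +# i.e., the same block of columns that the rank holds of X. |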
| 169 | + |
148 | 170 | ############################################################################### |
149 | | -#Adjoint Operation |
150 | | -#***************** |
151 | | -# In a similar fashion we then perform the Adjoint :math:`Xadj = A^H * Y` |
| 171 | +# Next we apply the adjoint pass :math:`\mathbf{x}_{adj} = \mathbf{A}^H \mathbf{y}` |
| 172 | +# (which effectively implements a distributed matrix-matrix multiplication |
| 173 | +# :math:`\mathbf{X}_{adj} = \mathbf{A}^H \mathbf{Y}`). Note that |
| 174 | +# :math:`\mathbf{X}_{adj}` is again distributed in the same way as the input |
| 175 | +# :math:`\mathbf{X}`. |
152 | 176 | xadj = Aop.H @ y |
| 177 | + |
153 | 178 | ############################################################################### |
154 | | -#Here we verify the result against the equivalent serial version of the operation. Each rank checks that it has computed the correct values for it partition. |
| 179 | +# To conclude, we verify our result against the equivalent serial version of |
| 180 | +# the operation by gathering the resulting matrices on rank 0 and reorganizing |
| 181 | +# the returned 1D arrays into 2D arrays. |
| 182 | + |
| 183 | +# Serial reference results (computed on every rank for comparison) |
155 | 184 | y_loc = A @ X |
156 | 185 | xadj_loc = (A.T.dot(y_loc.conj())).conj() |
157 | 186 |
|
158 | | -expected_y_loc = y_loc[:, cs:ce].flatten().astype(np.float32) |
159 | | -expected_xadj_loc = xadj_loc[:, cs:ce].flatten().astype(np.float32) |
160 | | - |
161 | | -if not np.allclose(y.local_array, expected_y_loc, rtol=1e-6): |
162 | | - print(f"RANK {rank}: FORWARD VERIFICATION FAILED") |
163 | | - print(f'{rank} local: {y.local_array}, expected: {y_loc[:, cs:ce]}') |
164 | | -else: |
165 | | - print(f"RANK {rank}: FORWARD VERIFICATION PASSED") |
166 | | - |
167 | | -if not np.allclose(xadj.local_array, expected_xadj_loc, rtol=1e-6): |
168 | | - print(f"RANK {rank}: ADJOINT VERIFICATION FAILED") |
169 | | - print(f'{rank} local: {xadj.local_array}, expected: {xadj_loc[:, cs:ce]}') |
170 | | -else: |
171 | | - print(f"RANK {rank}: ADJOINT VERIFICATION PASSED") |
172 | | - |
| 187 | +y = y.asarray(masked=True) |
| 188 | +if N > 1: |
| 189 | + y = y.reshape(p_prime, M, blk_cols) |
| 190 | +y = np.hstack([yblock for yblock in y]) |
| 191 | +xadj = xadj.asarray(masked=True) |
| 192 | +if N > 1: |
| 193 | + xadj = xadj.reshape(p_prime, K, blk_cols) |
| 194 | +xadj = np.hstack([xadjblock for xadjblock in xadj]) |
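| | +# After gathering, ``y`` and ``xadj`` are (M, N) and (K, N) NumPy arrays that |
| | +# can be compared directly with the serial results. |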
| 195 | + |
| 196 | +if rank == 0: |
| 197 | +    y_loc = y_loc.squeeze() |
| 198 | +    xadj_loc = xadj_loc.squeeze() |
| 199 | + |
| 200 | +    if not np.allclose(y, y_loc, rtol=1e-6): |
| 201 | +        print("FORWARD VERIFICATION FAILED") |
| 202 | +        print(f"distributed: {y}") |
| 203 | +        print(f"expected: {y_loc}") |
| 204 | +    else: |
| 205 | +        print("FORWARD VERIFICATION PASSED") |
| 206 | + |
| 207 | +    if not np.allclose(xadj, xadj_loc, rtol=1e-6): |
| 208 | +        print("ADJOINT VERIFICATION FAILED") |
| 209 | +        print(f"distributed: {xadj}") |
| 210 | +        print(f"expected: {xadj_loc}") |
| 211 | +    else: |
| 212 | +        print("ADJOINT VERIFICATION PASSED") |