     "nccl_bcast",
     "nccl_asarray",
     "nccl_send",
-    "nccl_recv"
+    "nccl_recv",
+    "_prepare_nccl_allgather_inputs",
+    "_unroll_nccl_allgather_recv"
 ]
 
 from enum import IntEnum
@@ -251,61 +253,109 @@ def nccl_bcast(nccl_comm, local_array, index, value) -> None:
     )
 
 
-def nccl_asarray(nccl_comm, local_array, local_shapes, axis) -> cp.ndarray:
-    """Global view of the array
+def _prepare_nccl_allgather_inputs(send_buf, send_buf_shapes) -> tuple[cp.ndarray, cp.ndarray]:
+    """Prepare the send_buf and recv_buf for NCCL allgather (nccl_allgather)
 
-    Gather all local GPU arrays into a single global array via NCCL all-gather.
+    NCCL's allGather requires the sending buffer to have the same size for every device.
+    Therefore, padding is required when the array is not evenly partitioned across
+    all the ranks. The padding is applied such that the sending buffer has the size of
+    each dimension corresponding to the maximum possible size of that dimension.
+
+    The receive buffer (recv_buf) will have the size n_rank * send_buf.size.
 
     Parameters
     ----------
-    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
-        The NCCL communicator used for collective communication.
-    local_array : :obj:`cupy.ndarray`
-        The local array on the current GPU.
-    local_shapes : :obj:`list`
-        A list of shapes for each GPU local array (used to trim padding).
-    axis : :obj:`int`
-        The axis along which to concatenate the gathered arrays.
+    send_buf : :obj:`cupy.ndarray` or array-like
+        The data buffer from the local GPU to be sent for allgather.
+    send_buf_shapes : :obj:`list`
+        A list of shapes for each GPU send_buf (used to calculate the padding size).
 
     Returns
     -------
-    final_array : :obj:`cupy.ndarray`
-        Global array gathered from all GPUs and concatenated along `axis`.
+    (send_buf, recv_buf) : :obj:`tuple`
+        A tuple of (send_buf, recv_buf) with an appropriate size, shape and dtype for NCCL allgather.
 
-    Notes
-    -----
-    NCCL's allGather requires the sending buffer to have the same size for every device.
-    Therefore, the padding is required when the array is not evenly partitioned across
-    all the ranks. The padding is applied such that the sending buffer has the size of
-    each dimension corresponding to the max possible size of that dimension.
     """
-    sizes_each_dim = list(zip(*local_shapes))
-
+    sizes_each_dim = list(zip(*send_buf_shapes))
     send_shape = tuple(map(max, sizes_each_dim))
     pad_size = [
-        (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, local_array.shape)
+        (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, send_buf.shape)
     ]
 
     send_buf = cp.pad(
-        local_array, pad_size, mode="constant", constant_values=0
+        send_buf, pad_size, mode="constant", constant_values=0
     )
 
     # NCCL recommends to use one MPI Process per GPU and so size of receiving buffer can be inferred
-    ndev = len(local_shapes)
+    ndev = len(send_buf_shapes)
     recv_buf = cp.zeros(ndev * send_buf.size, dtype=send_buf.dtype)
-    nccl_allgather(nccl_comm, send_buf, recv_buf)
 
+    return (send_buf, recv_buf)
+
+
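As a quick illustration of the padding logic above, here is a minimal sketch of what `_prepare_nccl_allgather_inputs` computes for the rank holding the smaller chunk of an unevenly split array. The two-rank shapes are invented for illustration, not taken from the PR:

```python
# Minimal sketch of the padding step, assuming two ranks sharing a (5, 3)
# array split along axis 0; shapes are illustrative.
import cupy as cp

local_array = cp.ones((2, 3))            # this rank's chunk (the smaller one)
local_shapes = [(3, 3), (2, 3)]          # chunk shapes across both ranks

# Common send shape: the max size along each dimension -> (3, 3)
send_shape = tuple(map(max, zip(*local_shapes)))
pad_size = [(0, s - l) for s, l in zip(send_shape, local_array.shape)]
send_buf = cp.pad(local_array, pad_size, mode="constant", constant_values=0)

print(send_buf.shape)  # (3, 3): one zero row appended to even out the buffer
recv_buf = cp.zeros(len(local_shapes) * send_buf.size, dtype=send_buf.dtype)
print(recv_buf.size)   # 18 = 2 ranks * 9 elements each
```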
+def _unroll_nccl_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes) -> list:
+    """Remove the padded elements in recv_buf, extract an individual array from each device, and return them as a list of arrays
+
+    Each GPU may send an array with a different shape, so the return type has to be a list of arrays
+    instead of a single concatenated array.
+
+    Parameters
+    ----------
+    recv_buf : :obj:`cupy.ndarray` or array-like
+        The data buffer returned from the nccl_allgather call.
+    padded_send_buf_shape : :obj:`tuple` of :obj:`int`
+        The shape of send_buf after padding, as used in nccl_allgather.
+    send_buf_shapes : :obj:`list`
+        A list of original shapes for each GPU send_buf prior to padding.
+
+    Returns
+    -------
+    chunks : :obj:`list`
+        A list of :obj:`cupy.ndarray` from each GPU with the padded elements removed.
+    """
+
+    ndev = len(send_buf_shapes)
     # extract an individual array from each device
-    chunk_size = np.prod(send_shape)
+    chunk_size = np.prod(padded_send_buf_shape)
     chunks = [
         recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(ndev)
     ]
 
     # Remove padding from each array: the padded value may appear somewhere
     # in the middle of the flat array and thus the reshape and slicing for each dimension is required
     for i in range(ndev):
-        slicing = tuple(slice(0, end) for end in local_shapes[i])
-        chunks[i] = chunks[i].reshape(send_shape)[slicing]
+        slicing = tuple(slice(0, end) for end in send_buf_shapes[i])
+        chunks[i] = chunks[i].reshape(padded_send_buf_shape)[slicing]
+
+    return chunks
+
+
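Continuing the same illustrative two-rank example, the unroll step reverses the padding: each flat chunk is reshaped to the padded shape and then sliced back to its original shape. A hedged sketch follows, where the gathered buffer is faked locally rather than produced by a real allgather:

```python
# Sketch of the unroll step on a locally faked gathered buffer:
# rank 0 sent a full (3, 3) of ones, rank 1 sent a (2, 3) of twos padded to (3, 3).
import cupy as cp
import numpy as np

send_buf_shapes = [(3, 3), (2, 3)]       # original shapes before padding
padded_send_buf_shape = (3, 3)           # common shape after padding

chunk0 = cp.ones((3, 3))
chunk1 = cp.pad(2 * cp.ones((2, 3)), [(0, 1), (0, 0)])  # zero row of padding
recv_buf = cp.concatenate([chunk0.ravel(), chunk1.ravel()])

chunk_size = np.prod(padded_send_buf_shape)
chunks = [recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(2)]
# Reshape each chunk to the padded shape, then slice away the padding
chunks = [
    c.reshape(padded_send_buf_shape)[tuple(slice(0, e) for e in s)]
    for c, s in zip(chunks, send_buf_shapes)
]
print([c.shape for c in chunks])  # [(3, 3), (2, 3)]: padding removed
```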
+def nccl_asarray(nccl_comm, local_array, local_shapes, axis) -> cp.ndarray:
+    """Global view of the array
+
+    Gather all local GPU arrays into a single global array via NCCL all-gather.
+
+    Parameters
+    ----------
+    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        The NCCL communicator used for collective communication.
+    local_array : :obj:`cupy.ndarray`
+        The local array on the current GPU.
+    local_shapes : :obj:`list`
+        A list of shapes for each GPU local array (used to trim padding).
+    axis : :obj:`int`
+        The axis along which to concatenate the gathered arrays.
+
+    Returns
+    -------
+    final_array : :obj:`cupy.ndarray`
+        Global array gathered from all GPUs and concatenated along `axis`.
+    """
+
+    send_buf, recv_buf = _prepare_nccl_allgather_inputs(local_array, local_shapes)
+    nccl_allgather(nccl_comm, send_buf, recv_buf)
+    chunks = _unroll_nccl_allgather_recv(recv_buf, send_buf.shape, local_shapes)
+
     # combine back to single global array
     return cp.concatenate(chunks, axis=axis)
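For context, here is a hypothetical end-to-end usage of the refactored `nccl_asarray`, assuming one MPI process per GPU and that the helpers in this diff are importable. The communicator setup uses CuPy's standard NCCL bindings (`cupy.cuda.nccl`) and is illustrative boilerplate, not part of this PR:

```python
# Hypothetical usage: run with e.g. `mpiexec -n 2 python example.py`.
from mpi4py import MPI
import cupy as cp
from cupy.cuda import nccl

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
cp.cuda.Device(rank).use()               # one MPI process per GPU

# Build the NCCL communicator (illustrative setup, not from this PR)
uid = comm.bcast(nccl.get_unique_id() if rank == 0 else None, root=0)
nccl_comm = nccl.NcclCommunicator(size, uid, rank)

# Uneven split of a (5, 3) global array across 2 ranks: 3 rows vs 2 rows
local_array = cp.full((3 if rank == 0 else 2, 3), rank, dtype=cp.float32)
local_shapes = comm.allgather(local_array.shape)

global_array = nccl_asarray(nccl_comm, local_array, local_shapes, axis=0)
print(global_array.shape)                # (5, 3) on every rank
```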