
Commit f22982d

dstaay-fb authored and meta-codesync[bot] committed
Increase shared MR key to support 64 GB segments (#1468)
Summary: Pull Request resolved: #1468

To leverage the CUDA caching allocator and recycle MRs so that RdmaBuffer registration stays cheap, we use a lightly documented mlx5dv API for MR keys (mlx5dv_wr_mr_list). In practice, this API fails when individual MRs are large. Looking through research presentations, one author highlights a per-MR limit of 2GB in this system (https://ucfconsortium.org/wp-content/uploads/2024/12/2024_2_Cross-GVMI-UMR-Memory-Key-Pool-Optimization.pdf). With this insight, we can raise the supported segment size from 4GB to 2GB x 32 = 64GB, which improves utilization on 16GiB messages from 87% to 99%.

Load test:

buck run //monarch/python/tests:rdma_load_test -- --device cuda:0 cuda:1 --operation write --iterations 100 --size 16000 --expandable-segments true

Reviewed By: nblintao

Differential Revision: D84166847

fbshipit-source-id: 12a1bf9816acf863dbb6e7531e89b11241919b80
1 parent e8e6268 commit f22982d
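
For a rough sense of the arithmetic behind the new ceiling described in the summary, here is a small standalone sketch. The two constants mirror the diff below; everything else (the example segment size, the printout) is illustrative only and not part of the commit:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Constants mirrored from the diff below.
  const size_t MAX_MR_SIZE = 2ULL * 1024 * 1024 * 1024; // 2GB per MR
  const int SGE_MAX = 32;                               // max MRs bound into one mkey

  // Largest segment one mkey can now cover: 32 x 2GB = 64GB.
  size_t ceiling_gb =
      static_cast<size_t>(SGE_MAX) * MAX_MR_SIZE / (1024ULL * 1024 * 1024);

  // Example: a 16GiB segment splits into 8 chunks, well under SGE_MAX.
  size_t segment = 16ULL * 1024 * 1024 * 1024;
  size_t chunks = (segment + MAX_MR_SIZE - 1) / MAX_MR_SIZE;

  std::printf("ceiling: %zu GB, 16GiB segment -> %zu chunks\n", ceiling_gb, chunks);
  return 0;
}
```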

File tree

1 file changed: +50 −36 lines

rdmaxcel-sys/src/rdmaxcel.cpp

Lines changed: 50 additions & 36 deletions
@@ -18,6 +18,12 @@
 // TODO: explicitly obtain from ibverbs config, for now assume 32
 const int SGE_MAX = 32;
 
+// Maximum size for a single MR: 2GB
+const size_t MAX_MR_SIZE = 2ULL * 1024 * 1024 * 1024;
+
+// MR size must be a multiple of 2MB
+const size_t MR_ALIGNMENT = 2ULL * 1024 * 1024;
+
 // Structure to hold segment information
 struct SegmentInfo {
   size_t phys_address;
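
As a quick sanity check on the two new constants (a sketch of mine, not part of the commit): a full-size 2GB chunk is itself a multiple of 2MB, so only a segment's trailing partial chunk can ever trip the alignment check introduced further down.

```cpp
#include <cstddef>

// Mirror of the constants added above, with a compile-time consistency check.
constexpr size_t MAX_MR_SIZE = 2ULL * 1024 * 1024 * 1024;
constexpr size_t MR_ALIGNMENT = 2ULL * 1024 * 1024;
static_assert(MAX_MR_SIZE % MR_ALIGNMENT == 0,
              "a full-size chunk always passes the 2MB alignment check");
```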
@@ -190,10 +196,7 @@ int bind_mrs(
 
   qpx->wr_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
   ibv_wr_start(qpx);
-  struct mlx5dv_mkey_conf_attr mkey_cattr = {};
-  mlx5dv_wr_mkey_configure(mqpx, seg.mkey, 2, &mkey_cattr);
-  mlx5dv_wr_set_mkey_access_flags(mqpx, access_flags);
-  mlx5dv_wr_set_mkey_layout_list(mqpx, mrs_cnt, sgl.data());
+  mlx5dv_wr_mr_list(mqpx, seg.mkey, access_flags, mrs_cnt, sgl.data());
   int ret = ibv_wr_complete(qpx);
 
   if (ret != 0) {
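
For context on the one-line replacement above: mlx5dv_wr_mr_list binds a list of ibv_sge entries, one per chunk MR, into a single mkey. Below is a minimal sketch of how such a list could be assembled; the helper name and the assumption that the chunk MRs cover the segment back-to-back starting at its GPU address are mine, not taken from this file:

```cpp
#include <infiniband/verbs.h>

#include <cstdint>
#include <vector>

// Hypothetical helper (not in the diff): build the sge list that a later
// mlx5dv_wr_mr_list() call binds into one mkey. Assumes the chunk MRs cover
// the segment contiguously starting at gpu_addr.
std::vector<ibv_sge> build_mr_sge_list(
    uint64_t gpu_addr,
    const std::vector<ibv_mr*>& mrs) {
  std::vector<ibv_sge> sgl;
  sgl.reserve(mrs.size());
  uint64_t addr = gpu_addr;
  for (ibv_mr* mr : mrs) {
    ibv_sge sge{};
    sge.addr = addr;                                 // GPU VA covered by this chunk
    sge.length = static_cast<uint32_t>(mr->length);  // <= 2GB, fits in uint32_t
    sge.lkey = mr->lkey;
    sgl.push_back(sge);
    addr += mr->length;
  }
  return sgl;
}

// The bind itself then stays as in the diff:
//   ibv_wr_start(qpx);
//   mlx5dv_wr_mr_list(mqpx, seg.mkey, access_flags, mrs_cnt, sgl.data());
//   ibv_wr_complete(qpx);
```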
@@ -276,45 +279,56 @@ int register_segments(struct ibv_pd* pd, struct ibv_qp* qp) {
     if (seg.mr_size != seg.phys_size) {
       auto mr_start = seg.phys_address + seg.mr_size;
       auto mr_end = seg.phys_address + seg.phys_size;
-      auto mr_size = mr_end - mr_start;
+      auto remaining_size = mr_end - mr_start;
+
+      // Register in chunks of MAX_MR_SIZE
+      size_t current_offset = 0;
+      while (current_offset < remaining_size) {
+        size_t chunk_size =
+            std::min(remaining_size - current_offset, MAX_MR_SIZE);
+        auto chunk_start = mr_start + current_offset;
+
+        // Validate that chunk_size is a multiple of 2MB
+        if (chunk_size % MR_ALIGNMENT != 0) {
+          return RDMAXCEL_MR_REGISTRATION_FAILED;
+        }
 
-      // TODO: resolve 4GiB limit
-      if (seg.phys_size > (1ULL << 32)) {
-        return RDMAXCEL_MKEY_REG_LIMIT;
-      }
-      int fd = -1;
-      CUresult cu_result = cuMemGetHandleForAddressRange(
-          &fd,
-          static_cast<CUdeviceptr>(mr_start),
-          mr_size,
-          CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
-          0);
-
-      if (cu_result != CUDA_SUCCESS || fd < 0) {
-        return RDMAXCEL_DMABUF_HANDLE_FAILED; // Failed to get dmabuf handle
-      }
+        int fd = -1;
+        CUresult cu_result = cuMemGetHandleForAddressRange(
+            &fd,
+            static_cast<CUdeviceptr>(chunk_start),
+            chunk_size,
+            CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+            0);
+
+        if (cu_result != CUDA_SUCCESS || fd < 0) {
+          return RDMAXCEL_DMABUF_HANDLE_FAILED; // Failed to get dmabuf handle
+        }
+
+        // Register the dmabuf with fd, address is always 0.
+        auto mr = ibv_reg_dmabuf_mr(pd, 0, chunk_size, 0, fd, access_flags);
+        close(fd);
 
-      // Register the dmabuf with fd, address is always 0.
-      auto mr = ibv_reg_dmabuf_mr(pd, 0, mr_size, 0, fd, access_flags);
-      close(fd);
+        if (!mr) {
+          return RDMAXCEL_MR_REG_FAILED; // MR registration failed
+        }
 
-      if (!mr) {
-        return RDMAXCEL_MR_REG_FAILED; // MR registration failed
+        seg.mrs.push_back(mr);
+        current_offset += chunk_size;
+
+        // If we have too many MRs, compact them into a single MR
+        if (seg.mrs.size() > SGE_MAX) {
+          // TODO: find a safe way to compact with low performance cost.
+          // return MAX_SGE error auto err = compact_mrs(pd, seg, access_flags);
+          // if (err != 0) {
+          //   return err;
+          // }
+          return RDMAXCEL_MKEY_REG_LIMIT;
+        }
       }
 
-      seg.mrs.push_back(mr);
       seg.mr_size = seg.phys_size;
 
-      // If we have too many MRs, compact them into a single MR
-      if (seg.mrs.size() > SGE_MAX) {
-        // TODO: find a safe way to compact with low performance cost.
-        // return MAX_SGE error auto err = compact_mrs(pd, seg, access_flags);
-        // if (err != 0) {
-        //   return err;
-        // }
-        return RDMAXCEL_MKEY_REG_LIMIT;
-      }
-
       // Create vector of GPU addresses for bind_mrs
       auto err = bind_mrs(pd, qp, access_flags, seg);
       if (err != 0) {
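
Finally, a host-only sketch of the new chunking loop's size math, stripped of the CUDA and ibverbs calls so it can be run in isolation. The 16GiB example size, and the assumption that physical segment sizes arrive from the allocator as 2MB multiples, are mine, not claims from the commit:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t MAX_MR_SIZE = 2ULL * 1024 * 1024 * 1024; // 2GB per chunk
  const size_t MR_ALIGNMENT = 2ULL * 1024 * 1024;       // chunks must be 2MB multiples
  const int SGE_MAX = 32;                               // per-mkey MR limit

  size_t remaining_size = 16ULL * 1024 * 1024 * 1024;   // e.g. 16GiB left to register
  size_t current_offset = 0;
  int chunks = 0;

  while (current_offset < remaining_size) {
    size_t chunk_size = std::min(remaining_size - current_offset, MAX_MR_SIZE);
    if (chunk_size % MR_ALIGNMENT != 0) {
      std::printf("chunk %d not 2MB-aligned: registration would fail\n", chunks);
      return 1;
    }
    // The real loop registers a dmabuf MR for this range and pushes it onto seg.mrs.
    current_offset += chunk_size;
    if (++chunks > SGE_MAX) {
      std::printf("too many chunks: mkey limit hit\n");
      return 1;
    }
  }
  std::printf("split into %d chunk(s)\n", chunks); // prints 8 for the 16GiB example
  return 0;
}
```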
