|
// TODO: explicitly obtain from ibverbs config, for now assume 32.
// Maximum number of scatter/gather entries per work request.
constexpr int SGE_MAX = 32;

// Maximum size for a single MR: 2GB.
constexpr size_t MAX_MR_SIZE = 2ULL * 1024 * 1024 * 1024;

// MR size must be a multiple of 2MB
// (presumably a dmabuf/cuMemGetHandleForAddressRange granularity
// requirement — TODO confirm).
constexpr size_t MR_ALIGNMENT = 2ULL * 1024 * 1024;

// The chunked-registration loop checks each chunk for 2MB alignment;
// guarantee at compile time that every full-size chunk passes, so only
// the final (remainder) chunk can ever fail that check.
static_assert(
    MAX_MR_SIZE % MR_ALIGNMENT == 0,
    "MAX_MR_SIZE must be a multiple of MR_ALIGNMENT");
| 26 | + |
21 | 27 | // Structure to hold segment information
|
22 | 28 | struct SegmentInfo {
|
23 | 29 | size_t phys_address;
|
@@ -190,10 +196,7 @@ int bind_mrs(
|
190 | 196 |
|
191 | 197 | qpx->wr_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
|
192 | 198 | ibv_wr_start(qpx);
|
193 |
| - struct mlx5dv_mkey_conf_attr mkey_cattr = {}; |
194 |
| - mlx5dv_wr_mkey_configure(mqpx, seg.mkey, 2, &mkey_cattr); |
195 |
| - mlx5dv_wr_set_mkey_access_flags(mqpx, access_flags); |
196 |
| - mlx5dv_wr_set_mkey_layout_list(mqpx, mrs_cnt, sgl.data()); |
| 199 | + mlx5dv_wr_mr_list(mqpx, seg.mkey, access_flags, mrs_cnt, sgl.data()); |
197 | 200 | int ret = ibv_wr_complete(qpx);
|
198 | 201 |
|
199 | 202 | if (ret != 0) {
|
@@ -276,45 +279,56 @@ int register_segments(struct ibv_pd* pd, struct ibv_qp* qp) {
|
276 | 279 | if (seg.mr_size != seg.phys_size) {
|
277 | 280 | auto mr_start = seg.phys_address + seg.mr_size;
|
278 | 281 | auto mr_end = seg.phys_address + seg.phys_size;
|
279 |
| - auto mr_size = mr_end - mr_start; |
| 282 | + auto remaining_size = mr_end - mr_start; |
| 283 | + |
| 284 | + // Register in chunks of MAX_MR_SIZE |
| 285 | + size_t current_offset = 0; |
| 286 | + while (current_offset < remaining_size) { |
| 287 | + size_t chunk_size = |
| 288 | + std::min(remaining_size - current_offset, MAX_MR_SIZE); |
| 289 | + auto chunk_start = mr_start + current_offset; |
| 290 | + |
| 291 | + // Validate that chunk_size is a multiple of 2MB |
| 292 | + if (chunk_size % MR_ALIGNMENT != 0) { |
| 293 | + return RDMAXCEL_MR_REGISTRATION_FAILED; |
| 294 | + } |
280 | 295 |
|
281 |
| - // TODO: resolve 4GiB limit |
282 |
| - if (seg.phys_size > (1ULL << 32)) { |
283 |
| - return RDMAXCEL_MKEY_REG_LIMIT; |
284 |
| - } |
285 |
| - int fd = -1; |
286 |
| - CUresult cu_result = cuMemGetHandleForAddressRange( |
287 |
| - &fd, |
288 |
| - static_cast<CUdeviceptr>(mr_start), |
289 |
| - mr_size, |
290 |
| - CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, |
291 |
| - 0); |
292 |
| - |
293 |
| - if (cu_result != CUDA_SUCCESS || fd < 0) { |
294 |
| - return RDMAXCEL_DMABUF_HANDLE_FAILED; // Failed to get dmabuf handle |
295 |
| - } |
| 296 | + int fd = -1; |
| 297 | + CUresult cu_result = cuMemGetHandleForAddressRange( |
| 298 | + &fd, |
| 299 | + static_cast<CUdeviceptr>(chunk_start), |
| 300 | + chunk_size, |
| 301 | + CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, |
| 302 | + 0); |
| 303 | + |
| 304 | + if (cu_result != CUDA_SUCCESS || fd < 0) { |
| 305 | + return RDMAXCEL_DMABUF_HANDLE_FAILED; // Failed to get dmabuf handle |
| 306 | + } |
| 307 | + |
| 308 | + // Register the dmabuf with fd, address is always 0. |
| 309 | + auto mr = ibv_reg_dmabuf_mr(pd, 0, chunk_size, 0, fd, access_flags); |
| 310 | + close(fd); |
296 | 311 |
|
297 |
| - // Register the dmabuf with fd, address is always 0. |
298 |
| - auto mr = ibv_reg_dmabuf_mr(pd, 0, mr_size, 0, fd, access_flags); |
299 |
| - close(fd); |
| 312 | + if (!mr) { |
| 313 | + return RDMAXCEL_MR_REG_FAILED; // MR registration failed |
| 314 | + } |
300 | 315 |
|
301 |
| - if (!mr) { |
302 |
| - return RDMAXCEL_MR_REG_FAILED; // MR registration failed |
| 316 | + seg.mrs.push_back(mr); |
| 317 | + current_offset += chunk_size; |
| 318 | + |
| 319 | + // If we have too many MRs, compact them into a single MR |
| 320 | + if (seg.mrs.size() > SGE_MAX) { |
| 321 | + // TODO: find a safe way to compact with low performance cost. |
| 322 | + // For now return the MAX_SGE error instead of compacting via: |
| 322 | + // auto err = compact_mrs(pd, seg, access_flags); |
| 323 | + // if (err != 0) { |
| 324 | + // return err; |
| 325 | + // } |
| 326 | + return RDMAXCEL_MKEY_REG_LIMIT; |
| 327 | + } |
303 | 328 | }
|
304 | 329 |
|
305 |
| - seg.mrs.push_back(mr); |
306 | 330 | seg.mr_size = seg.phys_size;
|
307 | 331 |
|
308 |
| - // If we have too many MRs, compact them into a single MR |
309 |
| - if (seg.mrs.size() > SGE_MAX) { |
310 |
| - // TODO: find a safe way to compact with low performance cost. |
311 |
| - // return MAX_SGE error auto err = compact_mrs(pd, seg, access_flags); |
312 |
| - // if (err != 0) { |
313 |
| - // return err; |
314 |
| - // } |
315 |
| - return RDMAXCEL_MKEY_REG_LIMIT; |
316 |
| - } |
317 |
| - |
318 | 332 | // Create vector of GPU addresses for bind_mrs
|
319 | 333 | auto err = bind_mrs(pd, qp, access_flags, seg);
|
320 | 334 | if (err != 0) {
|
|
0 commit comments