Skip to content

Commit f826281

Browse files
charlesstoll and sunkuamzn
authored and committed
prov/efa: Implement dmabuf try/fallback logic
feat: Add default dmabuf attempt with fallback for all efa_hmem_ifaces Problem: - Need to make dmabuf usage default going forward - Need fallback mechanism when dmabuf is not supported or fails - Neuron recommends against allocating dmabufs if they are not going to be used Solution: - Modified initial PR from @jiaxiyan at 6aa6708 - Added dmabuf_supported_by_device enum in efa_hmem_info structure in prov/efa/src/efa_hmem.h - Updated dmabuf_supported_by_device detection in each fi_hmem_iface p2p_support function in prov/efa/src/efa_hmem.c - Modified efa_mr_reg_ibv_mr() in prov/efa/src/efa_mr.c to use efa_mr->peer.iface for dmabuf checks - Implemented try-dmabuf-first with fallback to ibv_reg_mr in efa_mr_reg_ibv_mr() - Added environment variable control for dmabuf enable/disable per interface - Assume p2p dmabuf support for all interfaces Co-authored-by: Nick Mazzilli <[email protected]> Signed-off-by: Charles Stoll <[email protected]>
1 parent 02a3c1b commit f826281

File tree

3 files changed

+178
-156
lines changed

3 files changed

+178
-156
lines changed

prov/efa/src/efa_hmem.c

Lines changed: 56 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -143,21 +143,29 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in
143143
}
144144

145145
#if HAVE_EFA_DMABUF_MR
146-
ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
147-
if (ret == FI_SUCCESS) {
148-
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
149-
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
150-
(void)cuda_put_dmabuf_fd(dmabuf_fd);
151-
if (!ibv_mr) {
146+
if (ofi_hmem_is_dmabuf_env_var_enabled(FI_HMEM_CUDA)) {
147+
ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_CUDA, ptr, len, &dmabuf_fd, &dmabuf_offset);
148+
if (ret == FI_SUCCESS) {
149+
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
150+
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
151+
(void)ofi_hmem_put_dmabuf_fd(FI_HMEM_CUDA, dmabuf_fd);
152+
if (!ibv_mr) {
153+
EFA_INFO(FI_LOG_CORE,
154+
"Unable to register CUDA device buffer via dmabuf: %s. "
155+
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
156+
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
157+
} else {
158+
info->dmabuf_supported_by_device = EFA_DMABUF_SUPPORTED;
159+
}
160+
} else {
152161
EFA_INFO(FI_LOG_CORE,
153-
"Unable to register CUDA device buffer via dmabuf: %s. "
154-
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
162+
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
163+
"Fall back to ibv_reg_mr\n", ret);
155164
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
165+
info->dmabuf_supported_by_device = EFA_DMABUF_NOT_SUPPORTED;
156166
}
157167
} else {
158-
EFA_INFO(FI_LOG_CORE,
159-
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
160-
"Fall back to ibv_reg_mr\n", ret);
168+
EFA_INFO(FI_LOG_CORE, "FI_HMEM_CUDA_USE_DMABUF set to false. Not using DMABUF for CUDA.\n");
161169
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
162170
}
163171
#else
@@ -216,21 +224,29 @@ static inline void efa_hmem_info_check_p2p_support_rocr(struct efa_hmem_info *in
216224
}
217225

218226
#if HAVE_EFA_DMABUF_MR
219-
ret = rocr_hmem_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
220-
if (ret == FI_SUCCESS) {
221-
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
222-
len, (uint64_t) ptr, dmabuf_fd, ibv_access);
223-
(void) rocr_hmem_put_dmabuf_fd(dmabuf_fd);
224-
if (!ibv_mr) {
227+
if (ofi_hmem_is_dmabuf_env_var_enabled(FI_HMEM_ROCR)) {
228+
ret = rocr_hmem_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
229+
if (ret == FI_SUCCESS) {
230+
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
231+
len, (uint64_t) ptr, dmabuf_fd, ibv_access);
232+
(void) rocr_hmem_put_dmabuf_fd(dmabuf_fd);
233+
if (!ibv_mr) {
234+
EFA_INFO(FI_LOG_CORE,
235+
"Unable to register ROCr device buffer via dmabuf: %s. "
236+
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
237+
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
238+
} else {
239+
info->dmabuf_supported_by_device = EFA_DMABUF_SUPPORTED;
240+
}
241+
} else {
225242
EFA_INFO(FI_LOG_CORE,
226-
"Unable to register ROCr device buffer via dmabuf: %s. "
227-
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
243+
"Unable to retrieve dmabuf fd of ROCr device buffer: %d. "
244+
"Fall back to ibv_reg_mr\n", ret);
228245
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
246+
info->dmabuf_supported_by_device = EFA_DMABUF_NOT_SUPPORTED;
229247
}
230248
} else {
231-
EFA_INFO(FI_LOG_CORE,
232-
"Unable to retrieve dmabuf fd of ROCr device buffer: %d. "
233-
"Fall back to ibv_reg_mr\n", ret);
249+
EFA_INFO(FI_LOG_CORE, "FI_HMEM_ROCR_USE_DMABUF set to false. Not using DMABUF for ROCr.\n");
234250
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
235251
}
236252
#else
@@ -263,84 +279,6 @@ static inline void efa_hmem_info_check_p2p_support_rocr(struct efa_hmem_info *in
263279
return;
264280
}
265281

266-
static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) {
267-
#if HAVE_NEURON
268-
struct ibv_mr *ibv_mr = NULL;
269-
struct ibv_pd *ibv_pd;
270-
int ibv_access = IBV_ACCESS_LOCAL_WRITE;
271-
void *handle;
272-
void *ptr = NULL;
273-
size_t len = ofi_get_page_size() * 2;
274-
int dmabuf_fd;
275-
uint64_t offset;
276-
int ret;
277-
278-
if (g_efa_selected_device_list[0].device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ) {
279-
ibv_access |= IBV_ACCESS_REMOTE_READ;
280-
}
281-
282-
ptr = neuron_alloc(&handle, len);
283-
/*
284-
* neuron_alloc will fail if application did not call nrt_init,
285-
* which is ok if it's not running neuron workloads. libfabric
286-
* will move on and leave info->initialized as false.
287-
*/
288-
if (!ptr) {
289-
info->initialized = false;
290-
EFA_INFO(FI_LOG_CORE, "Cannot allocate Neuron buffer\n");
291-
return;
292-
}
293-
294-
ibv_pd = ibv_alloc_pd(g_efa_selected_device_list[0].ibv_ctx);
295-
if (!ibv_pd) {
296-
EFA_WARN(FI_LOG_CORE, "failed to allocate ibv_pd: %d", errno);
297-
neuron_free(&handle);
298-
return;
299-
}
300-
301-
#if HAVE_EFA_DMABUF_MR
302-
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
303-
if (ret == FI_SUCCESS) {
304-
ibv_mr = ibv_reg_dmabuf_mr(
305-
ibv_pd, offset,
306-
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
307-
} else if (ret == -FI_EOPNOTSUPP) {
308-
EFA_INFO(FI_LOG_MR,
309-
"Unable to retrieve dmabuf fd of Neuron device buffer, "
310-
"Fall back to ibv_reg_mr\n");
311-
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
312-
}
313-
#else
314-
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
315-
#endif
316-
317-
if (!ibv_mr) {
318-
info->p2p_supported_by_device = false;
319-
/* We do not expect to support Neuron on non p2p systems */
320-
EFA_WARN(FI_LOG_CORE,
321-
"Failed to register Neuron buffer with the EFA device, "
322-
"FI_HMEM transfers that require peer to peer support will fail.\n");
323-
neuron_free(&handle);
324-
(void) ibv_dealloc_pd(ibv_pd);
325-
return;
326-
}
327-
328-
ret = ibv_dereg_mr(ibv_mr);
329-
neuron_free(&handle);
330-
(void) ibv_dealloc_pd(ibv_pd);
331-
if (ret) {
332-
EFA_WARN(FI_LOG_CORE,
333-
"Failed to deregister Neuron buffer: %s\n",
334-
fi_strerror(-ret));
335-
return;
336-
}
337-
338-
info->p2p_supported_by_device = true;
339-
return;
340-
#endif
341-
return;
342-
}
343-
344282
/**
345283
* @brief Initialize the efa_hmem_info state for iface
346284
*
@@ -366,9 +304,26 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface)
366304
}
367305

368306
info->initialized = true;
307+
info->dmabuf_supported_by_device = EFA_DMABUF_ASSUMED;
308+
info->dmabuf_fallback_enabled = false;
309+
310+
if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM ||
311+
iface == FI_HMEM_NEURON) {
312+
/* It is not recommended to allocate neuron buffs this
313+
* early in initialization, so we must skip the explicit
314+
* check to see if p2p will work. Instead, assume it works.
315+
* and set fallback to true
316+
*/
317+
if (iface == FI_HMEM_NEURON)
318+
info->dmabuf_fallback_enabled = true;
369319

370-
if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) {
371320
info->p2p_supported_by_device = true;
321+
322+
if (!ofi_hmem_is_dmabuf_env_var_enabled(iface)) {
323+
info->dmabuf_supported_by_device = EFA_DMABUF_NOT_SUPPORTED;
324+
EFA_INFO(FI_LOG_CORE, "%s DMABUF disabled by environment variable\n",
325+
fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
326+
}
372327
} else if (ofi_hmem_p2p_disabled()) {
373328
info->p2p_supported_by_device = false;
374329
} else {
@@ -379,9 +334,6 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface)
379334
case FI_HMEM_ROCR:
380335
efa_hmem_info_check_p2p_support_rocr(info);
381336
break;
382-
case FI_HMEM_NEURON:
383-
efa_hmem_info_check_p2p_support_neuron(info);
384-
break;
385337
default:
386338
break;
387339
}

prov/efa/src/efa_hmem.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,34 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = {
3131
FI_HMEM_SYNAPSEAI
3232
};
3333

34+
enum efa_dmabuf_support {
35+
EFA_DMABUF_NOT_SUPPORTED,
36+
EFA_DMABUF_SUPPORTED,
37+
EFA_DMABUF_ASSUMED
38+
};
39+
3440
struct efa_hmem_info {
3541
bool initialized; /* do we support it at all */
3642
bool p2p_supported_by_device; /* do we support p2p with this device */
43+
bool dmabuf_fallback_enabled;
44+
enum efa_dmabuf_support dmabuf_supported_by_device; /* do we support dmabuf with this device */
3745

3846
size_t max_medium_msg_size;
3947
size_t runt_size;
4048
size_t min_read_msg_size;
4149
size_t min_read_write_size;
4250
};
4351

52+
#define DMABUF_IS_SUPPORTED(info) \
53+
((info)->dmabuf_supported_by_device == EFA_DMABUF_SUPPORTED || \
54+
(info)->dmabuf_supported_by_device == EFA_DMABUF_ASSUMED)
55+
56+
#define DMABUF_IS_NOT_SUPPORTED(info) \
57+
((info)->dmabuf_supported_by_device == EFA_DMABUF_NOT_SUPPORTED)
58+
59+
#define DMABUF_IS_ASSUMED(info) \
60+
((info)->dmabuf_supported_by_device == EFA_DMABUF_ASSUMED)
61+
4462
extern struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX];
4563

4664
int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version);

0 commit comments

Comments (0)