Skip to content

Commit c5d2df5

Browse files
authored
UCT/GDA: Check that nvidia peermem driver is loaded. (#10987)
1 parent 87e88f7 commit c5d2df5

File tree

1 file changed

+23
-3
lines changed

1 file changed

+23
-3
lines changed

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 23 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -610,9 +610,10 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
610610
uct_tl_device_resource_t **tl_devices_p,
611611
unsigned *num_tl_devices_p)
612612
{
613-
static int uar_supported = -1;
614-
uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
615-
unsigned num_tl_devices = 0;
613+
static int uar_supported = -1;
614+
static int peermem_loaded = -1;
615+
uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
616+
unsigned num_tl_devices = 0;
616617
uct_tl_device_resource_t *tl_devices;
617618
ucs_status_t status;
618619
CUdevice device;
@@ -657,6 +658,25 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
657658
goto err;
658659
}
659660

661+
/*
662+
* Save the result of the peermem driver check in a static flag to avoid
663+
* printing the diag message for each GPU and MD.
664+
*/
665+
if (peermem_loaded == -1) {
666+
peermem_loaded = !!(md->super.reg_mem_types &
667+
UCS_BIT(UCS_MEMORY_TYPE_CUDA));
668+
if (peermem_loaded == 0) {
669+
ucs_diag("GDAKI not supported, please load "
670+
"Nvidia peermem driver by running "
671+
"\"modprobe nvidia_peermem\"");
672+
}
673+
}
674+
675+
if (peermem_loaded == 0) {
676+
status = UCS_ERR_NO_DEVICE;
677+
goto err;
678+
}
679+
660680
uct_cuda_base_get_sys_dev(device, &dev);
661681
status = ucs_topo_get_distance(dev, md->super.dev.sys_dev, &dist);
662682
if (status != UCS_OK) {

0 commit comments

Comments
 (0)