@@ -610,9 +610,10 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
610610 uct_tl_device_resource_t * * tl_devices_p ,
611611 unsigned * num_tl_devices_p )
612612{
613- static int uar_supported = -1 ;
614- uct_ib_mlx5_md_t * md = ucs_derived_of (tl_md , uct_ib_mlx5_md_t );
615- unsigned num_tl_devices = 0 ;
613+ static int uar_supported = -1 ;
614+ static int peermem_loaded = -1 ;
615+ uct_ib_mlx5_md_t * md = ucs_derived_of (tl_md , uct_ib_mlx5_md_t );
616+ unsigned num_tl_devices = 0 ;
616617 uct_tl_device_resource_t * tl_devices ;
617618 ucs_status_t status ;
618619 CUdevice device ;
@@ -657,6 +658,25 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
657658 goto err ;
658659 }
659660
661+ /*
662+ * Cache the result of the peermem driver check in a static flag, so that
663+ * the diag message is printed only once instead of for each GPU and MD.
664+ */
665+ if (peermem_loaded == -1 ) {
666+ peermem_loaded = !!(md -> super .reg_mem_types &
667+ UCS_BIT (UCS_MEMORY_TYPE_CUDA ));
668+ if (peermem_loaded == 0 ) {
669+ ucs_diag ("GDAKI not supported, please load "
670+ "Nvidia peermem driver by running "
671+ "\"modprobe nvidia_peermem\"" );
672+ }
673+ }
674+
675+ if (peermem_loaded == 0 ) {
676+ status = UCS_ERR_NO_DEVICE ;
677+ goto err ;
678+ }
679+
660680 uct_cuda_base_get_sys_dev (device , & dev );
661681 status = ucs_topo_get_distance (dev , md -> super .dev .sys_dev , & dist );
662682 if (status != UCS_OK ) {
0 commit comments