Merge pull request #115 from oci-hpc/2.10.2.1_ds_fixes

arnaudfroidmont · web-flow · commit b0cc06e0d777 · 2023-06-09T10:41:28.000-06:00
add --mca coll ^hcoll by default
diff --git a/samples/gpu/nccl_run_allreduce.sh b/samples/gpu/nccl_run_allreduce.sh
@@ -68,6 +68,7 @@ fi
   # final version
   mpirun --mca pml ucx \
   --bind-to numa \
+  --mca coll ^hcoll \
   -x NCCL_DEBUG=WARN \
   -x NCCL_IB_SL=0 \
   -x NCCL_IB_TC=41 \
diff --git a/samples/gpu/nccl_run_alltoall.sh b/samples/gpu/nccl_run_alltoall.sh
@@ -73,8 +73,10 @@ fi
 
   # Use  -x NCCL_MAX_P2P_NCHANNELS=16 until NCCL 2.12 release which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICss
   # final version
+  # you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
   mpirun --mca pml ucx \
   --bind-to numa \
+  --mca coll ^hcoll \
   -x NCCL_MAX_P2P_NCHANNELS=16 \
   -x NCCL_DEBUG=WARN \
   -x NCCL_IB_SL=0 \
diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch
@@ -53,8 +53,10 @@ then
   var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16"
 fi
 
+  # you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
   mpirun --mca pml ucx \
   --bind-to numa \
+  --mca coll ^hcoll \
   -x NCCL_DEBUG=WARN \
   -x NCCL_IB_SL=0 \
   -x NCCL_IB_TC=41 \
diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sh b/samples/gpu/qfabv1_nccl_run_allreduce.sh
@@ -66,9 +66,11 @@ then
   var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16"
 fi
 
+  # you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
   # final version
   mpirun --mca pml ucx \
   --bind-to numa \
+  --mca coll ^hcoll \
   -x NCCL_DEBUG=WARN \
   -x NCCL_IB_SL=0 \
   -x NCCL_IB_TC=41 \
diff --git a/samples/gpu/qfabv1_nccl_run_alltoall.sh b/samples/gpu/qfabv1_nccl_run_alltoall.sh
@@ -78,8 +78,10 @@ fi
   # Use NCCL_IB_QPS_PER_CONNECTION=4 for QFAB1.0, should get around 15GB/s NCCL Bus BW. 
   # Use  -x NCCL_MAX_P2P_NCHANNELS=16 until NCCL 2.12 release which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICss
   # final version
+  # you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
   mpirun --mca pml ucx \
   --bind-to numa \
+  --mca coll ^hcoll \
   -x NCCL_MAX_P2P_NCHANNELS=16 \
   -x NCCL_DEBUG=WARN \
   -x NCCL_IB_SL=0 \