Skip to content

Commit b0cc06e

Browse files
Merge pull request #115 from oci-hpc/2.10.2.1_ds_fixes
add --mca coll ^hcoll by default
2 parents 2738190 + e6d22b6 commit b0cc06e

File tree

5 files changed

+9
-0
lines changed

5 files changed

+9
-0
lines changed

samples/gpu/nccl_run_allreduce.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ fi
6868
# final version
6969
mpirun --mca pml ucx \
7070
--bind-to numa \
71+
--mca coll ^hcoll \
7172
-x NCCL_DEBUG=WARN \
7273
-x NCCL_IB_SL=0 \
7374
-x NCCL_IB_TC=41 \

samples/gpu/nccl_run_alltoall.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,10 @@ fi
7373

7474
# Use -x NCCL_MAX_P2P_NCHANNELS=16 until the NCCL 2.12 release, which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICs
7575
# final version
76+
# --mca coll ^hcoll (exclude the hcoll collective component) is required when using an image that has OFED 5.4 or newer
7677
mpirun --mca pml ucx \
7778
--bind-to numa \
79+
--mca coll ^hcoll \
7880
-x NCCL_MAX_P2P_NCHANNELS=16 \
7981
-x NCCL_DEBUG=WARN \
8082
-x NCCL_IB_SL=0 \

samples/gpu/qfabv1_nccl_run_allreduce.sbatch

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,10 @@ then
5353
var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16"
5454
fi
5555

56+
# --mca coll ^hcoll (exclude the hcoll collective component) is required when using an image that has OFED 5.4 or newer
5657
mpirun --mca pml ucx \
5758
--bind-to numa \
59+
--mca coll ^hcoll \
5860
-x NCCL_DEBUG=WARN \
5961
-x NCCL_IB_SL=0 \
6062
-x NCCL_IB_TC=41 \

samples/gpu/qfabv1_nccl_run_allreduce.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,11 @@ then
6666
var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16"
6767
fi
6868

69+
# --mca coll ^hcoll (exclude the hcoll collective component) is required when using an image that has OFED 5.4 or newer
6970
# final version
7071
mpirun --mca pml ucx \
7172
--bind-to numa \
73+
--mca coll ^hcoll \
7274
-x NCCL_DEBUG=WARN \
7375
-x NCCL_IB_SL=0 \
7476
-x NCCL_IB_TC=41 \

samples/gpu/qfabv1_nccl_run_alltoall.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,10 @@ fi
7878
# Use NCCL_IB_QPS_PER_CONNECTION=4 for QFAB1.0; this should give around 15 GB/s NCCL bus bandwidth.
7979
# Use -x NCCL_MAX_P2P_NCHANNELS=16 until the NCCL 2.12 release, which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICs
8080
# final version
81+
# --mca coll ^hcoll (exclude the hcoll collective component) is required when using an image that has OFED 5.4 or newer
8182
mpirun --mca pml ucx \
8283
--bind-to numa \
84+
--mca coll ^hcoll \
8385
-x NCCL_MAX_P2P_NCHANNELS=16 \
8486
-x NCCL_DEBUG=WARN \
8587
-x NCCL_IB_SL=0 \

0 commit comments

Comments
 (0)