Skip to content

Commit a9c854e

Browse files
committed
Merge branch '2.10.2.1' into 2.10.2.1_ds_monitoring
2 parents 0f66271 + b0cc06e commit a9c854e

File tree

6 files changed

+19
-6
lines changed

6 files changed

+19
-6
lines changed

playbooks/roles/nvidia_peermem/tasks/common.yml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,27 @@
33
shell:
44
cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU"
55
warn: false
6-
register: shape
6+
register: shape_gpu
77
failed_when: false
88

9-
109
- name: Check if nvidia drivers are installed
1110
shell: cat /sys/module/nvidia/version | wc -l
1211
register: nvidia
13-
when: shape.stdout != ""
14-
12+
when: shape_gpu.stdout != ""
1513

1614
- name: Check if nvidia_peermem module is loaded
1715
shell: lsmod | grep nvidia_peermem | wc -l
1816
register: result
19-
when: shape.stdout != "" and nvidia.stdout == '1'
17+
when: shape_gpu.stdout != "" and nvidia.stdout == '1'
2018

19+
- name: Check ofed version
20+
shell:
21+
cmd: |
22+
/usr/bin/ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F "(" '{print $2}'|cut -c 6-|awk -F "-" '{print $1}'
23+
register: ofed_version_local
24+
when: shape_gpu.stdout != "" and nvidia.stdout == '1'
2125

2226
# Runs `modprobe nvidia_peermem` with privilege escalation. The task only runs
# when the registered results of the earlier checks all hold: shape_gpu.stdout
# is non-empty, nvidia.stdout is '1', result.stdout is not '3', and the locally
# detected OFED version is at least 5.1.
# The OFED version is compared with the `version` test so that a string such as
# "5.4" is compared as a version instead of being cast to an integer; an empty
# or missing value falls back to '0' and therefore does not satisfy the check.
- name: Load nvidia_peermem module
2327
become: true
2428
shell: modprobe nvidia_peermem
25-
when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3'
29+
when: shape_gpu.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and (ofed_version_local.stdout | default('0', true)) is version('5.1', '>=')

samples/gpu/nccl_run_allreduce.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ fi
6868
# final version
6969
mpirun --mca pml ucx \
7070
--bind-to numa \
71+
--mca coll ^hcoll \
7172
-x NCCL_DEBUG=WARN \
7273
-x NCCL_IB_SL=0 \
7374
-x NCCL_IB_TC=41 \

samples/gpu/nccl_run_alltoall.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,10 @@ fi
7373

7474
# Use -x NCCL_MAX_P2P_NCHANNELS=16 until NCCL 2.12 release which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICs
7575
# final version
76+
# you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
7677
mpirun --mca pml ucx \
7778
--bind-to numa \
79+
--mca coll ^hcoll \
7880
-x NCCL_MAX_P2P_NCHANNELS=16 \
7981
-x NCCL_DEBUG=WARN \
8082
-x NCCL_IB_SL=0 \

samples/gpu/qfabv1_nccl_run_allreduce.sbatch

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,10 @@ then
5353
var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16"
5454
fi
5555

56+
# you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
5657
mpirun --mca pml ucx \
5758
--bind-to numa \
59+
--mca coll ^hcoll \
5860
-x NCCL_DEBUG=WARN \
5961
-x NCCL_IB_SL=0 \
6062
-x NCCL_IB_TC=41 \

samples/gpu/qfabv1_nccl_run_allreduce.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,11 @@ then
6666
var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16"
6767
fi
6868

69+
# you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
6970
# final version
7071
mpirun --mca pml ucx \
7172
--bind-to numa \
73+
--mca coll ^hcoll \
7274
-x NCCL_DEBUG=WARN \
7375
-x NCCL_IB_SL=0 \
7476
-x NCCL_IB_TC=41 \

samples/gpu/qfabv1_nccl_run_alltoall.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,10 @@ fi
7878
# Use NCCL_IB_QPS_PER_CONNECTION=4 for QFAB1.0, should get around 15GB/s NCCL Bus BW.
7979
# Use -x NCCL_MAX_P2P_NCHANNELS=16 until NCCL 2.12 release which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICs
8080
# final version
81+
# you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer
8182
mpirun --mca pml ucx \
8283
--bind-to numa \
84+
--mca coll ^hcoll \
8385
-x NCCL_MAX_P2P_NCHANNELS=16 \
8486
-x NCCL_DEBUG=WARN \
8587
-x NCCL_IB_SL=0 \

0 commit comments

Comments
 (0)