Skip to content

Commit 6c5d03b

Browse files
Change H100 Parameters for Single Subnet and NCCL 2.20.3
1 parent a5e5ea0 commit 6c5d03b

File tree

7 files changed

+21
-15
lines changed

7 files changed

+21
-15
lines changed
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
NCCL_CROSS_NIC=0
2-
NCCL_SOCKET_NTHREADS=16
1+
NCCL_CROSS_NIC=1
32
NCCL_DEBUG=WARN
43
NCCL_CUMEM_ENABLE=0
54
NCCL_IB_SPLIT_DATA_ON_QPS=0
@@ -8,8 +7,8 @@ NCCL_IB_GID_INDEX=3
87
NCCL_IB_TC=41
98
NCCL_IB_SL=0
109
NCCL_IB_TIMEOUT=22
10+
NCCL_BUFFSIZE=16777216
1111
NCCL_NET_PLUGIN=none
1212
NCCL_SOCKET_IFNAME=eth0
1313
NCCL_IGNORE_CPU_AFFINITY=1
14-
NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
15-
NCCL_TOPO_FILE=/nfs/cluster/H100-topology.xml
14+
NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17
-161 KB
Binary file not shown.
-165 KB
Binary file not shown.

playbooks/roles/nccl-conf/tasks/main.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
- name: copy libnccl-ocituner for OL
3939
become: true
4040
get_url:
41-
url: wget https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-OL
41+
url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-OL
4242
dest: /home/opc/libnccl-ocituner.so.1.0.1
4343
owner: opc
4444
group: privilege
@@ -48,7 +48,7 @@
4848
- name: copy libnccl-ocituner for Ubuntu
4949
become: true
5050
get_url:
51-
url: wget https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-ubuntu
51+
url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-ubuntu
5252
dest: /home/ubuntu/libnccl-ocituner.so.1.0.1
5353
owner: ubuntu
5454
group: privilege

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,7 @@ fi
4343
--bind-to numa \
4444
-npernode 8 \
4545
--mca coll ^hcoll \
46-
-x NCCL_CROSS_NIC=0 \
47-
-x NCCL_SOCKET_NTHREADS=16 \
46+
-x NCCL_CROSS_NIC=1 \
4847
-x NCCL_DEBUG=WARN \
4948
-x NCCL_CUMEM_ENABLE=0 \
5049
-x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
@@ -60,8 +59,11 @@ fi
6059
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
6160
-x RX_QUEUE_LEN=8192 \
6261
-x IB_RX_QUEUE_LEN=8192 \
62+
-x NCCL_BUFFSIZE=16777216 \
6363
-x NCCL_SOCKET_IFNAME=eth0 \
6464
-x NCCL_IGNORE_CPU_AFFINITY=1 \
6565
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
66-
-x NCCL_TOPO_FILE=~/H100-topology.xml \
67-
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
66+
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
67+
68+
# If the NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance
69+
# -x NCCL_TOPO_FILE=~/H100-topology.xml \

samples/gpu/nccl_run_allreduce_H100.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ do
5757
--bind-to numa \
5858
-npernode 8 \
5959
--mca coll ^hcoll \
60-
-x NCCL_CROSS_NIC=0 \
61-
-x NCCL_SOCKET_NTHREADS=16 \
60+
-x NCCL_CROSS_NIC=1 \
6261
-x NCCL_DEBUG=WARN \
6362
-x NCCL_CUMEM_ENABLE=0 \
6463
-x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
@@ -74,6 +73,7 @@ do
7473
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
7574
-x RX_QUEUE_LEN=8192 \
7675
-x IB_RX_QUEUE_LEN=8192 \
76+
-x NCCL_BUFFSIZE=16777216 \
7777
-x NCCL_SOCKET_IFNAME=eth0 \
7878
-x NCCL_IGNORE_CPU_AFFINITY=1 \
7979
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
@@ -82,5 +82,8 @@ do
8282

8383
tail -n 32 $logfile
8484

85+
done
8586

86-
done
87+
88+
# If the NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance
89+
# -x NCCL_TOPO_FILE=~/H100-topology.xml \

samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@ cat $MACHINEFILE
2121

2222
source /etc/os-release
2323
if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
24-
python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
24+
python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
25+
homedirectory=/home/opc
2526
elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
26-
python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
27+
python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
28+
homedirectory=/home/ubuntu
2729
fi
2830

2931

0 commit comments

Comments
 (0)