Skip to content

Commit c929c9b

Browse files
Change default H100 values
1 parent 6bdf4d4 commit c929c9b

File tree

2 files changed

+21
-9
lines changed

2 files changed

+21
-9
lines changed

samples/gpu/nccl_run_allreduce_H100.sbatch

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ fi
4343
--bind-to numa \
4444
-npernode 8 \
4545
--mca coll ^hcoll \
46-
-x NCCL_CROSS_NIC=1 \
46+
-x NCCL_CROSS_NIC=2 \
4747
-x NCCL_DEBUG=WARN \
4848
-x NCCL_CUMEM_ENABLE=0 \
4949
-x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
50-
-x NCCL_IB_QPS_PER_CONNECTION=16 \
50+
-x NCCL_IB_QPS_PER_CONNECTION=1 \
5151
-x NCCL_IB_GID_INDEX=3 \
5252
-x NCCL_IB_TC=41 \
5353
-x NCCL_IB_SL=0 \
@@ -59,11 +59,17 @@ fi
5959
-x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
6060
-x RX_QUEUE_LEN=8192 \
6161
-x IB_RX_QUEUE_LEN=8192 \
62-
-x NCCL_BUFFSIZE=16777216 \
63-
-x NCCL_SOCKET_IFNAME=eth0 \
62+
-x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \
6463
-x NCCL_IGNORE_CPU_AFFINITY=1 \
6564
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
6665
--np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
6766

6867
# If NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance
69-
# -x NCCL_TOPO_FILE=~/H100-topology.xml \
68+
# -x NCCL_TOPO_FILE=~/H100-topology.xml \
69+
70+
# If NCCL version is lower than 2.20.3, it is recommended to use
71+
# -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes)
72+
# -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes)
73+
74+
# If NCCL version is higher than 2.20.3, the absolute max NCCL throughput at large message size will be obtained with
75+
# -x NCCL_MIN_NCHANNELS=32 \   (Note: this takes some processing power away from the GPU in exchange for networking gains, so it is not recommended while running jobs.)

samples/gpu/nccl_run_allreduce_H100.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ do
6161
-x NCCL_DEBUG=WARN \
6262
-x NCCL_CUMEM_ENABLE=0 \
6363
-x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
64-
-x NCCL_IB_QPS_PER_CONNECTION=16 \
64+
-x NCCL_IB_QPS_PER_CONNECTION=1 \
6565
-x NCCL_IB_GID_INDEX=3 \
6666
-x NCCL_IB_TC=41 \
6767
-x NCCL_IB_SL=0 \
@@ -74,11 +74,10 @@ do
7474
-x RX_QUEUE_LEN=8192 \
7575
-x IB_RX_QUEUE_LEN=8192 \
7676
-x NCCL_BUFFSIZE=16777216 \
77-
-x NCCL_SOCKET_IFNAME=eth0 \
77+
-x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \
7878
-x NCCL_IGNORE_CPU_AFFINITY=1 \
7979
-x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \
8080
-x NCCL_TOPO_FILE=~/H100-topology.xml \
81-
-x NCCL_MIN_NCHANNELS=32 \
8281
--np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
8382

8483
tail -n 32 $logfile
@@ -87,4 +86,11 @@ done
8786

8887

8988
# If NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance
90-
# -x NCCL_TOPO_FILE=~/H100-topology.xml \
89+
# -x NCCL_TOPO_FILE=~/H100-topology.xml \
90+
91+
# If NCCL version is lower than 2.20.3, it is recommended to use
92+
# -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes)
93+
# -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes)
94+
95+
# If NCCL version is higher than 2.20.3, the absolute max NCCL throughput at large message size will be obtained with
96+
# -x NCCL_MIN_NCHANNELS=32 \   (Note: this takes some processing power away from the GPU in exchange for networking gains, so it is not recommended while running jobs.)

0 commit comments

Comments
 (0)