File tree Expand file tree Collapse file tree 2 files changed +21
-9
lines changed Expand file tree Collapse file tree 2 files changed +21
-9
lines changed Original file line number Diff line number Diff line change 4343 --bind-to numa \
4444 -npernode 8 \
4545 --mca coll ^hcoll \
46- -x NCCL_CROSS_NIC=1 \
46+ -x NCCL_CROSS_NIC=2 \
4747 -x NCCL_DEBUG=WARN \
4848 -x NCCL_CUMEM_ENABLE=0 \
4949 -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
50- -x NCCL_IB_QPS_PER_CONNECTION=16 \
50+ -x NCCL_IB_QPS_PER_CONNECTION=1 \
5151 -x NCCL_IB_GID_INDEX=3 \
5252 -x NCCL_IB_TC=41 \
5353 -x NCCL_IB_SL=0 \
5959 -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \
6060 -x RX_QUEUE_LEN=8192 \
6161 -x IB_RX_QUEUE_LEN=8192 \
62- -x NCCL_BUFFSIZE=16777216 \
63- -x NCCL_SOCKET_IFNAME=eth0 \
62+ -x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \
6463 -x NCCL_IGNORE_CPU_AFFINITY=1 \
6564 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
6665 --np $(( SLURM_NNODES* SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
6766
6867 # If NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance
69- # -x NCCL_TOPO_FILE=~/H100-topology.xml \
68+ # -x NCCL_TOPO_FILE=~/H100-topology.xml \
69+
70+ # If NCCL version is lower than 2.20.3, it is recommended to use
71+ # -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes)
72+ # -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes)
73+
74+ # If NCCL version is higher than 2.20.3, the absolute maximum NCCL throughput at large message sizes is obtained with
75+ # -x NCCL_MIN_NCHANNELS=32 \ However, this takes some processing power away from the GPU in exchange for networking gains and is not recommended while running jobs.
Original file line number Diff line number Diff line change 6161 -x NCCL_DEBUG=WARN \
6262 -x NCCL_CUMEM_ENABLE=0 \
6363 -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
64- -x NCCL_IB_QPS_PER_CONNECTION=16 \
64+ -x NCCL_IB_QPS_PER_CONNECTION=1 \
6565 -x NCCL_IB_GID_INDEX=3 \
6666 -x NCCL_IB_TC=41 \
6767 -x NCCL_IB_SL=0 \
7474 -x RX_QUEUE_LEN=8192 \
7575 -x IB_RX_QUEUE_LEN=8192 \
7676 -x NCCL_BUFFSIZE=16777216 \
77- -x NCCL_SOCKET_IFNAME=eth0 \
77+ -x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \
7878 -x NCCL_IGNORE_CPU_AFFINITY=1 \
7979 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
8080 -x NCCL_TOPO_FILE=~ /H100-topology.xml \
81- -x NCCL_MIN_NCHANNELS=32 \
8281 --np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile
8382
8483 tail -n 32 $logfile
8786
8887
8988 # If NCCL version is lower than 2.20.3, it is recommended to use the topology file for optimal performance
90- # -x NCCL_TOPO_FILE=~/H100-topology.xml \
89+ # -x NCCL_TOPO_FILE=~/H100-topology.xml \
90+
91+ # If NCCL version is lower than 2.20.3, it is recommended to use
92+ # -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes)
93+ # -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes)
94+
95+ # If NCCL version is higher than 2.20.3, the absolute maximum NCCL throughput at large message sizes is obtained with
96+ # -x NCCL_MIN_NCHANNELS=32 \ However, this takes some processing power away from the GPU in exchange for networking gains and is not recommended while running jobs.
You can’t perform that action at this time.
0 commit comments