File tree Expand file tree Collapse file tree 9 files changed +40
-24
lines changed Expand file tree Collapse file tree 9 files changed +40
-24
lines changed Original file line number Diff line number Diff line change @@ -22,10 +22,8 @@ cat $MACHINEFILE
2222source /etc/os-release
2323if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
2424 python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
25- USER=opc
2625elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
2726 python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
28- USER=ubuntu
2927fi
3028
3129
7472 -x NCCL_IB_GID_INDEX=3 \
7573 -x NCCL_ALGO=Ring \
7674 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
77- --np $(( SLURM_NNODES* SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/ $USER /nccl-tests /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100
75+ --np $(( SLURM_NNODES* SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc /nccl-test /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100
7876
7977
Original file line number Diff line number Diff line change @@ -18,7 +18,13 @@ echo INPUTFILE
1818cat $hostfile
1919
2020# will generate rack-aware ordered host file
21- python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
21+ source /etc/os-release
22+ if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
23+ python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
24+ elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
25+ python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
26+ fi
27+
2228hostfile=$ORDEREDMACHINEFILE
2329
2430echo ORDEREDMACHINEFILE
7379 -x NCCL_IB_GID_INDEX=3 \
7480 -x NCCL_ALGO=Ring \
7581 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
76- --np $np --hostfile $hostfile -N 8 /home/opc /nccl-tests /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n $iter >> $logfile
82+ --np $np --hostfile $hostfile -N 8 /opt/oci-hpc /nccl-test /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n $iter >> $logfile
7783
7884 tail -n 32 $logfile
7985
Original file line number Diff line number Diff line change @@ -23,10 +23,8 @@ cat $MACHINEFILE
2323source /etc/os-release
2424if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
2525 python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
26- USER=opc
2726elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
2827 python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
29- USER=ubuntu
3028fi
3129
3230echo ORDEREDMACHINEFILE
@@ -80,6 +78,7 @@ export RX_QUEUE_LEN=8192 \
8078 NCCL_IB_QPS_PER_CONNECTION=4
8179
8280env | grep " SLURMD_NODENAME="
81+ USER=` whoami`
8382
8483CONTAINER_IMAGE=" /nfs/scratch/nvcr.io+nvidia+pytorch+22.12-py3.sqsh"
8584CONTAINER_MOUNTS=" /home/$USER /nccl-tests:/nccl,$LOCAL_MPI :$LOCAL_MPI "
Original file line number Diff line number Diff line change @@ -29,8 +29,6 @@ if [[ "$mpivars_path" == "" ]]; then
2929source $mpivars_path
3030echo $mpivars_path
3131
32- USER=` whoami`
33-
3432shape=` curl -sH " Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape`
3533if [ $shape == \" BM.GPU.B4.8\" ] || [ $shape == \" BM.GPU.A100-v2.8\" ]
3634then
@@ -57,4 +55,4 @@ export NCCL_DEBUG=WARN \
5755 NCCL_IB_GID_INDEX=3 \
5856 NCCL_ALGO=Ring \
5957 NCCL_IB_HCA=" ${var_NCCL_IB_HCA} "
60- srun --mpi=pmix_v3 --gpus-per-node=$SLURM_GPUS_PER_NODE --ntasks-per-node=$SLURM_NTASKS_PER_NODE /home/ $USER /nccl-tests /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100
58+ srun --mpi=pmix_v3 --gpus-per-node=$SLURM_GPUS_PER_NODE --ntasks-per-node=$SLURM_NTASKS_PER_NODE /opt/oci-hpc /nccl-test /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100
Original file line number Diff line number Diff line change 4343 source $mpivars_path
4444 echo $mpivars_path
4545
46- USER=` whoami`
47-
4846 first_node=` head $hostfile -n 1`
4947 shape=` ssh $first_node ' curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape`
5048 if [ $shape == \" BM.GPU.B4.8\" ] || [ $shape == \" BM.GPU.A100-v2.8\" ]
7270 NCCL_IB_GID_INDEX=3 \
7371 NCCL_ALGO=Ring \
7472 NCCL_IB_HCA=" ${var_NCCL_IB_HCA} "
75- srun --mpi=pmix_v3 --nodefile=$hostfile --gpus-per-node=8 --ntasks-per-node=8 /home/ $USER /nccl-tests /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100 >> $logfile
73+ srun --mpi=pmix_v3 --nodefile=$hostfile --gpus-per-node=8 --ntasks-per-node=8 /opt/oci-hpc /nccl-test /build/all_reduce_perf -b1G -e10G -i$(( 1024 * 1024 * 1024 * 9 )) -n 100 >> $logfile
7674
7775
7876
Original file line number Diff line number Diff line change @@ -22,7 +22,12 @@ echo INPUTFILE
2222cat $hostfile
2323
2424# will generate rack-aware ordered host file
25- python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
25+ if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
26+ python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
27+ elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
28+ python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
29+ fi
30+
2631hostfile=$ORDEREDMACHINEFILE
2732
2833echo ORDEREDMACHINEFILE
8287 -x NCCL_IB_GID_INDEX=3 \
8388 -x NCCL_ALGO=Ring \
8489 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
85- --np $np --hostfile $hostfile -N 8 /home/opc /nccl-tests /build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile
90+ --np $np --hostfile $hostfile -N 8 /opt/oci-hpc /nccl-test /build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile
8691
8792 tail -n 15 $logfile
8893
Original file line number Diff line number Diff line change @@ -19,7 +19,12 @@ scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE
1919echo MACHINEFILE
2020cat $MACHINEFILE
2121
22- python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
22+ source /etc/os-release
23+ if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
24+ python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
25+ elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
26+ python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null
27+ fi
2328
2429echo ORDEREDMACHINEFILE
2530cat $ORDEREDMACHINEFILE
@@ -31,9 +36,6 @@ source $mpivars_path
3136
3237if [[ " $mpivars_path " == " " ]]; then echo " Could not find MPIPATH" ; exit ; fi
3338
34- # source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh
35- # source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh
36-
3739export NCCL_DEBUG=WARN
3840
3941
6466 -x NCCL_IB_GID_INDEX=3 \
6567 -x NCCL_ALGO=Ring \
6668 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
67- --np $(( SLURM_NNODES* SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/opc /nccl-tests /build/all_reduce_perf -b8 -e 4G -f 2 -n 100
69+ --np $(( SLURM_NNODES* SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc /nccl-test /build/all_reduce_perf -b8 -e 4G -f 2 -n 100
6870
6971
Original file line number Diff line number Diff line change @@ -18,7 +18,12 @@ echo INPUTFILE
1818cat $hostfile
1919
2020# will generate rack-aware ordered host file
21- python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
21+ if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
22+ python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
23+ elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
24+ python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
25+ fi
26+
2227hostfile=$ORDEREDMACHINEFILE
2328
2429echo ORDEREDMACHINEFILE
7580 -x NCCL_IB_GID_INDEX=3 \
7681 -x NCCL_ALGO=Ring \
7782 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
78- --np $np --hostfile $hostfile -N 8 /home/opc /nccl-tests /build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile
83+ --np $np --hostfile $hostfile -N 8 /opt/oci-hpc /nccl-test /build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile
7984
8085 tail -n 32 $logfile
8186
Original file line number Diff line number Diff line change @@ -24,7 +24,12 @@ echo INPUTFILE
2424cat $hostfile
2525
2626# will generate rack-aware ordered host file
27- python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
27+ if [ $ID == " ol" ] || [ $ID == " centos" ] ; then
28+ python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null
29+ elif [ $ID == " debian" ] || [ $ID == " ubuntu" ] ; then
30+ python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null
31+ fi
32+
2833hostfile=$ORDEREDMACHINEFILE
2934
3035echo ORDEREDMACHINEFILE
8792 -x NCCL_IB_GID_INDEX=3 \
8893 -x NCCL_ALGO=Ring \
8994 -x NCCL_IB_HCA=" ${var_NCCL_IB_HCA} " \
90- --np $np --hostfile $hostfile -N 8 /home/opc /nccl-tests /build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile
95+ --np $np --hostfile $hostfile -N 8 /opt/oci-hpc /nccl-test /build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile
9196
9297 tail -n 15 $logfile
9398
You can’t perform that action at this time.
0 commit comments