Skip to content

Commit 1130569

Browse files
authored
Test pt 26 (#5142)
1 parent a66b896 commit 1130569

File tree

6 files changed

+48
-152
lines changed

6 files changed

+48
-152
lines changed

test/dlc_tests/container_tests/bin/pytorch_tests/evaluateResults

Lines changed: 0 additions & 43 deletions
This file was deleted.

test/dlc_tests/container_tests/bin/pytorch_tests/installOSSPyTorch

Lines changed: 0 additions & 33 deletions
This file was deleted.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash
2+
3+
set -e
4+
5+
# setup benchmark repo
6+
git clone https://github.com/pytorch/benchmark.git
7+
cd benchmark
8+
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "pytorch not found")
9+
# Add git checkout for PyTorch 2.6
10+
if [[ "$PYTORCH_VERSION" == "2.6"* ]]; then
11+
git checkout 4c7763b619b4eb1de88e57aee901eda92e270d7b
12+
fi
13+
pip install -r requirements.txt
14+
pip install numba

test/dlc_tests/container_tests/bin/pytorch_tests/setupPyTorchBackendTest

Lines changed: 0 additions & 34 deletions
This file was deleted.

test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchGlooMpi

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,30 @@
22

33
set -e
44

5-
source /test/bin/pytorch_tests/setupPyTorchBackendTest
5+
# check GPU status
6+
if nvidia-smi &> /dev/null; then
7+
DEVICE="cuda"
8+
NUM_GPUS=$(nvidia-smi -L | wc -l)
9+
else
10+
DEVICE="cpu"
11+
NUM_GPUS=0
12+
fi
13+
14+
git config --global user.email "[email protected]"
15+
git config --global user.name "Your Name"
16+
17+
if which conda >/dev/null 2>&1; then
18+
conda init
19+
source ~/.bashrc
20+
fi
21+
22+
source /test/bin/pytorch_tests/installPyTorchBenchmarkRepository
623

724
export PT_BACKEND=$1
825
export USE_INDUCTOR=$2
926
# assign the lower of 2 and gpu #, or 2 when no gpu
1027
export WORLD_SIZE=$(($NUM_GPUS == 0 ? 2 : ($NUM_GPUS < 2 ? $NUM_GPUS : 2)))
1128

12-
# test aws pytorch
1329
OUT_PATH=$AWS_LOG_DIR/aws_res.csv timeout 360 python run.py BERT_pytorch -d $DEVICE -t train
1430

15-
bash ${BIN_DIR}/pytorch_tests/installOSSPyTorch
16-
17-
# test oss pytorch
18-
OUT_PATH=$OSS_LOG_DIR/oss_res.csv timeout 360 python run.py BERT_pytorch -d $DEVICE -t train
19-
20-
# compare aws and oss results process by process (result metrics are already medians)
21-
for ((i=0; i<$WORLD_SIZE; i++))
22-
do
23-
bash ${BIN_DIR}/pytorch_tests/evaluateResults "$AWS_LOG_DIR/aws_res.csv_$i" "$OSS_LOG_DIR/oss_res.csv_$i"
24-
done
25-
2631
exit 0

test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchNccl

Lines changed: 16 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,21 @@
22

33
set -e
44

5-
source /test/bin/pytorch_tests/setupPyTorchBackendTest
5+
export HOME_DIR=/test
6+
export LOG_DIR=${HOME_DIR}/logs
7+
export AWS_LOG_DIR=$LOG_DIR/aws_log
8+
9+
# check GPU status
10+
if nvidia-smi &> /dev/null; then
11+
DEVICE="cuda"
12+
NUM_GPUS=$(nvidia-smi -L | wc -l)
13+
else
14+
DEVICE="cpu"
15+
NUM_GPUS=0
16+
fi
17+
18+
source /test/bin/pytorch_tests/installPyTorchBenchmarkRepository
19+
620
export USE_INDUCTOR=$1
721
echo testPyTorchNCCL: USE_INDUCTOR, $USE_INDUCTOR
822

@@ -19,38 +33,11 @@ python userbenchmark/ddp_experiments/run.py \
1933
--job_dir $AWS_LOG_DIR \
2034
--nccl-socket-ifname eth0
2135

22-
2336
# generate the result csv
2437
JOB_ID=$(ls $AWS_LOG_DIR | grep .out | head -n 1 | cut -d'_' -f 1)
2538
python userbenchmark/ddp_experiments/parse_ddp.py \
2639
--job_id $JOB_ID \
2740
--results_dir $AWS_LOG_DIR \
2841
--csv_out > $AWS_LOG_DIR/aws_res.csv
2942
echo testPyTorchNCCL: AWS-PyTorch test results && cat $AWS_LOG_DIR/aws_res.csv
30-
31-
# get OSS perf data
32-
bash ${BIN_DIR}/pytorch_tests/installOSSPyTorch
33-
34-
# run NCCL benchmarking with 2 GPUS and 1 local node (resnet will use torchvision)
35-
python userbenchmark/ddp_experiments/run.py \
36-
--ngpus $WORLD_SIZE \
37-
--distributed ddp \
38-
--nodes 1 \
39-
--cluster local \
40-
--filter_models resnet50 \
41-
--timeout 10 \
42-
--job_dir $OSS_LOG_DIR \
43-
--nccl-socket-ifname eth0
44-
45-
# generate the result csv
46-
JOB_ID=$(ls $OSS_LOG_DIR | grep .out | head -n 1 | cut -d'_' -f 1)
47-
python userbenchmark/ddp_experiments/parse_ddp.py \
48-
--job_id $JOB_ID \
49-
--results_dir $OSS_LOG_DIR \
50-
--csv_out > $OSS_LOG_DIR/oss_res.csv
51-
echo testPyTorchNCCL: OSS-PyTorch test results && cat $OSS_LOG_DIR/oss_res.csv
52-
53-
# evaluate the performance
54-
bash ${BIN_DIR}/pytorch_tests/evaluateResults $AWS_LOG_DIR/aws_res.csv $OSS_LOG_DIR/oss_res.csv
55-
56-
exit 0
43+
exit 0

0 commit comments

Comments
 (0)