Commit 8cf185a

fix: use standalone tests' exit code (#20430)
1 parent: 173cb8c

4 files changed: +87 -61 lines

.azure/gpu-tests-fabric.yml

Lines changed: 5 additions & 5 deletions
@@ -134,13 +134,13 @@ jobs:
       condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
       displayName: "Adjust tests & examples"

-    - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
-      workingDirectory: tests/tests_fabric/
+    - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
+      workingDirectory: tests/
       displayName: "Testing: fabric standard"
       timeoutInMinutes: "10"

-    - bash: bash ../run_standalone_tests.sh "."
-      workingDirectory: tests/tests_fabric/
+    - bash: bash ./run_standalone_tests.sh "tests_fabric"
+      workingDirectory: tests/
       env:
         PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
       displayName: "Testing: fabric standalone"

@@ -157,7 +157,7 @@ jobs:
        ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
          --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
        ls -l
-      workingDirectory: tests/tests_fabric/
+      workingDirectory: tests/
       displayName: "Statistics"

     - script: |
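Both steps now run from tests/ and pass the suite directory explicitly instead of relying on the working directory. A minimal sketch of invoking the relocated standalone runner by hand (PL_STANDALONE_TESTS_SOURCE defaults to "lightning" inside the script, so setting it only mirrors what the pipeline injects):

cd tests/
# the first argument is the directory to collect standalone tests from
PL_STANDALONE_TESTS_SOURCE="lightning" bash ./run_standalone_tests.sh "tests_fabric"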

.azure/gpu-tests-pytorch.yml

Lines changed: 4 additions & 4 deletions
@@ -155,13 +155,13 @@ jobs:
        ls -l checkpoints/
       displayName: "Get legacy checkpoints"

-    - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
-      workingDirectory: tests/tests_pytorch
+    - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_pytorch/ -v --durations=50
+      workingDirectory: tests/
       displayName: "Testing: PyTorch standard"
       timeoutInMinutes: "35"

-    - bash: bash ../run_standalone_tests.sh "."
-      workingDirectory: tests/tests_pytorch
+    - bash: bash ./run_standalone_tests.sh "tests_pytorch"
+      workingDirectory: tests/
       env:
         PL_USE_MOCKED_MNIST: "1"
         PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
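The standard (non-standalone) step follows the same relocation. A sketch of running it locally; COVERAGE_SOURCE is supplied by the pipeline, so the package name below is only a placeholder:

cd tests/
# coverage wraps pytest so line coverage is recorded for the given source package
python -m coverage run --source pytorch_lightning -m pytest tests_pytorch/ -v --durations=50
python -m coverage report  # summarize the recorded coverage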

tests/run_standalone_tests.sh

Lines changed: 71 additions & 51 deletions
@@ -19,79 +19,99 @@ set -e
 # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set
 test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}"
 source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
+# this is the directory where the tests are located
+test_dir=$1 # parse the first argument
+COLLECTED_TESTS_FILE="collected_tests.txt"
+
+ls -lh . # show the contents of the directory

 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
 defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
 echo "Using defaults: ${defaults}"

-# get the testing location as the first argument
-test_path=$1
-printf "source path: $test_path\n"
-
-# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
-standalone_tests=$(python3 -m pytest $test_path -q --collect-only --pythonwarnings ignore)
-printf "Collected tests: \n $standalone_tests\n"
-# match only lines with tests
-parametrizations=$(perl -nle 'print $& while m{\S+::test_\S+}g' <<< "$standalone_tests")
-# convert the list to be array
-parametrizations_arr=($parametrizations)
-report=''
+# get the list of parametrizations. we need to call them separately. the last two lines are removed.
+# note: if there's a syntax error, this will fail with some garbled output
+python3 -um pytest $test_dir -q --collect-only --pythonwarnings ignore 2>&1 > $COLLECTED_TESTS_FILE
+# early terminate if collection failed (e.g. syntax error)
+if [[ $? != 0 ]]; then
+  cat $COLLECTED_TESTS_FILE
+  exit 1
+fi

-rm -f standalone_test_output.txt # in case it exists, remove it
-rm -f testnames.txt
+# removes the last line of the file
+sed -i '$d' $COLLECTED_TESTS_FILE

-function show_batched_output {
-  if [ -f standalone_test_output.txt ]; then # if exists
-    cat standalone_test_output.txt
-    # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
-    if perl -nle 'print if /error|(?<!(?-i)on_)exception|traceback|(?<!(?-i)x)failed/i' standalone_test_output.txt | grep -qv -f testnames.txt; then
-      echo "Potential error! Stopping."
-      perl -nle 'print if /error|(?<!(?-i)on_)exception|traceback|(?<!(?-i)x)failed/i' standalone_test_output.txt
-      rm standalone_test_output.txt
-      exit 1
-    fi
-    rm standalone_test_output.txt
-  fi
-}
-trap show_batched_output EXIT # show the output on exit
+# Get test list and run each test individually
+tests=($(grep -oP '\S+::test_\S+' "$COLLECTED_TESTS_FILE"))
+test_count=${#tests[@]}
+# present the collected tests
+printf "collected $test_count tests:\n-------------------\n"
+# replace space with new line
+echo "${tests[@]}" | tr ' ' '\n'
+printf "\n===================\n"

-# remove the "tests/tests_pytorch/" path suffixes
-path_prefix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
+# if test count is one print warning
+if [[ $test_count -eq 1 ]]; then
+  printf "WARNING: only one test found!\n"
+elif [ $test_count -eq 0 ]; then
+  printf "ERROR: no tests found!\n"
+  exit 1
+fi

-for i in "${!parametrizations_arr[@]}"; do
-  parametrization=${parametrizations_arr[$i]//$path_prefix/}
-  prefix="$((i+1))/${#parametrizations_arr[@]}"
+# clear all the collected reports
+rm -f parallel_test_output-*.txt # in case it exists, remove it

-  echo "$prefix: Running $parametrization"
-  echo $parametrization | sed 's/\[[^][]*\]//g' >> testnames.txt

-  # fix the port to avoid race condition when batched distributed tests select the port randomly
-  export MASTER_PORT=$((29500 + $i % $test_batch_size))
+status=0 # reset the script status
+report="" # final report
+pids=() # array of PID for running tests
+test_ids=() # array of indexes of running tests
+printf "Running $test_count tests in batches of $test_batch_size\n"
+for i in "${!tests[@]}"; do
+  # remove initial "tests/" from the test name
+  test=${tests[$i]/tests\//}
+  printf "Running test $((i+1))/$test_count: $test\n"

   # execute the test in the background
-  # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
-  # output to std{out,err} because the outputs would be garbled together
-  python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
-  # save the PID in an array
-  pids[${i}]=$!
-  # add row to the final report
-  report+="Ran\t$parametrization\n"
+  # redirect to a log file that buffers test output. since the tests will run in the background,
+  # we cannot let them output to std{out,err} because the outputs would be garbled together
+  python3 ${defaults} "$test" 2>&1 > "standalone_test_output-$i.txt" &
+  test_ids+=($i) # save the test's id in an array with running tests
+  pids+=($!) # save the PID in an array with running tests

-  if ((($i + 1) % $test_batch_size == 0)); then
+  # if we reached the batch size, wait for all tests to finish
+  if (( (($i + 1) % $test_batch_size == 0) || $i == $test_count-1 )); then
+    printf "Waiting for batch to finish: $(IFS=' '; echo "${pids[@]}")\n"
     # wait for running tests
-    for pid in ${pids[*]}; do wait $pid; done
-    unset pids # empty the array
-    show_batched_output
+    for j in "${!test_ids[@]}"; do
+      i=${test_ids[$j]} # restore the global test's id
+      pid=${pids[$j]} # restore the particular PID
+      test=${tests[$i]} # restore the test name
+      printf "Waiting for $tests >> standalone_test_output-$i.txt (PID: $pid)\n"
+      wait -n $pid
+      # get the exit status of the test
+      test_status=$?
+      # add row to the final report
+      report+="Ran\t$test\t>> exit:$test_status\n"
+      if [[ $test_status != 0 ]]; then
+        # show the output of the failed test
+        cat "standalone_test_output-$i.txt"
+        # Process exited with a non-zero exit status
+        status=$test_status
+      fi
+    done
+    test_ids=() # reset the test's id array
+    pids=() # reset the PID array
   fi
 done
-# wait for leftover tests
-for pid in ${pids[*]}; do wait $pid; done
-show_batched_output

 # echo test report
 printf '=%.s' {1..80}
 printf "\n$report"
 printf '=%.s' {1..80}
 printf '\n'
+
+# exit with the worst test result
+exit $status
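The heart of the fix is the wait/exit-status bookkeeping: each test's status is read right after wait, a failing test's log is replayed, and the script finally exits with a non-zero status if any test failed, instead of always returning 0. A stripped-down sketch of that pattern, with placeholder commands ("sleep"/"false" stand in for the pytest invocations):

status=0
pids=()
cmds=("sleep 1" "false" "sleep 1")  # placeholder workloads, not real tests

for i in "${!cmds[@]}"; do
  # run in the background, buffering each job's output in its own file
  ${cmds[$i]} > "output-$i.txt" 2>&1 &
  pids+=($!)
done

for j in "${!pids[@]}"; do
  wait "${pids[$j]}"  # blocks until that PID finishes
  job_status=$?       # wait returns the job's exit status
  if [[ $job_status != 0 ]]; then
    cat "output-$j.txt"   # surface the failed job's log
    status=$job_status    # remember the failure for the final exit
  fi
done

exit $status  # non-zero if any job failed

The real script layers batching on top of this (PL_STANDALONE_TESTS_BATCH_SIZE, default 6), draining and resetting pids and test_ids after each batch.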

tests/tests_pytorch/run_standalone_tasks.sh

Lines changed: 7 additions & 1 deletion
@@ -21,7 +21,13 @@ export PL_RUN_STANDALONE_TESTS=1
 # test that a user can manually launch individual processes
 echo "Running manual ddp launch test"
 export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
+args="fit --trainer.accelerator gpu \
+  --trainer.devices 2 \
+  --trainer.strategy ddp \
+  --trainer.max_epochs=1 \
+  --trainer.limit_train_batches=1 \
+  --trainer.limit_val_batches=1 \
+  --trainer.limit_test_batches=1"
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} &
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args}
