10 changes: 5 additions & 5 deletions .azure/gpu-tests-fabric.yml
@@ -134,13 +134,13 @@ jobs:
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
displayName: "Adjust tests & examples"

- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
workingDirectory: tests/tests_fabric/
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
workingDirectory: tests/
displayName: "Testing: fabric standard"
timeoutInMinutes: "10"

- bash: bash ../run_standalone_tests.sh "."
workingDirectory: tests/tests_fabric/
- bash: bash ./run_standalone_tests.sh "tests_fabric"
workingDirectory: tests/
env:
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
displayName: "Testing: fabric standalone"
@@ -157,7 +157,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
ls -l
workingDirectory: tests/tests_fabric/
workingDirectory: tests/
displayName: "Statistics"

- script: |
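Note: running pytest from tests/ with an explicit tests_fabric/ argument (instead of from tests/tests_fabric/ with ".") makes the collected test ids carry a tests_fabric/ prefix, which is what the rewritten run_standalone_tests.sh below expects. A rough local equivalent of the two steps, assuming COVERAGE_SOURCE is exported the same way the CI job sets it:

cd tests/
python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
PL_STANDALONE_TESTS_SOURCE=${COVERAGE_SOURCE} bash ./run_standalone_tests.sh "tests_fabric"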
8 changes: 4 additions & 4 deletions .azure/gpu-tests-pytorch.yml
@@ -155,13 +155,13 @@ jobs:
ls -l checkpoints/
displayName: "Get legacy checkpoints"

- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_pytorch
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_pytorch/ -v --durations=50
workingDirectory: tests/
displayName: "Testing: PyTorch standard"
timeoutInMinutes: "35"

- bash: bash ../run_standalone_tests.sh "."
workingDirectory: tests/tests_pytorch
- bash: bash ./run_standalone_tests.sh "tests_pytorch"
workingDirectory: tests/
env:
PL_USE_MOCKED_MNIST: "1"
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
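The PyTorch job mirrors the same layout change. A minimal sketch of running its standalone step locally, with pytorch_lightning standing in for the CI's $(COVERAGE_SOURCE) variable (an assumption; the actual value is set per job):

cd tests/
export PL_USE_MOCKED_MNIST=1
export PL_STANDALONE_TESTS_SOURCE=pytorch_lightning
bash ./run_standalone_tests.sh "tests_pytorch"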
122 changes: 71 additions & 51 deletions tests/run_standalone_tests.sh
@@ -19,79 +19,99 @@ set -e
# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set
test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}"
source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
# this is the directory where the tests are located
test_dir=$1 # parse the first argument
COLLECTED_TESTS_FILE="collected_tests.txt"

ls -lh . # show the contents of the directory

# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1
# python arguments
defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
echo "Using defaults: ${defaults}"

# get the testing location as the first argument
test_path=$1
printf "source path: $test_path\n"

# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
standalone_tests=$(python3 -m pytest $test_path -q --collect-only --pythonwarnings ignore)
printf "Collected tests: \n $standalone_tests\n"
# match only lines with tests
parametrizations=$(perl -nle 'print $& while m{\S+::test_\S+}g' <<< "$standalone_tests")
# convert the list to be array
parametrizations_arr=($parametrizations)
report=''
# get the list of parametrizations. we need to call them separately. the trailing summary line is removed below.
# note: if there's a syntax error, this will fail with some garbled output
python3 -um pytest $test_dir -q --collect-only --pythonwarnings ignore 2>&1 > $COLLECTED_TESTS_FILE
# early terminate if collection failed (e.g. syntax error)
if [[ $? != 0 ]]; then
cat $COLLECTED_TESTS_FILE
exit 1
fi

rm -f standalone_test_output.txt # in case it exists, remove it
rm -f testnames.txt
# removes the last line of the file
sed -i '$d' $COLLECTED_TESTS_FILE
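# with `-q --collect-only`, the file holds one test id per line, e.g. (hypothetical id)
#   tests_fabric/strategies/test_ddp.py::test_ddp_spawn
# the dropped last line is pytest's summary, e.g. "12 tests collected in 0.34s"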

function show_batched_output {
if [ -f standalone_test_output.txt ]; then # if exists
cat standalone_test_output.txt
# heuristic: stop if there are mentions of errors. this can prevent false negatives when only some of the ranks fail
if perl -nle 'print if /error|(?<!(?-i)on_)exception|traceback|(?<!(?-i)x)failed/i' standalone_test_output.txt | grep -qv -f testnames.txt; then
echo "Potential error! Stopping."
perl -nle 'print if /error|(?<!(?-i)on_)exception|traceback|(?<!(?-i)x)failed/i' standalone_test_output.txt
rm standalone_test_output.txt
exit 1
fi
rm standalone_test_output.txt
fi
}
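# reading the filter above: it matches e.g. "RuntimeError", "Traceback" or "FAILED" case-insensitively,
# while the lookbehinds deliberately skip "on_exception" (a hook name) and "xfailed"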
trap show_batched_output EXIT # show the output on exit
# Get test list and run each test individually
tests=($(grep -oP '\S+::test_\S+' "$COLLECTED_TESTS_FILE"))
test_count=${#tests[@]}
# present the collected tests
printf "collected $test_count tests:\n-------------------\n"
# replace space with new line
echo "${tests[@]}" | tr ' ' '\n'
printf "\n===================\n"

# remove the "tests/tests_pytorch/" path suffixes
path_prefix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
# if test count is one print warning
if [[ $test_count -eq 1 ]]; then
printf "WARNING: only one test found!\n"
elif [ $test_count -eq 0 ]; then
printf "ERROR: no tests found!\n"
exit 1
fi

for i in "${!parametrizations_arr[@]}"; do
parametrization=${parametrizations_arr[$i]//$path_prefix/}
prefix="$((i+1))/${#parametrizations_arr[@]}"
# clear all the collected reports
rm -f parallel_test_output-*.txt # in case it exists, remove it

echo "$prefix: Running $parametrization"
echo $parametrization | sed 's/\[[^][]*\]//g' >> testnames.txt

# fix the port to avoid race condition when batched distributed tests select the port randomly
export MASTER_PORT=$((29500 + $i % $test_batch_size))
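# e.g. with the default test_batch_size of 6, the ports cycle through 29500..29505,
# so the (up to) 6 tests running in one batch always bind distinct ports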
status=0 # reset the script status
report="" # final report
pids=() # array of PID for running tests
test_ids=() # array of indexes of running tests
printf "Running $test_count tests in batches of $test_batch_size\n"
for i in "${!tests[@]}"; do
# remove initial "tests/" from the test name
test=${tests[$i]/tests\//}
printf "Running test $((i+1))/$test_count: $test\n"

# execute the test in the background
# redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
# output to std{out,err} because the outputs would be garbled together
python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
# save the PID in an array
pids[${i}]=$!
# add row to the final report
report+="Ran\t$parametrization\n"
# redirect to a log file that buffers test output. since the tests will run in the background,
# we cannot let them output to std{out,err} because the outputs would be garbled together
python3 ${defaults} "$test" 2>&1 > "standalone_test_output-$i.txt" &
test_ids+=($i) # save the test's id in an array with running tests
pids+=($!) # save the PID in an array with running tests

if ((($i + 1) % $test_batch_size == 0)); then
# if we reached the batch size, wait for all tests to finish
if (( (($i + 1) % $test_batch_size == 0) || $i == $test_count-1 )); then
printf "Waiting for batch to finish: $(IFS=' '; echo "${pids[@]}")\n"
# wait for running tests
for pid in ${pids[*]}; do wait $pid; done
unset pids # empty the array
show_batched_output
for j in "${!test_ids[@]}"; do
i=${test_ids[$j]} # restore the global test's id
pid=${pids[$j]} # restore the particular PID
test=${tests[$i]} # restore the test name
printf "Waiting for $tests >> standalone_test_output-$i.txt (PID: $pid)\n"
wait -n $pid
# get the exit status of the test
test_status=$?
# add row to the final report
report+="Ran\t$test\t>> exit:$test_status\n"
if [[ $test_status != 0 ]]; then
# show the output of the failed test
cat "standalone_test_output-$i.txt"
# Process exited with a non-zero exit status
status=$test_status
fi
done
test_ids=() # reset the test's id array
pids=() # reset the PID array
fi
done
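# e.g. with test_batch_size=6 and 14 collected tests, the wait branch above fires after i=5,
# i=11 and, through the `$i == $test_count-1` clause, after the final partial batch at i=13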
# wait for leftover tests
for pid in ${pids[*]}; do wait $pid; done
show_batched_output

# echo test report
printf '=%.s' {1..80}
printf "\n$report"
printf '=%.s' {1..80}
printf '\n'

# exit with the worst test result
exit $status
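
For reference, the final report printed by the new script looks roughly like this (hypothetical test names):

================================================================================
Ran	tests_pytorch/models/test_hooks.py::test_hooks	>> exit:0
Ran	tests_pytorch/strategies/test_ddp.py::test_ddp_timeout	>> exit:1
================================================================================

Since every non-zero per-test status is written into $status, the CI step fails whenever any test in any batch failed.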
8 changes: 7 additions & 1 deletion tests/tests_pytorch/run_standalone_tasks.sh
@@ -21,7 +21,13 @@ export PL_RUN_STANDALONE_TESTS=1
# test that a user can manually launch individual processes
echo "Running manual ddp launch test"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
args="fit --trainer.accelerator gpu \
--trainer.devices 2 \
--trainer.strategy ddp \
--trainer.max_epochs=1 \
--trainer.limit_train_batches=1 \
--trainer.limit_val_batches=1 \
--trainer.limit_test_batches=1"
MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} &
MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args}
