Merge branch 'master' into chualan/add-example-for-TBPTT

lantiga · web-flow · commit 4e37ab931e0c · 2024-11-19T21:34:13.000+01:00
diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
@@ -134,13 +134,13 @@ jobs:
         condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
         displayName: "Adjust tests & examples"
 
-      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
-        workingDirectory: tests/tests_fabric/
+      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
+        workingDirectory: tests/
         displayName: "Testing: fabric standard"
         timeoutInMinutes: "10"
 
-      - bash: bash ../run_standalone_tests.sh "."
-        workingDirectory: tests/tests_fabric/
+      - bash: bash ./run_standalone_tests.sh "tests_fabric"
+        workingDirectory: tests/
         env:
           PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
         displayName: "Testing: fabric standalone"
@@ -157,7 +157,7 @@ jobs:
           ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
             --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
           ls -l
-        workingDirectory: tests/tests_fabric/
+        workingDirectory: tests/
         displayName: "Statistics"
 
       - script: |
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
@@ -155,13 +155,13 @@ jobs:
           ls -l checkpoints/
         displayName: "Get legacy checkpoints"
 
-      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
-        workingDirectory: tests/tests_pytorch
+      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_pytorch/ -v --durations=50
+        workingDirectory: tests/
         displayName: "Testing: PyTorch standard"
         timeoutInMinutes: "35"
 
-      - bash: bash ../run_standalone_tests.sh "."
-        workingDirectory: tests/tests_pytorch
+      - bash: bash ./run_standalone_tests.sh "tests_pytorch"
+        workingDirectory: tests/
         env:
           PL_USE_MOCKED_MNIST: "1"
           PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
diff --git a/README.md b/README.md
@@ -585,7 +585,6 @@ Lightning is rigorously tested across multiple CPUs, GPUs and TPUs and against m
 |       System / PyTorch ver.        | 1.13                                                                                                                                                                                                                            | 2.0                                                                                                                                                                                                                             |                                                                                                               2.1                                                                                                               |
 | :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
 |        Linux py3.9 \[GPUs\]        |  |  | [![Build Status](https://dev.azure.com/Lightning-AI/lightning/_apis/build/status%2Fpytorch-lightning%20%28GPUs%29?branchName=master)](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=master) |
-|        Linux py3.9 \[TPUs\]        |                                                                                                                                                                                                                                 |  [![Test PyTorch - TPU](https://github.com/Lightning-AI/lightning/actions/workflows/tpu-tests.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/tpu-tests.yml)     |      |
 |  Linux (multiple Python versions)  | [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                                 | [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                                 |                 [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                 |
 |   OSX (multiple Python versions)   | [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                                 | [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                                 |                 [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                 |
 | Windows (multiple Python versions) | [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                                 | [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                                 |                 [![Test PyTorch](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml)                 |
diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst
@@ -26,7 +26,7 @@ Lightning supports multiple ways of doing distributed training.
     If you request multiple GPUs or nodes without setting a strategy, DDP will be automatically used.
 
 For a deeper understanding of what Lightning is doing, feel free to read this
-`guide <https://medium.com/@_willfalcon/9-tips-for-training-lightning-fast-neural-networks-in-pytorch-8e63a502f565>`_.
+`guide <https://towardsdatascience.com/9-tips-for-training-lightning-fast-neural-networks-in-pytorch-8e63a502f565>`_.
 
 
 ----
diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh
@@ -19,79 +19,99 @@ set -e
 # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set
 test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}"
 source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
+# this is the directory where the tests are located
+test_dir=$1 # parse the first argument
+COLLECTED_TESTS_FILE="collected_tests.txt"
+
+ls -lh .  # show the contents of the directory
 
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
 defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
 echo "Using defaults: ${defaults}"
 
-# get the testing location as the first argument
-test_path=$1
-printf "source path: $test_path\n"
-
-# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
-standalone_tests=$(python3 -m pytest $test_path -q --collect-only --pythonwarnings ignore)
-printf "Collected tests: \n $standalone_tests\n"
-# match only lines with tests
-parametrizations=$(perl -nle 'print $& while m{\S+::test_\S+}g' <<< "$standalone_tests")
-# convert the list to be array
-parametrizations_arr=($parametrizations)
-report=''
+# get the list of parametrizations. we need to call them separately. the last line of the output is removed below.
+# note: stdout and stderr are both written to the file; `||` records a failure without tripping `set -e`
+python3 -um pytest $test_dir -q --collect-only --pythonwarnings ignore > "$COLLECTED_TESTS_FILE" 2>&1 || collect_failed=1
+# early terminate if collection failed (e.g. syntax error), showing whatever pytest printed
+if [[ -n "${collect_failed:-}" ]]; then
+  cat $COLLECTED_TESTS_FILE
+  exit 1
+fi
 
-rm -f standalone_test_output.txt  # in case it exists, remove it
-rm -f testnames.txt
+# removes the last line of the file
+sed -i '$d' $COLLECTED_TESTS_FILE
 
-function show_batched_output {
-  if [ -f standalone_test_output.txt ]; then  # if exists
-    cat standalone_test_output.txt
-    # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
-    if perl -nle 'print if /error|(?<!(?-i)on_)exception|traceback|(?<!(?-i)x)failed/i' standalone_test_output.txt | grep -qv -f testnames.txt; then
-      echo "Potential error! Stopping."
-      perl -nle 'print if /error|(?<!(?-i)on_)exception|traceback|(?<!(?-i)x)failed/i' standalone_test_output.txt
-      rm standalone_test_output.txt
-      exit 1
-    fi
-    rm standalone_test_output.txt
-  fi
-}
-trap show_batched_output EXIT  # show the output on exit
+# Get test list and run each test individually
+tests=($(grep -oP '\S+::test_\S+' "$COLLECTED_TESTS_FILE"))
+test_count=${#tests[@]}
+# present the collected tests
+printf "collected $test_count tests:\n-------------------\n"
+# replace space with new line
+echo "${tests[@]}" | tr ' ' '\n'
+printf "\n===================\n"
 
-# remove the "tests/tests_pytorch/" path suffixes
-path_prefix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/"  # https://stackoverflow.com/a/8223345
+# if test count is one print warning
+if [[ $test_count -eq 1 ]]; then
+  printf "WARNING: only one test found!\n"
+elif [ $test_count -eq 0 ]; then
+  printf "ERROR: no tests found!\n"
+  exit 1
+fi
 
-for i in "${!parametrizations_arr[@]}"; do
-  parametrization=${parametrizations_arr[$i]//$path_prefix/}
-  prefix="$((i+1))/${#parametrizations_arr[@]}"
+# clear the per-test output logs left over from a previous run
+rm -f standalone_test_output-*.txt  # in case they exist, remove them
 
-  echo "$prefix: Running $parametrization"
-  echo $parametrization | sed 's/\[[^][]*\]//g' >> testnames.txt
 
-  # fix the port to avoid race condition when batched distributed tests select the port randomly
-  export MASTER_PORT=$((29500 + $i % $test_batch_size))
+status=0 # reset the script status
+report="" # final report
+pids=() # array of PID for running tests
+test_ids=() # array of indexes of running tests
+printf "Running $test_count tests in batches of $test_batch_size\n"
+for i in "${!tests[@]}"; do
+  # remove initial "tests/" from the test name
+  test=${tests[$i]/tests\//}
+  printf "Running test $((i+1))/$test_count: $test\n"
 
   # execute the test in the background
-  # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
-  # output to std{out,err} because the outputs would be garbled together
-  python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
-  # save the PID in an array
-  pids[${i}]=$!
-  # add row to the final report
-  report+="Ran\t$parametrization\n"
+  # redirect to a log file that buffers test output. since the tests will run in the background,
+  # we cannot let them output to std{out,err} (garbled together); `2>&1` must come after the file redirect
+  python3 ${defaults} "$test" > "standalone_test_output-$i.txt" 2>&1 &
+  test_ids+=($i) # save the test's id in an array with running tests
+  pids+=($!) # save the PID in an array with running tests
 
-  if ((($i + 1) % $test_batch_size == 0)); then
+  # if we reached the batch size (or this is the last test), wait for all running tests to finish
+  if (( (($i + 1) % $test_batch_size == 0) || $i == $test_count-1 )); then
+    printf "Waiting for batch to finish: $(IFS=' '; echo "${pids[@]}")\n"
     # wait for running tests
-    for pid in ${pids[*]}; do wait $pid; done
-    unset pids  # empty the array
-    show_batched_output
+    for j in "${!test_ids[@]}"; do
+      i=${test_ids[$j]} # restore the global test's id
+      pid=${pids[$j]} # restore the particular PID
+      test=${tests[$i]} # restore the test name
+      printf "Waiting for %s >> standalone_test_output-%s.txt (PID: %s)\n" "$test" "$i" "$pid"
+      # get the exit status of the test; `||` keeps `set -e` from aborting the script on a failed test
+      test_status=0
+      wait "$pid" || test_status=$?
+      # add row to the final report
+      report+="Ran\t$test\t>> exit:$test_status\n"
+      if [[ $test_status != 0 ]]; then
+        # show the output of the failed test
+        cat "standalone_test_output-$i.txt"
+        # Process exited with a non-zero exit status
+        status=$test_status
+      fi
+    done
+    test_ids=()  # reset the test's id array
+    pids=()  # reset the PID array
   fi
 done
-# wait for leftover tests
-for pid in ${pids[*]}; do wait $pid; done
-show_batched_output
 
 # echo test report
 printf '=%.s' {1..80}
 printf "\n$report"
 printf '=%.s' {1..80}
 printf '\n'
+
+# exit with the status of the last failed test (0 if all tests passed)
+exit $status
diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh
@@ -21,7 +21,13 @@ export PL_RUN_STANDALONE_TESTS=1
 # test that a user can manually launch individual processes
 echo "Running manual ddp launch test"
 export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
+args="fit --trainer.accelerator gpu \
+--trainer.devices 2 \
+--trainer.strategy ddp \
+--trainer.max_epochs=1 \
+--trainer.limit_train_batches=1 \
+--trainer.limit_val_batches=1 \
+--trainer.limit_test_batches=1"
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} &
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args}