Commit d058190

Run standalone tests in batches (#13673)
1 parent 0449e86 commit d058190

2 files changed: +36 / -17 lines

tests/tests_pytorch/run_standalone_tests.sh

Lines changed: 32 additions & 7 deletions
```diff
@@ -18,7 +18,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --no-header'
 
 # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
 grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
@@ -40,22 +40,47 @@ parametrizations_arr=($parametrizations)
 # tests to skip - space separated
 blocklist='profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx utilities/test_warnings.py'
 report=''
+test_batch_size=6
+
+rm -f standalone_test_output.txt  # in case it exists, remove it
+function show_batched_output {
+  if [ -f standalone_test_output.txt ]; then  # if exists
+    cat standalone_test_output.txt
+    rm standalone_test_output.txt
+  fi
+}
+trap show_batched_output EXIT  # show the output on exit
 
 for i in "${!parametrizations_arr[@]}"; do
   parametrization=${parametrizations_arr[$i]}
 
   # check blocklist
   if echo $blocklist | grep -F "${parametrization}"; then
     report+="Skipped\t$parametrization\n"
-    continue
+    # do not continue the loop because we might need to wait for batched jobs
+  else
+    echo "Running $parametrization"
+    # execute the test in the background
+    # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
+    # output to std{out,err} because the outputs would be garbled together
+    python ${defaults} "$parametrization" &>> standalone_test_output.txt &
+    # save the PID in an array
+    pids[${i}]=$!
+    # add row to the final report
+    report+="Ran\t$parametrization\n"
   fi
 
-  # run the test
-  echo "Running $parametrization"
-  python ${defaults} "$parametrization"
-
-  report+="Ran\t$parametrization\n"
+  if ((($i + 1) % $test_batch_size == 0)); then
+    # wait for running tests
+    for pid in ${pids[*]}; do wait $pid; done
+    unset pids  # empty the array
+    show_batched_output
+  fi
 done
+# wait for leftover tests
+for pid in ${pids[*]}; do wait $pid; done
+show_batched_output
+echo "Batched mode finished. Continuing with the rest of standalone tests."
 
 if nvcc --version; then
   nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
```
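
The core change above is a batch-and-wait pattern: each standalone test is launched as a background job, the script waits for the whole batch after every `test_batch_size` (6) jobs, and all output is appended to `standalone_test_output.txt` so concurrent jobs cannot interleave their writes on std{out,err}. Below is a minimal, self-contained Python sketch of the same idiom, for illustration only; it is not part of the commit, and `run_in_batches` and its parameters are made-up names.

```python
import subprocess

def run_in_batches(commands, batch_size=6, log_path="standalone_test_output.txt"):
    """Hypothetical sketch of the script's batching idiom: start jobs in the
    background, wait after every `batch_size` of them, and funnel all output
    into one log file so concurrent writers cannot garble the console."""
    procs = []
    with open(log_path, "ab") as log:
        for i, cmd in enumerate(commands):
            # like `python ${defaults} "$parametrization" &>> standalone_test_output.txt &`
            procs.append(subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT))
            if (i + 1) % batch_size == 0:
                for p in procs:  # like `for pid in ${pids[*]}; do wait $pid; done`
                    p.wait()
                procs.clear()  # like `unset pids`
        for p in procs:  # wait for the leftover, partially filled batch
            p.wait()

# toy usage: eight trivial jobs in batches of three
run_in_batches([["python", "-c", f"print({n})"] for n in range(8)], batch_size=3)
```

Note how the script pairs this with `trap show_batched_output EXIT`: since `set -e` aborts the script on the first failing `wait`, the trap ensures the buffered log is still printed when a batch fails.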

tests/tests_pytorch/strategies/test_deepspeed_strategy.py

Lines changed: 4 additions & 10 deletions
```diff
@@ -26,7 +26,7 @@
 from torch.utils.data import DataLoader
 from torchmetrics import Accuracy
 
-from pytorch_lightning import LightningDataModule, LightningModule, seed_everything, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, Trainer
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
 from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
@@ -712,7 +712,6 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config
 @pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)])
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
-    seed_everything(1)
     if automatic_optimization:
         model = ModelParallelClassificationModel()
     else:
@@ -734,9 +733,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization
     trainer.fit(model, datamodule=dm)
 
     results = trainer.test(datamodule=dm)
-    assert results[0]["test_acc"] > 0.7
     saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
-    assert saved_results[0]["test_acc"] > 0.7
     assert saved_results == results
 
     if automatic_optimization:
@@ -752,9 +749,7 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization
         enable_progress_bar=False,
         enable_model_summary=False,
     )
-
-    results = trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
-    assert results[0]["test_acc"] > 0.7
+    trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)
 
 
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@@ -861,7 +856,6 @@ def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) ->
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
     """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""
-    seed_everything(42)
 
     class VerificationCallback(Callback):
         def __init__(self):
@@ -1109,7 +1103,7 @@ def test_dataloader(self):
 @pytest.mark.parametrize("max_epoch", [2])
 @pytest.mark.parametrize("limit_train_batches", [2])
 @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
-def test_scheduler_step_count(mock_step, max_epoch, limit_train_batches, interval):
+def test_scheduler_step_count(mock_step, tmpdir, max_epoch, limit_train_batches, interval):
     """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is
     set to step or epoch."""
 
@@ -1124,7 +1118,7 @@ def configure_optimizers(self):
 
     model = TestModel()
     trainer = Trainer(
-        default_root_dir=os.getcwd(),
+        default_root_dir=tmpdir,
         limit_train_batches=limit_train_batches,
         limit_val_batches=0,
         max_epochs=max_epoch,
```
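
On the test side, the commit removes the `seed_everything` calls and per-run `test_acc` assertions from these standalone DeepSpeed tests, and points `test_scheduler_step_count` at pytest's built-in `tmpdir` fixture instead of `os.getcwd()`, so test artifacts land in an isolated per-test directory. A minimal sketch of what that fixture provides, for illustration only (not from the repository):

```python
# Hypothetical example: pytest injects `tmpdir` as a fresh, unique directory
# for every test, so files a test writes (checkpoints, logs) no longer pollute
# the working directory the way `default_root_dir=os.getcwd()` did.
def test_writes_to_isolated_dir(tmpdir):
    checkpoint = tmpdir.join("best.ckpt")  # `tmpdir` is a py.path.local object
    checkpoint.write("fake checkpoint contents")
    assert checkpoint.read() == "fake checkpoint contents"
```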
