Commit 83acb86

Author: Sean Naren
Update DeepSpeed version, fix failing tests (#9898)
1 parent f9d2612 commit 83acb86

File tree

3 files changed: +24 / -10 lines changed


.azure-pipelines/gpu-tests.yml

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ jobs:
   - bash: |
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
       pip install fairscale>=0.3.4
-      pip install "deepspeed==0.4.3" # FIXME: bug with >= 0.4.4
+      pip install deepspeed==0.5.4
       pip install . --requirement requirements/devel.txt
       pip list
     displayName: 'Install dependencies'
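
Below is a minimal, hypothetical sketch (not part of this commit) of a runtime guard that mirrors the new CI pin; it assumes only that DeepSpeed exposes deepspeed.__version__ and that the packaging package is available, and the helper name require_min_deepspeed is made up for illustration:

    # Hypothetical guard: fail fast if the environment drifts below the CI pin above.
    from packaging.version import Version

    import deepspeed


    def require_min_deepspeed(minimum: str = "0.5.4") -> None:
        installed = Version(deepspeed.__version__)
        if installed < Version(minimum):
            raise RuntimeError(f"deepspeed>={minimum} required, found {installed}")


    require_min_deepspeed()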

tests/models/test_hooks.py

Lines changed: 18 additions & 5 deletions
@@ -416,18 +416,35 @@ def _predict_batch(trainer, model, batches):
     return out


+@RunIf(deepspeed=True, min_gpus=1, special=True)
+def test_trainer_model_hook_system_fit_deepspeed_automatic_optimization(tmpdir):
+    _run_trainer_model_hook_system_fit(
+        dict(gpus=1, precision=16, plugins="deepspeed"), tmpdir, automatic_optimization=True
+    )
+
+
+@RunIf(deepspeed=True, min_gpus=1, special=True)
+def test_trainer_model_hook_system_fit_deepspeed_manual_optimization(tmpdir):
+    _run_trainer_model_hook_system_fit(
+        dict(gpus=1, precision=16, plugins="deepspeed"), tmpdir, automatic_optimization=False
+    )
+
+
 @pytest.mark.parametrize(
     "kwargs",
     [
         {},
         # these precision plugins modify the optimization flow, so testing them explicitly
-        pytest.param(dict(gpus=1, precision=16, plugins="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1)),
         pytest.param(dict(gpus=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)),
         pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)),
     ],
 )
 @pytest.mark.parametrize("automatic_optimization", (True, False))
 def test_trainer_model_hook_system_fit(tmpdir, kwargs, automatic_optimization):
+    _run_trainer_model_hook_system_fit(kwargs, tmpdir, automatic_optimization)
+
+
+def _run_trainer_model_hook_system_fit(kwargs, tmpdir, automatic_optimization):
     called = []

     class TestModel(HookedModel):
@@ -459,14 +476,11 @@ def training_step(self, batch, batch_idx):
         callbacks=[callback],
         **kwargs,
     )
-
     assert called == [
         dict(name="Callback.on_init_start", args=(trainer,)),
         dict(name="Callback.on_init_end", args=(trainer,)),
     ]
-
     trainer.fit(model)
-
     saved_ckpt = {
         "callbacks": ANY,
         "epoch": 1,
@@ -481,7 +495,6 @@ def training_step(self, batch, batch_idx):
     elif kwargs.get("amp_backend") == "apex":
         saved_ckpt["amp_scaling_state"] = ANY
     device = torch.device("cuda:0" if "gpus" in kwargs else "cpu")
-
     expected = [
         dict(name="Callback.on_init_start", args=(trainer,)),
         dict(name="Callback.on_init_end", args=(trainer,)),
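
A minimal, self-contained sketch of the refactor pattern applied above: the shared body moves into a plain helper, the existing parametrized test delegates to it, and the configurations that must run in their own process get dedicated, specially-marked tests that call the same helper. The names here (_run_check, the "special" marker) are hypothetical stand-ins, not taken from the commit:

    import pytest


    def _run_check(scale: int, strict: bool) -> None:
        # Shared body: every entry point below exercises exactly this logic.
        values = [scale * i for i in range(3)]
        assert values == sorted(values)
        if strict:
            assert values[0] == 0


    @pytest.mark.parametrize("scale", [1, 2, 5])
    @pytest.mark.parametrize("strict", [True, False])
    def test_run_check(scale, strict):
        # Regular parametrized entry point, analogous to test_trainer_model_hook_system_fit.
        _run_check(scale, strict)


    @pytest.mark.special  # stand-in for a marker like RunIf(deepspeed=True, min_gpus=1, special=True)
    def test_run_check_special():
        # Standalone entry point for the configuration that needs its own job.
        _run_check(scale=3, strict=True)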

tests/plugins/test_deepspeed_plugin.py

Lines changed: 5 additions & 4 deletions
@@ -213,20 +213,21 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
     trainer.fit(model)


-@RunIf(min_gpus=1, deepspeed=True, special=True)
+@RunIf(min_gpus=1, deepspeed=True)
 @pytest.mark.parametrize(
     ["dataset_cls", "value"],
     [(RandomDataset, "auto"), (RandomDataset, 10), (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)],
 )
-def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value):
+@mock.patch("deepspeed.init_distributed", autospec=True)
+def test_deepspeed_auto_batch_size_config_select(mock_deepspeed_distributed, tmpdir, dataset_cls, value):
     """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""

     class TestModel(BoringModel):
         def train_dataloader(self):
             return DataLoader(dataset_cls(32, 64))

     class AssertCallback(Callback):
-        def on_train_start(self, trainer, pl_module) -> None:
+        def setup(self, trainer, pl_module, stage: Optional[str] = None) -> None:
             assert isinstance(trainer.accelerator.training_type_plugin, DeepSpeedPlugin)
             config = trainer.accelerator.training_type_plugin.config

@@ -855,7 +856,7 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir):
     _assert_save_model_is_equal(model, tmpdir, trainer)


-@RunIf(min_gpus=1, deepspeed=True)
+@RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_skip_backward_raises(tmpdir):
     class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
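
A minimal sketch of the mock.patch decorator pattern adopted above, which is why the test gained a leading mock_deepspeed_distributed parameter: patching as a decorator replaces the target for the duration of the test and passes the created mock in as the first positional argument. The target here (an init_distributed function defined in the test module itself) is a hypothetical stand-in for deepspeed.init_distributed:

    from unittest import mock


    def init_distributed() -> None:
        # Hypothetical stand-in: the real call would need a distributed environment.
        raise RuntimeError("requires a distributed environment")


    @mock.patch(f"{__name__}.init_distributed", autospec=True)
    def test_runs_without_distributed(mock_init):
        # The module-level name now resolves to the mock, so no RuntimeError is raised.
        init_distributed()
        mock_init.assert_called_once()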
