
Commit c7f8c8c

[bugfix] DeepSpeed with no schedulers (#8580)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Parent: 39de7fe
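
This commit fixes a crash in the DeepSpeed training-type plugin when `configure_optimizers` returns an optimizer but no LR scheduler. A minimal sketch of the kind of setup affected is below; the model class, layer size, and Trainer flags are illustrative assumptions, not taken from the commit (the commit's own test uses a `BoringModel` subclass, shown further down).

```python
# Sketch only: an optimizer-only `configure_optimizers` combined with the
# DeepSpeed plugin, which previously assumed a scheduler config was present.
# Class name, layer sizes, and Trainer flags are illustrative assumptions.
import torch
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin


class NoSchedulerModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        # Only an optimizer is returned -- no LR scheduler.
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


# Requires a multi-GPU machine; shown here only to mirror the failing configuration.
trainer = Trainer(plugins=[DeepSpeedPlugin(stage=3)], gpus=2, precision=16, fast_dev_run=True)
# trainer.fit(NoSchedulerModel(), ...)  # dataloaders omitted in this sketch
```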

3 files changed: +27 -3 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -236,6 +236,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `BackboneFinetuning` restoration ([#8501](https://github.com/PyTorchLightning/pytorch-lightning/pull/8501))
 - Fixed `lr_scheduler` with metric (e.g. `torch.optim.lr_scheduler.ReduceLROnPlateau`) when using `automatic_optimization = False` ([#7643](https://github.com/PyTorchLightning/pytorch-lightning/pull/7643))

+- Fixed `DeepSpeed` breaking with no schedulers ([#8580](https://github.com/PyTorchLightning/pytorch-lightning/pull/8580))
+
+

 ## [1.3.8] - 2021-07-01

pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 5 additions & 3 deletions
@@ -397,7 +397,7 @@ def _init_optimizers(self) -> Tuple[Optimizer, Optional[Union[LRSchedulerTypeTup
         )
         return (
             optimizers[0],
-            schedulers[0] if schedulers else None,
+            schedulers[0] if schedulers else _get_default_scheduler_config(),
             optimizer_frequencies[0] if optimizer_frequencies else None,
         )

@@ -414,6 +414,7 @@ def _initialize_deepspeed_train(self, model):
                 "Using `configure_optimizers` to define optimizer and scheduler."
             )
         optimizer, lr_scheduler, _ = self._init_optimizers()
+
         scheduler = lr_scheduler["scheduler"]

         model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())

@@ -430,8 +431,9 @@ def _initialize_deepspeed_train(self, model):

         # although we set these here, deepspeed manages the specific optimizer logic
         self.lightning_module.trainer.optimizers = [deepspeed_optimizer]
-        lr_scheduler["scheduler"] = deepspeed_scheduler
-        self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
+        if deepspeed_scheduler is not None:
+            lr_scheduler["scheduler"] = deepspeed_scheduler
+            self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
         self.model = model

     @contextlib.contextmanager
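
The fix has two parts: `_init_optimizers` now falls back to `_get_default_scheduler_config()` instead of returning `None`, so `_initialize_deepspeed_train` can index `lr_scheduler["scheduler"]` unconditionally, and the trainer's `lr_schedulers` list is only populated when DeepSpeed actually produced a scheduler. A rough before/after illustration follows; the exact contents of the default scheduler config are an assumption based on Lightning's scheduler-config dictionary format and are not shown in this diff.

```python
# Illustration only (not code from the commit). The dict returned by
# `_get_default_scheduler_config()` is assumed to look roughly like this:
default_scheduler_config = {
    "scheduler": None,   # later replaced with the DeepSpeed scheduler, if one exists
    "interval": "epoch",
    "frequency": 1,
}

# Before the fix: with no scheduler configured, `_init_optimizers` returned None.
lr_scheduler = None
try:
    scheduler = lr_scheduler["scheduler"]
except TypeError as err:
    print(err)  # 'NoneType' object is not subscriptable

# After the fix: a default config is returned instead, so indexing yields None.
lr_scheduler = default_scheduler_config
scheduler = lr_scheduler["scheduler"]
assert scheduler is None
```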

tests/plugins/test_deepspeed_plugin.py

Lines changed: 19 additions & 0 deletions
@@ -33,6 +33,11 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
         self.configure_sharded_model()


+class ModelParallelBoringModelNoSchedulers(ModelParallelBoringModel):
+    def configure_optimizers(self):
+        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
+
+
 class ModelParallelBoringModelManualOptim(BoringModel):
     def __init__(self):
         super().__init__()

@@ -687,3 +692,17 @@ def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel):
     # Assert model parameters are identical after loading
     for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
         assert torch.equal(orig_param, trained_model_param)
+
+
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_deepspeed_multigpu_no_schedulers(tmpdir):
+    """
+    Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.
+    """
+    model = ModelParallelBoringModelNoSchedulers()
+    trainer = Trainer(
+        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
+    )
+    trainer.fit(model)
+
+    _assert_save_model_is_equal(model, tmpdir, trainer, cls=ModelParallelBoringModelNoSchedulers)
