Commit 6da5829

DeepSpeed support for device IDs (#9847)
Author: Sean Naren
1 parent f16bfe9 commit 6da5829
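
Summary (as reflected in the diff below): the change passes the index of the Trainer's selected root device to `deepspeed.initialize` via an `argparse.Namespace` carrying a `device_rank` attribute, in both the training and inference setup paths, so DeepSpeed binds to the GPU the user picked (e.g. `gpus=[1]`) instead of defaulting to the local rank's device. A multi-GPU regression test asserts that the model and batches land on GPU index 1 during both `fit` and `test`.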

File tree: 3 files changed, +43 −0 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -489,6 +489,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed missing arguments when saving hyperparameters from the parent class but not from the child class ([#9800](https://github.com/PyTorchLightning/pytorch-lightning/pull/9800))


+- Fixed DeepSpeed GPU device IDs ([#9847](https://github.com/PyTorchLightning/pytorch-lightning/pull/9847))
+
+
 - Reset `val_dataloader` in `tuner/batch_size_scaling` ([#9857](https://github.com/PyTorchLightning/pytorch-lightning/pull/9857))


pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 3 additions & 0 deletions

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
 import contextlib
 import json
 import logging
@@ -429,6 +430,7 @@ def _initialize_deepspeed_train(self, model):

         model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
         model, deepspeed_optimizer, _, deepspeed_scheduler = deepspeed.initialize(
+            args=argparse.Namespace(device_rank=self.root_device.index),
             config=self.config,
             model=model,
             model_parameters=model_parameters,
@@ -505,6 +507,7 @@ def _initialize_deepspeed_inference(self, model):
         # Remove all module hooks before initializing new model
         remove_module_hooks(model)
         model, _, _, _ = deepspeed.initialize(
+            args=argparse.Namespace(device_rank=self.root_device.index),
             config=inference_config,
             model=model,
             optimizer=optimizer,
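
For illustration, a minimal sketch of the pattern these two call sites use: handing the chosen CUDA index to `deepspeed.initialize` through the `args` namespace, which (per this commit) DeepSpeed consults as `device_rank` when selecting the device. The toy model, config dict, and device index are placeholders, and the call assumes a DeepSpeed-capable environment where distributed initialization has already happened (the Lightning plugin handles that); this is a sketch of the pattern, not the plugin code.

import argparse

import torch
import deepspeed

# e.g. the root device Lightning selects for Trainer(gpus=[1])
root_device = torch.device("cuda", 1)

model = torch.nn.Linear(32, 2)  # toy placeholder model
engine, optimizer, _, _ = deepspeed.initialize(
    # Pass the target CUDA index so the engine binds to cuda:1
    # rather than defaulting to the local rank's device.
    args=argparse.Namespace(device_rank=root_device.index),
    config={"train_batch_size": 8},  # minimal placeholder config
    model=model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
)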

tests/plugins/test_deepspeed_plugin.py

Lines changed: 37 additions & 0 deletions

@@ -970,3 +970,40 @@ def test_different_accumulate_grad_batches_fails(tmpdir):
         MisconfigurationException, match="DeepSpeed currently does not support different `accumulate_grad_batches`"
     ):
         trainer.fit(model)
+
+
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_specific_gpu_device_id(tmpdir):
+    class TestCallback(Callback):
+        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            assert model.device.index == 1
+
+        def on_train_batch_start(
+            self,
+            trainer: Trainer,
+            pl_module: LightningModule,
+            batch: Any,
+            batch_idx: int,
+            dataloader_idx: int,
+        ) -> None:
+            assert batch.device.index == 1
+
+        def on_test_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            assert model.device.index == 1
+
+        def on_test_batch_start(
+            self,
+            trainer: Trainer,
+            pl_module: LightningModule,
+            batch: Any,
+            batch_idx: int,
+            dataloader_idx: int,
+        ) -> None:
+            assert batch.device.index == 1
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir, fast_dev_run=True, gpus=[1], plugins="deepspeed", callbacks=TestCallback()
+    )
+    trainer.fit(model)
+    trainer.test(model)
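
For context, a minimal sketch of the user-facing setup this test exercises: pinning training to GPU index 1 while using the DeepSpeed plugin, with the PyTorch Lightning 1.x-era API visible in this diff. The `BoringModel` import path is an assumption based on the test helpers of that era.

from pytorch_lightning import Trainer
from tests.helpers.boring_model import BoringModel  # assumed helper location

# Before this fix, DeepSpeed would bind to the local rank's device (cuda:0)
# even when the user explicitly requested GPU index 1.
model = BoringModel()
trainer = Trainer(gpus=[1], plugins="deepspeed", fast_dev_run=True)
trainer.fit(model)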
