Skip to content

Commit a7727c7

Browse files
aivanou authored and facebook-github-bot committed
Change lightning-cv example to make it work for gpu training (#293)
Summary: Pull Request resolved: #293 Change `lightning-cv` example to make it work for gpu training It seems there are bugs with `ddp2` running on GPUs, so changing accelerator to `ddp` Reviewed By: d4l3k Differential Revision: D31818113 fbshipit-source-id: 6bbe4f0c3d62d90cb8d78ca4d912378c659ba24f
1 parent 851745e commit a7727c7

File tree

3 files changed

+11
-5
lines changed

3 files changed

+11
-5
lines changed

dev-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ kfp==1.6.2
44
pyre-extensions>=0.0.21
55
black>=21.5b1
66
usort==0.6.4
7-
pytorch-lightning>=0.5.3
7+
pytorch-lightning>=1.4.9
88
torch>=1.9.0
99
torchvision>=0.10.0
1010
classy-vision>=0.6.0

scripts/kube_dist_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ def register_gpu_resource() -> None:
3333
res = Resource(
3434
cpu=2,
3535
gpu=1,
36-
memMB=4 * GiB,
36+
memMB=8 * GiB,
3737
)
3838
print(f"Registering resource: {res}")
3939
named_resources["GPU_X1"] = res

torchx/examples/apps/lightning_classy_vision/train.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,9 @@ def get_gpu_devices() -> int:
9191
def get_model_checkpoint(args: argparse.Namespace) -> Optional[ModelCheckpoint]:
9292
if not args.output_path:
9393
return None
94+
# Note: It is important that each rank behaves the same.
95+
# All of the ranks, or none of them should return ModelCheckpoint
96+
# Otherwise, there will be deadlock for distributed training
9497
return ModelCheckpoint(
9598
monitor="train_loss",
9699
dirpath=args.output_path,
@@ -132,12 +135,14 @@ def main(argv: List[str]) -> None:
132135
logger = TensorBoardLogger(
133136
save_dir=args.log_path, version=1, name="lightning_logs"
134137
)
135-
136138
# Initialize a trainer
137139
num_nodes = int(os.environ.get("GROUP_WORLD_SIZE", 1))
140+
num_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
138141
trainer = pl.Trainer(
139142
num_nodes=num_nodes,
140-
accelerator="ddp2",
143+
num_processes=num_processes,
144+
gpus=get_gpu_devices(),
145+
accelerator="ddp",
141146
logger=logger,
142147
max_epochs=args.epochs,
143148
callbacks=callbacks,
@@ -150,7 +155,8 @@ def main(argv: List[str]) -> None:
150155
f"train acc: {model.train_acc.compute()}, val acc: {model.val_acc.compute()}"
151156
)
152157

153-
if not args.skip_export and args.output_path:
158+
rank = int(os.environ.get("RANK", 0))
159+
if rank == 0 and not args.skip_export and args.output_path:
154160
# Export the inference model
155161
export_inference_model(model, args.output_path, tmpdir)
156162

0 commit comments

Comments (0)