
Commit 87f01ca

Break hpu graphs into two for better performance (#14656)

Authored by jerome-habana with pre-commit-ci[bot], Borda, and kaushikb11, and committed.

Signed-off-by: Jerome <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Kaushik B <[email protected]>

1 parent: 39f899a

File tree: 3 files changed (+38, -8 lines)


src/pytorch_lightning/CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed
 
 - Fixed the availability check for the neptune-client package ([#14714](https://github.com/Lightning-AI/lightning/pull/14714))
+- Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656))
 
 
 ## [1.7.6] - 2022-09-13
```
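For context: in Habana's lazy execution mode, operations accumulate into a graph until `htcore.mark_step()` is called, at which point the graph is compiled and executed. Previously the HPU strategies called `mark_step()` once per training step; this commit splits each step into two graphs, one for forward + backward and one for the optimizer update. Below is a minimal sketch (not from this commit) of what that split means in a raw PyTorch loop; `loader` is a hypothetical DataLoader of `(x, y)` batches, and the model and optimizer are placeholders.

```python
# Minimal sketch of the two-graph split in raw Habana lazy mode.
# Assumes an HPU device is available; `loader` is hypothetical.
import torch
import habana_frameworks.torch.core as htcore

device = torch.device("hpu")
net = torch.nn.Linear(10, 2).to(device)
loss_fn = torch.nn.CrossEntropyLoss()
opt = torch.optim.SGD(net.parameters(), lr=0.1)

for x, y in loader:
    x, y = x.to(device), y.to(device)
    opt.zero_grad()
    loss = loss_fn(net(x), y)
    loss.backward()
    htcore.mark_step()  # graph 1: forward + backward
    opt.step()
    htcore.mark_step()  # graph 2: optimizer update
```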

src/pytorch_lightning/strategies/hpu_parallel.py

Lines changed: 18 additions & 4 deletions

```diff
@@ -13,9 +13,11 @@
 # limitations under the License.
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch.distributed
+from torch.nn import Module
+from torch.optim.optimizer import Optimizer
 
 import pytorch_lightning as pl
 from pytorch_lightning.overrides import LightningDistributedModule
@@ -137,10 +139,22 @@ def broadcast(self, obj: object, src: int = 0) -> object:  # type: ignore
         broadcast_object_list(obj, src, group=_group.WORLD)
         return obj[0]
 
-    def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
-        # Break lazy accumulation of graph after every step
+    def on_after_backward(self) -> None:
+        # Break lazy accumulation of graph after fwd+bwd
         htcore.mark_step()
-        return step_output
+
+    def optimizer_step(
+        self,
+        optimizer: Optimizer,
+        opt_idx: int,
+        closure: Callable[[], Any],
+        model: Optional[Union["pl.LightningModule", Module]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
+        # Break lazy accumulation of graph after optimizer
+        htcore.mark_step()
+        return optimizer_output
 
     def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
```
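In Lightning's hook order, `on_after_backward` runs right after `loss.backward()` and before the strategy's `optimizer_step`, so the two `mark_step()` calls bracket exactly the forward + backward graph and the optimizer graph. A hedged usage sketch for the parallel strategy follows; `LitModel` and `dm` are hypothetical stand-ins for a LightningModule and a datamodule.

```python
# Sketch: multi-device HPU training with PyTorch Lightning 1.7.x,
# using the strategy modified in this diff.
import pytorch_lightning as pl
from pytorch_lightning.strategies import HPUParallelStrategy

trainer = pl.Trainer(
    accelerator="hpu",
    devices=8,  # one process per HPU
    strategy=HPUParallelStrategy(),
)
trainer.fit(LitModel(), datamodule=dm)  # LitModel and dm are hypothetical
```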

src/pytorch_lightning/strategies/single_hpu.py

Lines changed: 19 additions & 4 deletions

```diff
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union
+
+from torch.nn import Module
+from torch.optim.optimizer import Optimizer
 
 import pytorch_lightning as pl
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
@@ -78,10 +81,22 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
     def model_to_device(self) -> None:
         self.model.to(self.root_device)  # type: ignore
 
-    def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
-        # Break lazy accumulation of graph after every step
+    def on_after_backward(self) -> None:
+        # Break lazy accumulation of graph after fwd+bwd
         htcore.mark_step()
-        return step_output
+
+    def optimizer_step(
+        self,
+        optimizer: Optimizer,
+        opt_idx: int,
+        closure: Callable[[], Any],
+        model: Optional[Union["pl.LightningModule", Module]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
+        # Break lazy accumulation of graph after optimizer
+        htcore.mark_step()
+        return optimizer_output
 
     def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
```
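The single-device strategy receives the identical split. As a hedged sketch, with `accelerator="hpu"` and one device the Trainer resolves to `SingleHPUStrategy`, so the hooks above run once per batch in the order shown in the comments; `LitModel` is again hypothetical.

```python
# Sketch: single-HPU training; Trainer picks SingleHPUStrategy
# automatically for accelerator="hpu" with one device.
import pytorch_lightning as pl

trainer = pl.Trainer(accelerator="hpu", devices=1, max_epochs=1)
trainer.fit(LitModel())  # LitModel is a hypothetical LightningModule
# Per batch: forward -> backward -> on_after_backward (mark_step #1)
#            -> optimizer_step (mark_step #2)
```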
