Commit 353223e

JKSenthil authored and facebook-github-bot committed
support recursive torch compile (#999)
Summary:
Pull Request resolved: #999

# Context
Applying torch compile recursively on submodules (rather than once at the top-level module) is a common pattern, especially when targeting llama architectures where only the self-attention layer(s) should be compiled.

# This Diff
Adds a `recursive_module_types` flag to TorchCompileParams. torch compile is then applied recursively to any submodule whose class name or type matches an entry in the list.

Reviewed By: galrotem

Differential Revision: D74410717

fbshipit-source-id: 319d15a109f132a216915d200bbdd04dd2c35871
1 parent 849d6c4 commit 353223e
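To make the summary concrete, here is a minimal usage sketch based only on the API shown in the diff below; ToyModel and SelfAttention are illustrative stand-ins and not part of this change:

import torch

from torchtnt.utils.prepare_module import TorchCompileParams, apply_torch_compile


class SelfAttention(torch.nn.Module):
    # Illustrative stand-in for a llama-style attention layer.
    def __init__(self) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)


class ToyModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.attn = SelfAttention()
        self.mlp = torch.nn.Linear(16, 16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.mlp(self.attn(x))


model = ToyModel()

# Entries may be class objects or class-name strings (e.g. "SelfAttention");
# matching submodules are compiled in place instead of compiling the
# top-level module once.
params = TorchCompileParams(recursive_module_types=[SelfAttention])
apply_torch_compile(model, params)

Because apply_torch_compile mutates the module in place, the compiled model is used exactly as before.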

File tree

2 files changed: +101 -4 lines changed


tests/utils/test_prepare_module.py

Lines changed: 65 additions & 0 deletions
@@ -17,6 +17,7 @@
 from torchtnt.utils.env import init_from_env
 from torchtnt.utils.prepare_module import (
     _check_and_convert_mp_policy_dtypes,
+    apply_torch_compile,
     DDPStrategy,
     FSDPStrategy,
     materialize_meta_params,
@@ -266,3 +267,67 @@ def test_check_and_convert_mp_policy_dtypes(self) -> None:
             "MixedPrecisionPolicy requires all dtypes to be torch.dtype.",
         ):
             _check_and_convert_mp_policy_dtypes(invalid_mp_policy)
+
+    def test_apply_torch_compile_recursive_module_types(self) -> None:
+        """
+        Test that recursive_module_types is applied correctly.
+        """
+
+        # Create a mock module with submodules
+        class B(torch.nn.Module):
+            def forward(self, x):
+                return x
+
+        class C(torch.nn.Module):
+            def forward(self, x):
+                return x
+
+        class A(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.b = B()
+                self.c = C()
+
+            def forward(self, x):
+                x = self.b(x)
+                x = self.c(x)
+                return x
+
+        module = A()
+
+        # Mock the torch.compile function
+        with patch("torch.compile", return_value=None) as mock_compile:
+            # Define TorchCompileParams with recursive_module_types
+            torch_compile_params = TorchCompileParams(
+                fullgraph=False,
+                dynamic=False,
+                backend="inductor",
+                mode=None,
+                options=None,
+                disable=False,
+                recursive_module_types=[B, "C"],
+            )
+
+            # Apply torch compile
+            apply_torch_compile(module, torch_compile_params)
+
+            # Check that torch.compile was called on B and C
+            self.assertEqual(mock_compile.call_count, 2)
+            mock_compile.assert_any_call(
+                module.b._call_impl,
+                fullgraph=False,
+                dynamic=False,
+                backend="inductor",
+                mode=None,
+                options=None,
+                disable=False,
+            )
+            mock_compile.assert_any_call(
+                module.c._call_impl,
+                fullgraph=False,
+                dynamic=False,
+                backend="inductor",
+                mode=None,
+                options=None,
+                disable=False,
+            )

torchtnt/utils/prepare_module.py

Lines changed: 36 additions & 4 deletions
@@ -7,12 +7,13 @@
 # pyre-strict

 import logging
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from functools import partial
 from typing import (
     Any,
     Callable,
     cast,
+    Collection,
     ContextManager,
     Dict,
     Iterable,
@@ -231,6 +232,10 @@ class FSDP2Strategy(Strategy):
 class TorchCompileParams:
     """
     Dataclass to store parameters for torch compile. See https://pytorch.org/docs/stable/generated/torch.compile.html for details.
+
+    TNT specific args:
+        recursive_module_types: list of module types to recursively compile. If not specified, applies compile to top-level module only.
+            ex. ["TransformerCrossAttentionLayer", torch.nn.Linear] both work
     """

     fullgraph: bool = False
@@ -241,6 +246,11 @@ class TorchCompileParams:
     options: Optional[Dict[str, Union[str, int, bool]]] = None
     disable: bool = False

+    # TNT specific params
+    recursive_module_types: Collection[Union[str, Type[torch.nn.Module]]] = field(
+        default_factory=list
+    )
+

 @dataclass
 class ActivationCheckpointParams:
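Following the docstring example above, the new field accepts class-name strings and nn.Module subclasses in the same collection. A small sketch (TransformerCrossAttentionLayer is only the name quoted in the docstring, not a class defined here):

import torch

from torchtnt.utils.prepare_module import TorchCompileParams

# Mirrors the docstring example: a class-name string and a module type in one list.
params = TorchCompileParams(
    recursive_module_types=["TransformerCrossAttentionLayer", torch.nn.Linear],
)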
@@ -478,16 +488,38 @@ def apply_torch_compile(
     torch_compile_params: TorchCompileParams,
 ) -> None:
     """
-    Applies torch.compile in-place.
+    Applies torch.compile in-place on a given module.

     Args:
         module: module to apply torch.compile on
         torch_compile_params: params to configure the torch.compile
     """
-
+    recursive_module_types = torch_compile_params.recursive_module_types
+    params_dict = asdict(torch_compile_params)
+    # remove recursive_module_types from the params dict, since only the remaining params are passed to torch.compile
+    params_dict.pop("recursive_module_types")
     try:
         # use in-place compile to avoid altering the state_dict keys
-        module.compile(**asdict(torch_compile_params))
+
+        if len(recursive_module_types) == 0:
+            # compile only the top-level module
+            module.compile(**params_dict)
+        else:
+            # compile submodules recursively based on recursive_module_types
+
+            # 1) separate str and torch.nn.Module types from recursive_module_types
+            module_names: Set[str] = set()
+            module_types: Tuple[Type[torch.nn.Module], ...] = ()
+            for v in recursive_module_types:
+                if isinstance(v, str):
+                    module_names.add(v)
+                else:
+                    module_types = module_types + (v,)
+
+            # 2) apply torch.compile recursively
+            for m in reversed(list(module.modules())):
+                if isinstance(m, module_types) or type(m).__name__ in module_names:
+                    m.compile(**params_dict)
     except AttributeError:
         rank_zero_warn(
             "Please install PyTorch nightlies to use in-place compile to avoid altering the state_dict keys when checkpointing. Skipping torch compile."
