Commit a589968

add profile plugin (#2683)
1 parent 530ba7f commit a589968

7 files changed: +86 additions, -24 deletions

thunder/dev_utils/profile_transform.py

Lines changed: 28 additions & 5 deletions
@@ -1,6 +1,7 @@
 import thunder
 import torch
 import contextlib
+import re
 
 from thunder.core import prims
 from thunder.core.symbol import Symbol
@@ -20,15 +21,22 @@ def bind_postprocess(debug_bsym):
 
 
 class ProfileTransform(thunder.core.transform_common.Transform):
-    def __init__(self, *, warmup_runs=3, number_runs=1, start_idx=0, end_idx=-1, backward=False):
-        self.start_idx = start_idx
-        self.end_idx = end_idx
+    def __init__(self, *, warmup_runs=3, number_runs=1, start_idx=0, end_idx=None, input_match=None, backward=False):
+        self.input_match = input_match
+        if input_match is None:
+            self.start_idx = start_idx
+            self.end_idx = end_idx if end_idx is not None else -1
+        else:
+            self.match_start_idx = start_idx
+            self.match_end_idx = end_idx if end_idx is not None else 1
+
         self.enabled = True
         self.run_counter = 0
         self.warmup_runs = warmup_runs
         self.number_runs = number_runs
         self.prof = None
         self.backward = backward
+        self.computed_enabled = False
 
     def start_profile(self):
         self.run_counter += 1
@@ -94,13 +102,27 @@ def transform_trace_post_optimization(self, computation_trace, **kwargs):
         if self.backward ^ (TraceTag.BACKWARD in computation_trace.tags):
             return computation_trace
 
+        if self.input_match is not None:
+            self.match_list = []
+            for i, bsym in enumerate(computation_trace.bound_symbols):
+                for a in bsym.args:
+                    if isinstance(a, thunder.TensorProxy) and re.match(self.input_match, a.name):
+                        self.match_list.append((i, a.name))
+            start_idx = self.match_list[self.match_start_idx][0]
+            end_idx = self.match_list[self.match_end_idx][0]
+        else:
+            start_idx = self.start_idx
+            end_idx = self.end_idx
+
         new_bound_symbols = []
 
         new_trace = thunder.core.trace.from_trace(computation_trace)
-        start_idx = self.start_idx
+
+        need_end = False
 
         for i, bsym in enumerate(computation_trace.bound_symbols[:]):
-            if i == self.end_idx or bsym.sym == prims.python_return:
+            if i == end_idx or (bsym.sym == prims.python_return and need_end):
+                need_end = False
                 new_bound_symbols.append(create_boundsymbol("end_profiling", None, self.end_profile))
             if bsym.sym in {
                 prims.unpack_trivial,
@@ -115,6 +137,7 @@ def transform_trace_post_optimization(self, computation_trace, **kwargs):
                 start_idx += 1
                 continue
             if i == start_idx:
+                need_end = True
                 new_bound_symbols.append(create_boundsymbol("start_profiling", None, self.start_profile))
                 new_bound_symbols.append(
                     create_boundsymbol(
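
The new input_match path selects the profiled region by regex-matching tensor-proxy names among the bound symbols' inputs and then taking the match_start_idx-th and match_end_idx-th hits as the start and end bound-symbol indices. A minimal standalone sketch of that selection logic, using a hypothetical list of (bound-symbol index, input name) pairs rather than Thunder's real trace objects:

import re

def select_profile_region(bsym_inputs, input_match, match_start_idx=0, match_end_idx=1):
    # keep the positions of bound symbols whose input tensor name matches the regex
    matches = [(i, name) for i, name in bsym_inputs if re.match(input_match, name)]
    # the profiled region runs from the match_start_idx-th hit to the match_end_idx-th hit
    return matches[match_start_idx][0], matches[match_end_idx][0]

# hypothetical proxy names, only for illustration
bsym_inputs = [(0, "x"), (3, "t_attn_0"), (7, "t_mlp_0"), (11, "t_attn_1")]
print(select_profile_region(bsym_inputs, r"t_attn"))  # -> (3, 11)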

thunder/plugins/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 from thunder.plugins.distributed import DDP, FSDP
 from thunder.plugins.quantization import QuantizeInt4
 from thunder.plugins.fp8 import FP8
+from thunder.plugins.profile import Profile as Profile
 from thunder.plugins.reduce_overhead import ReduceOverhead
 
 names_to_plugins = {

thunder/plugins/profile.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+from thunder.core.recipe import Plugin, PluginPolicy
+from thunder.dev_utils.profile_transform import ProfileTransform
+
+
+class Profile(Plugin):
+    policy = PluginPolicy.POST
+
+    def __init__(self, input_match, from_match_idx=0, to_match_idx=1):
+        self.profile_transform = ProfileTransform(
+            input_match=input_match, start_idx=from_match_idx, end_idx=to_match_idx
+        )
+
+    def setup_transforms(self):
+        return [self.profile_transform]
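
A sketch of how the plugin could be used through thunder.compile; the toy model, input, and the proxy-name pattern "t" are assumptions for illustration, since the actual proxy names depend on the traced program:

import torch
import thunder
from thunder.plugins import Profile

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64))
x = torch.randn(64, 64)

# input_match is a regex over tensor-proxy names in the optimized trace;
# from_match_idx/to_match_idx pick which matches delimit the profiled region.
thunder_model = thunder.compile(model, plugins=[Profile(input_match=r"t", from_match_idx=0, to_match_idx=1)])
out = thunder_model(x)

Since ProfileTransform defaults to warmup_runs=3, the profiler may only engage after a few calls of the compiled module.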

thunder/recipes/base.py

Lines changed: 5 additions & 7 deletions
@@ -78,13 +78,11 @@ def __init__(
         self.fuser = fuser
         self.executor_names = []
 
-        if torch.cuda.is_available():
-            self.executor_names = ["cudnn", "sdpa"]
-            if self.fuser == "nvfuser":
-                self.executor_names.append("torchcompile_xentropy")
-        else:
-            print("GPU not found, nvFuser not available. Setting fusing executor to torch.compile")
-            self.fuser = "torch.compile"
+        if fuser is None:
+            if torch.cuda.is_available():
+                self.fuser = "nvfuser"
+            else:
+                self.fuser = "torch.compile"
 
         self.setup_fuser()
         self.show_progress = show_progress
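
A minimal restatement of the new default-fuser selection in BaseRecipe.__init__, as a standalone helper (the helper name is illustrative, not part of the recipe API):

import torch

def default_fuser(fuser=None):
    # an explicit choice is respected; otherwise prefer nvFuser on CUDA machines
    # and fall back to torch.compile elsewhere
    if fuser is None:
        return "nvfuser" if torch.cuda.is_available() else "torch.compile"
    return fuser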

thunder/recipes/hf_transformers.py

Lines changed: 23 additions & 0 deletions
@@ -10,6 +10,19 @@
 from thunder import Recipe
 
 
+# for materializing models, we need reset_parameters, which is part of the unwritten
+# spec for idiomatic PyTorch, but not implemented everywhere
+def RotaryEmbedding_reset_parameters(self):
+    inv_freq, self.attention_scaling = self.rope_init_fn(self.config, self.inv_freq.device)
+    with torch.no_grad():
+        self.inv_freq.copy_(inv_freq)
+
+
+def RMSNorm_reset_parameters(self):
+    with torch.no_grad():
+        self.weight.fill_(1)
+
+
 class InplaceIndexCopyTransform(thunder.Transform):
     def __init__(self):
         super().__init__()
@@ -308,6 +321,16 @@ def apply(self, model):
             transformers.PreTrainedModel: Thunder-compiled model ready
             for inference.
         """
+
+        # We need reset_parameters for initialization of buffers in materialization.
+        # This seems to work for transformers 4.5x with Llama, Llama4 and Qwen2 at least
+        for submodule in model.modules():
+            cls = submodule.__class__
+            if cls.__name__.endswith("RotaryEmbedding") and not hasattr(cls, "reset_parameters"):
+                cls.reset_parameters = RotaryEmbedding_reset_parameters
+            elif cls.__name__.endswith("RMSNorm") and not hasattr(cls, "reset_parameters"):
+                cls.reset_parameters = RMSNorm_reset_parameters
+
         thunder_model = super().apply(model)
 
         if getattr(thunder_model, "generate", None):
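
A small self-contained sketch of the class-level patching pattern used above, applied to a toy module rather than the actual transformers classes (ToyRMSNorm is purely illustrative):

import torch

class ToyRMSNorm(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(dim))

def RMSNorm_reset_parameters(self):
    with torch.no_grad():
        self.weight.fill_(1)

model = torch.nn.Sequential(ToyRMSNorm(8))
for submodule in model.modules():
    cls = submodule.__class__
    # patch the class (not the instance) so materialization can call reset_parameters
    if cls.__name__.endswith("RMSNorm") and not hasattr(cls, "reset_parameters"):
        cls.reset_parameters = RMSNorm_reset_parameters

model[0].reset_parameters()  # the weight is now initialized to ones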

thunder/tests/test_recipes.py

Lines changed: 8 additions & 12 deletions
@@ -11,11 +11,13 @@
 from torch.testing import assert_close
 from thunder.recipes import HFTransformers
 from thunder.executors import nvfuser_available
-from thunder.executors.cudnnex import cudnn_available
 from thunder.tests.framework import IS_WINDOWS
 
 
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
+def get_expected_executors():
+    return [ex for ex in thunder.get_default_executors() if ex.name not in {"cudnn", "sdpa", "torchcompile_xentropy"}]
+
+
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 @pytest.mark.skipif(IS_WINDOWS, reason="slow on Windows")
 def test_default_recipe_basic_bert():
@@ -33,7 +35,6 @@ def test_default_recipe_basic_bert():
     assert_close(actual, expected)
 
 
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 @pytest.mark.skipif(IS_WINDOWS, reason="slow on Windows")
 def test_recipe_basic_bert():
@@ -65,7 +66,6 @@ def test_recipe_basic_bert():
     deregister_executor("sdpa_mask_transform_ex")
 
 
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 def test_recipe_basic_bert_fx():
     bert = transformers.BertForSequenceClassification(transformers.BertConfig())
@@ -88,7 +88,6 @@ def test_recipe_basic_bert_fx():
     deregister_executor("sdpa_mask_transform_ex")
 
 
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 @pytest.mark.parametrize(
     "model_cls, config_cls",
@@ -186,7 +185,6 @@ def __init__(self):
     deregister_executor("sdpa_mask_transform_ex")
 
 
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 def test_plugins_basics():
     model = torch.nn.Sequential(torch.nn.Linear(2048, 4096), torch.nn.ReLU(), torch.nn.Linear(4096, 64))
@@ -198,12 +196,11 @@ def test_plugins_basics():
     _ = thunder_model(x)
     cd = get_compile_data(thunder_model)
     assert cd is not None
-    for ex in thunder.get_default_executors():
+    for ex in get_expected_executors():
         assert ex.name in [el.name for el in cd.executors_list]
 
 
 # test skipped if nvfuser isn't available because providing plugins calls BaseRecipe
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 @pytest.mark.skipif(IS_WINDOWS, reason="libuv error with PT build on windows")
 def test_plugins_composition(monkeypatch):
@@ -215,21 +212,21 @@ def test_plugins_composition(monkeypatch):
     _ = thunder.compile(model, plugins="fp8")
     call_args = mock_jit.call_args
     assert "transformer_engine_v1" in [el.name for el in call_args.kwargs["executors"]]
-    for ex in thunder.get_default_executors():
+    for ex in get_expected_executors():
         assert ex.name in [el.name for el in call_args.kwargs["executors"]]
 
     _ = thunder.compile(model, plugins=["fp8"])
     call_args = mock_jit.call_args
     assert "transformer_engine_v1" in [el.name for el in call_args.kwargs["executors"]]
-    for ex in thunder.get_default_executors():
+    for ex in get_expected_executors():
         assert ex.name in [el.name for el in call_args.kwargs["executors"]]
 
     from thunder.plugins import FP8
 
     _ = thunder.compile(model, plugins=[FP8()])
     call_args = mock_jit.call_args
     assert "transformer_engine_v1" in [el.name for el in call_args.kwargs["executors"]]
-    for ex in thunder.get_default_executors():
+    for ex in get_expected_executors():
         assert ex.name in [el.name for el in call_args.kwargs["executors"]]
 
     if not torch.distributed.is_initialized():
@@ -259,7 +256,6 @@ def test_plugins_composition(monkeypatch):
     assert "transformer_engine_v1" in [el.name for el in call_args.kwargs["executors"]]
 
 
-@pytest.mark.skipif(not cudnn_available(), reason="cuDNN is not available")
 @pytest.mark.skipif(not nvfuser_available(), reason="nvFuser is not available")
 @pytest.mark.skipif(IS_WINDOWS, reason="libuv error with PT build on windows")
 def test_plugins_hybrid_ddpfsdp(monkeypatch):

thunder/tests/test_transformer_engine_executor.py

Lines changed: 7 additions & 0 deletions
@@ -718,7 +718,13 @@ def thunder_model(x):
 
 
 @requiresCUDA
+@pytest.mark.skipif(
+    LooseVersion(transformer_engine.__version__) < LooseVersion("2.9"),
+    reason="need TE >= 2.9 for quantizer location",
+)
 def test_te_inference_8bit():
+    from thunder.transforms.te_inference import TEInference8BitTransform
+
     with torch.device("cuda"):
         m = torch.nn.Sequential(
             torch.nn.Linear(1024, 2048),
@@ -733,6 +739,7 @@ def test_te_inference_8bit():
     a = torch.randn(16, 1024, device="cuda")
 
     quant_transform = TEInference8BitTransform()
+    te_inference_executor = quant_transform.get_executor()
     quant_transform2 = TEInference8BitTransform()
     jm = thunder.jit(
         m, transforms=[quant_transform], executors=(te_inference_executor, *thunder.get_default_executors())
