Skip to content

Commit 28796f7

Browse files
xmfanpytorchmergebot
authored and committed
Redo D75092426: [internal] Expose additional metadata to compilation callbacks (pytorch#155063)
Originally pytorch#153596 --------------- Summary: via reverting D75708685 gate the ROCm failure Test Plan: Unit tests in OSS, sandcastle Rollback Plan: Differential Revision: D75894349 Pull Request resolved: pytorch#155063 Approved by: https://github.com/masnesral
1 parent 72453a6 commit 28796f7

File tree

10 files changed

+210
-71
lines changed

10 files changed

+210
-71
lines changed

test/dynamo/test_callback.py

Lines changed: 100 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
# Owner(s): ["module: dynamo"]
22

3+
import unittest
34
from unittest.mock import Mock
45

5-
from torch._dynamo.callback import callback_handler
6+
import torch
7+
from torch._dynamo.callback import callback_handler, CallbackArgs, CallbackTrigger
68
from torch._dynamo.test_case import run_tests, TestCase
9+
from torch._guards import CompileId
10+
from torch.testing._internal.common_utils import TEST_WITH_ROCM
11+
from torch.testing._internal.inductor_utils import HAS_CUDA
712

813

914
class CallbackTests(TestCase):
@@ -15,16 +20,22 @@ def setUp(self) -> None:
1520
callback_handler.register_end_callback(self._on_compile_end)
1621

1722
def tearDown(self) -> None:
18-
return super().tearDown()
1923
callback_handler.clear()
24+
return super().tearDown()
2025

2126
def test_callbacks_with_duplicate_prevention(self) -> None:
22-
with callback_handler.install_callbacks(), callback_handler.install_callbacks():
27+
trigger = CallbackTrigger.DYNAMO
28+
compile_id = CompileId(0, 0)
29+
with callback_handler.install_callbacks(
30+
trigger, compile_id
31+
), callback_handler.install_callbacks(trigger, compile_id):
2332
self._on_compile_start.assert_called_once()
2433
self._on_compile_end.assert_called_once()
2534

2635
def test_counter(self) -> None:
27-
with callback_handler.install_callbacks():
36+
trigger = CallbackTrigger.DYNAMO
37+
compile_id = CompileId(0, 0)
38+
with callback_handler.install_callbacks(trigger, compile_id):
2839
self.assertEqual(
2940
callback_handler._CompilationCallbackHandler__pending_callbacks_counter,
3041
1,
@@ -35,18 +46,95 @@ def test_counter(self) -> None:
3546

3647
def test_counter_assertion(self) -> None:
3748
callback_handler._CompilationCallbackHandler__pending_callbacks_counter -= 1
49+
with self.assertRaisesRegex(
50+
AssertionError, "Pending callbacks counter cannot become negative."
51+
):
52+
trigger = CallbackTrigger.DYNAMO
53+
compile_id = CompileId(0, 0)
54+
with callback_handler.install_callbacks(trigger, str(compile_id)):
55+
pass
56+
self.assertEqual(
57+
callback_handler._CompilationCallbackHandler__pending_callbacks_counter, 0
58+
)
59+
60+
@unittest.skipIf(
61+
TEST_WITH_ROCM, "ROCm outputs a different number of autotuning logs"
62+
)
63+
@unittest.skipIf(not HAS_CUDA, "requires triton")
64+
@torch._inductor.config.patch(force_disable_caches=True)
65+
def test_triggers(self) -> None:
66+
torch._dynamo.reset()
67+
order = []
68+
69+
def on_start(args: CallbackArgs):
70+
nonlocal order
71+
order.append(f"start={args}")
3872

39-
with self.assertRaises(
40-
AssertionError
41-
) as e, callback_handler.install_callbacks():
42-
pass
73+
def on_end(args: CallbackArgs):
74+
nonlocal order
75+
order.append(f"end={args}")
76+
77+
torch._dynamo.callback.on_compile_start(on_start)
78+
torch._dynamo.callback.on_compile_start(on_end)
79+
80+
class TinyModel(torch.nn.Module):
81+
def __init__(self):
82+
super().__init__()
83+
self.fc1 = torch.nn.Linear(10, 10)
84+
self.relu = torch.nn.ReLU()
85+
self.fc2 = torch.nn.Linear(10, 10)
86+
87+
def forward(self, x):
88+
temp = self.fc1(x)
89+
temp = self.relu(temp)
90+
torch._dynamo.graph_break()
91+
return self.fc2(temp)
92+
93+
model = TinyModel().to("cuda")
94+
compiled_model = torch.compile(model, mode="max-autotune")
95+
x = torch.randn(10, 10, device="cuda")
96+
97+
loss = compiled_model(x).sum()
98+
loss.backward()
99+
self.assertExpectedInline(
100+
"\n".join(order),
101+
"""\
102+
start=CallbackArgs(callback_trigger=<CallbackTrigger.DYNAMO: 1>, compile_id='0/0')
103+
end=CallbackArgs(callback_trigger=<CallbackTrigger.DYNAMO: 1>, compile_id='0/0')
104+
start=CallbackArgs(callback_trigger=<CallbackTrigger.DYNAMO: 1>, compile_id='1/0')
105+
end=CallbackArgs(callback_trigger=<CallbackTrigger.DYNAMO: 1>, compile_id='1/0')
106+
start=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id='1/0')
107+
end=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id='1/0')
108+
start=CallbackArgs(callback_trigger=<CallbackTrigger.TRITON_AUTOTUNING: 3>, compile_id='1/0')
109+
end=CallbackArgs(callback_trigger=<CallbackTrigger.TRITON_AUTOTUNING: 3>, compile_id='1/0')
110+
start=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id='0/0')
111+
end=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id='0/0')
112+
start=CallbackArgs(callback_trigger=<CallbackTrigger.TRITON_AUTOTUNING: 3>, compile_id='0/0')
113+
end=CallbackArgs(callback_trigger=<CallbackTrigger.TRITON_AUTOTUNING: 3>, compile_id='0/0')""", # noqa: B950
114+
)
115+
order.clear()
43116

44-
self.assertIn(
45-
"Pending callbacks counter cannot become negative.",
46-
str(e.exception),
117+
compiled_model.zero_grad()
118+
loss = compiled_model(x).sum()
119+
loss.backward()
120+
self.assertExpectedInline(
121+
"\n".join(order),
122+
"""\
123+
start=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='0/0')
124+
end=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='0/0')
125+
start=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='1/0')
126+
end=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='1/0')
127+
start=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='1/0')
128+
end=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='1/0')
129+
start=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='0/0')
130+
end=CallbackArgs(callback_trigger=<CallbackTrigger.CUDAGRAPH_RECORDING: 4>, compile_id='0/0')""", # noqa: B950
47131
)
132+
order.clear()
48133

49-
callback_handler._CompilationCallbackHandler__pending_callbacks_counter += 1
134+
compiled_model.zero_grad()
135+
loss = compiled_model(x).sum()
136+
loss.backward()
137+
self.assertEqual(len(order), 0)
50138

51139

52140
if __name__ == "__main__":

test/dynamo/test_compile.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,11 @@ def test_compilation_callback(self):
8181
torch._dynamo.reset()
8282

8383
@torch._dynamo.on_compile_start
84-
def start_callback():
84+
def start_callback(_):
8585
print("Compilation started.")
8686

8787
@torch._dynamo.on_compile_end
88-
def end_callback():
88+
def end_callback(_):
8989
print("Compilation ended.")
9090

9191
mod = ToyModel()
@@ -116,13 +116,13 @@ def test_compilation_callback_with_graph_break(self):
116116
counter = 0
117117

118118
@torch._dynamo.on_compile_start
119-
def start_callback():
119+
def start_callback(_):
120120
nonlocal counter
121121
counter += 1
122122
print(f"Counter = {counter}")
123123

124124
@torch._dynamo.on_compile_end
125-
def end_callback():
125+
def end_callback(_):
126126
nonlocal counter
127127
counter += 1
128128
print(f"Counter = {counter}")

torch/_dynamo/callback.py

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,44 @@ def my_end_callback():
2525
print("Compilation complete")
2626
"""
2727

28+
import enum
2829
import threading
2930
from collections.abc import Generator
3031
from contextlib import contextmanager
3132
from dataclasses import dataclass, field # noqa: F811
3233
from typing import Any, Callable
3334

3435

36+
class CallbackTrigger(enum.Enum):
37+
# most common case, dynamo attempts to trace a new frame
38+
DYNAMO = 1
39+
# backward compilation can be deferred to runtime
40+
LAZY_BACKWARD = 2
41+
# some backends autotune at runtime
42+
TRITON_AUTOTUNING = 3
43+
# cudagraphs record at runtime
44+
CUDAGRAPH_RECORDING = 4
45+
46+
47+
@dataclass
48+
class CallbackArgs:
49+
callback_trigger: CallbackTrigger
50+
compile_id: str
51+
52+
3553
@dataclass
3654
class CompilationCallbackHandler:
37-
start_callbacks: list[Callable[[], None]] = field(default_factory=list)
38-
end_callbacks: list[Callable[[], None]] = field(default_factory=list)
55+
start_callbacks: list[Callable[[CallbackArgs], None]] = field(default_factory=list)
56+
end_callbacks: list[Callable[[CallbackArgs], None]] = field(default_factory=list)
3957

4058
__pending_callbacks_counter: int = field(default=0, init=False, repr=False)
4159
__pending_callbacks_counter_lock: threading.Lock = field(
4260
default_factory=threading.Lock, init=False, repr=False
4361
)
4462

4563
def register_start_callback(
46-
self, callback: Callable[[], None]
47-
) -> Callable[[], None]:
64+
self, callback: Callable[[CallbackArgs], None]
65+
) -> Callable[[CallbackArgs], None]:
4866
"""
4967
Register a callback function to be called when the compilation starts.
5068
@@ -54,7 +72,9 @@ def register_start_callback(
5472
self.start_callbacks.append(callback)
5573
return callback
5674

57-
def register_end_callback(self, callback: Callable[[], None]) -> Callable[[], None]:
75+
def register_end_callback(
76+
self, callback: Callable[[CallbackArgs], None]
77+
) -> Callable[[CallbackArgs], None]:
5878
"""
5979
Register a callback function to be called when the compilation ends.
6080
@@ -64,7 +84,7 @@ def register_end_callback(self, callback: Callable[[], None]) -> Callable[[], No
6484
self.end_callbacks.append(callback)
6585
return callback
6686

67-
def remove_start_callback(self, callback: Callable[[], None]) -> None:
87+
def remove_start_callback(self, callback: Callable[[CallbackArgs], None]) -> None:
6888
"""
6989
Remove a registered start callback function.
7090
@@ -73,7 +93,7 @@ def remove_start_callback(self, callback: Callable[[], None]) -> None:
7393
"""
7494
self.start_callbacks.remove(callback)
7595

76-
def remove_end_callback(self, callback: Callable[[], None]) -> None:
96+
def remove_end_callback(self, callback: Callable[[CallbackArgs], None]) -> None:
7797
"""
7898
Remove a registered end callback function.
7999
@@ -82,29 +102,32 @@ def remove_end_callback(self, callback: Callable[[], None]) -> None:
82102
"""
83103
self.end_callbacks.remove(callback)
84104

85-
def run_start_callbacks(self) -> None:
105+
def run_start_callbacks(self, args: CallbackArgs) -> None:
86106
"""
87107
Execute all registered start callbacks.
88108
"""
89109
for callback in self.start_callbacks:
90-
callback()
110+
callback(args)
91111

92-
def run_end_callbacks(self) -> None:
112+
def run_end_callbacks(self, args: CallbackArgs) -> None:
93113
"""
94114
Execute all registered end callbacks.
95115
"""
96116
for callback in self.end_callbacks:
97-
callback()
117+
callback(args)
98118

99119
@contextmanager
100-
def install_callbacks(self) -> Generator[None, Any, Any]:
120+
def install_callbacks(
121+
self, trigger: CallbackTrigger, compile_id: str
122+
) -> Generator[None, Any, Any]:
101123
"""
102124
Context manager to install the callbacks and run them when the context is exited.
103125
"""
126+
args = CallbackArgs(trigger, compile_id)
104127
try:
105128
with self.__pending_callbacks_counter_lock:
106129
if self.__pending_callbacks_counter == 0:
107-
self.run_start_callbacks()
130+
self.run_start_callbacks(args)
108131
self.__pending_callbacks_counter += 1
109132
yield
110133
finally:
@@ -113,7 +136,7 @@ def install_callbacks(self) -> Generator[None, Any, Any]:
113136
"Pending callbacks counter cannot become negative."
114137
)
115138
if self.__pending_callbacks_counter == 1:
116-
self.run_end_callbacks()
139+
self.run_end_callbacks(args)
117140
self.__pending_callbacks_counter -= 1
118141

119142
def clear(self) -> None:
@@ -122,20 +145,25 @@ def clear(self) -> None:
122145
"""
123146
self.start_callbacks.clear()
124147
self.end_callbacks.clear()
148+
assert self.__pending_callbacks_counter == 0
125149

126150

127151
callback_handler = CompilationCallbackHandler()
128152

129153

130-
def on_compile_start(callback: Callable[[], None]) -> Callable[[], None]:
154+
def on_compile_start(
155+
callback: Callable[[CallbackArgs], None],
156+
) -> Callable[[CallbackArgs], None]:
131157
"""
132158
Decorator to register a callback function for the start of the compilation.
133159
"""
134160
callback_handler.register_start_callback(callback)
135161
return callback
136162

137163

138-
def on_compile_end(callback: Callable[[], None]) -> Callable[[], None]:
164+
def on_compile_end(
165+
callback: Callable[[CallbackArgs], None],
166+
) -> Callable[[CallbackArgs], None]:
139167
"""
140168
Decorator to register a callback function for the end of the compilation.
141169
"""

torch/_dynamo/convert_frame.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import torch
4949
import torch._logging
5050
from torch._C._dynamo.guards import GlobalStateGuard
51+
from torch._dynamo.callback import CallbackTrigger
5152
from torch._dynamo.distributed import get_compile_pg
5253
from torch._dynamo.symbolic_convert import TensorifyState
5354
from torch._guards import compile_context, CompileContext, CompileId, tracing
@@ -774,7 +775,11 @@ def compile_inner(
774775
transform: Callable[[list[Instruction], dict[str, Any]], Any],
775776
) -> ConvertFrameReturn:
776777
with contextlib.ExitStack() as stack:
777-
stack.enter_context(torch._dynamo.callback_handler.install_callbacks())
778+
stack.enter_context(
779+
torch._dynamo.callback_handler.install_callbacks(
780+
CallbackTrigger.DYNAMO, str(CompileContext.current_compile_id())
781+
)
782+
)
778783
stack.enter_context(CompileTimeInstructionCounter.record())
779784
return _compile_inner(code, one_graph, hooks, transform)
780785

torch/_functorch/_aot_autograd/runtime_wrappers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import torch
2121
import torch.utils.dlpack
2222
from torch import Tensor
23+
from torch._dynamo.callback import callback_handler, CallbackTrigger
2324
from torch._dynamo.utils import CompileEventLogger, dynamo_timed, get_metrics_context
2425
from torch._guards import (
2526
compile_context,
@@ -2290,6 +2291,9 @@ def _backward_impl(ctx, all_args):
22902291
dynamo_compile_column_us="backward_cumulative_compile_time_us",
22912292
log_waitcounter=True,
22922293
waitcounter_name_override="entire_backward_compile",
2294+
), callback_handler.install_callbacks(
2295+
CallbackTrigger.LAZY_BACKWARD,
2296+
str(CompileContext.current_compile_id()),
22932297
):
22942298
CompileEventLogger.compilation_metric(is_forward=False)
22952299
# See Note: [Backward graph lazy lowering]

torch/_inductor/async_compile.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ def reload_kernel_in_parent():
346346
else:
347347
return future.result()
348348

349+
# Cache miss
349350
if is_parallel:
350351
# We want to support changing these env vars after (and while) the
351352
# process pool is running, so pass them to the subprocess to reset.

torch/_inductor/compile_fx.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,6 @@ def compile_fx_inner(
707707
dynamo_compile_column_us="inductor_cumulative_compile_time_us",
708708
)
709709
)
710-
stack.enter_context(torch._dynamo.callback_handler.install_callbacks())
711710
stack.enter_context(with_fresh_cache_if_config())
712711
stack.enter_context(DebugContext())
713712
CompileEventLogger.pt2_compile(

0 commit comments

Comments
 (0)