[dynamo][guards] Turn on profiling of guard manager (pytorch#145420)

anijain2305 · pytorchmergebot · commit 015c6d6fdb56 · 2025-01-23T18:17:43.000Z
Pull Request resolved: pytorch#145420 Approved by: https://github.com/ezyang ghstack dependencies: pytorch#145351
diff --git a/torch/_C/_dynamo/guards.pyi b/torch/_C/_dynamo/guards.pyi
@@ -158,6 +158,7 @@ def install_symbolic_shape_guard(
 def profile_guard_manager(
     guard_manager: GuardManager,
     f_locals: dict[str, Any],
+    n_iters: int,
 ) -> float: ...
 
 class TensorGuards:
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
@@ -2472,11 +2472,14 @@ def cleanup_builder(weak_b):
                     self.guard_manager, output_graph.local_scope
                 )
 
-            if guards_log.isEnabledFor(logging.DEBUG):
-                latency = profile_guard_manager(
-                    self.guard_manager.root, output_graph.local_scope
-                )
-                guards_log.debug("Guard eval latency = %s us", f"{latency:.2f}")
+            # NB for developers: n_iters is chosen to be 50 to achieve
+            # statistical significance.  If you are working on a guard
+            # optimization, it might be a good idea to increase this number for
+            # more stability during development.
+            latency = profile_guard_manager(
+                self.guard_manager.root, output_graph.local_scope, 50
+            )
+            guards_log.debug("Guard eval latency = %s us", f"{latency:.2f}")
 
         # NB - We have to very careful of cleaning up here. Because of the
         # invalidate function, we can create a weakref finalizer that keeps
diff --git a/torch/csrc/dynamo/guards.cpp b/torch/csrc/dynamo/guards.cpp
@@ -5060,36 +5060,26 @@ void install_storage_overlapping_guard(
       /* overlapping= */ false);
 }
 
-double profile_guard_manager(RootGuardManager* root, py::object f_locals) {
+double profile_guard_manager(
+    RootGuardManager* root,
+    py::object f_locals,
+    int n_iters) {
   PyObject* locals = f_locals.ptr();
 
   // Warmup
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 5; i++) {
     root->check_nopybind(locals);
   }
 
-  int count = 0;
   auto start = std::chrono::high_resolution_clock::now();
-  float profile_duration = 1.0;
-
-  // Run the loop for profile_duration seconds
-  while (true) {
+  for (int i = 0; i < n_iters; i++) {
     root->check_nopybind(locals);
-    count++;
-    auto end = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed = end - start;
-
-    // Break the loop if 1 second has passed
-    if (elapsed.count() >= 1.0) {
-      break;
-    }
   }
-
   auto end = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double> total_elapsed = end - start;
 
   // Calculate the average time per iteration in microseconds
-  return (total_elapsed.count() * profile_duration * 1e6) / count;
+  return (total_elapsed.count() * 1e6) / n_iters;
 }
 
 } // namespace