Move measure_performance from graph_net_bench/torch/eval_backend_perf.py to graph_net_bench/torch/util/timing.py

roll-away · roll-away · commit 61dd8e077772 · 2026-01-28T05:15:07.000Z
diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py
@@ -9,7 +9,7 @@
 import types
 from graph_net_bench import test_compiler_util
 from graph_net_bench import path_utils
-from .util.eval_backend_perf import eval_single_model_with_single_backend
+from .eval_backend_perf import eval_single_model_with_single_backend
 
 
 def compare_correctness(expected_out, compiled_out, args):
diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py
@@ -15,6 +15,7 @@
 from contextlib import redirect_stdout, redirect_stderr
 from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend
 from graph_net_bench import test_compiler_util
+from .util.timing import measure_performance
 
 
 def register_op_lib(op_lib):
@@ -129,69 +130,6 @@ def get_input_dict(args):
     }
 
 
-def measure_performance(model_call, args, compiler):
-    stats = {}
-    outs = model_call()
-
-    # Warmup runs
-    for _ in range(args.warmup):
-        model_call()
-    compiler.synchronize()
-
-    print(
-        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
-        file=sys.stderr,
-        flush=True,
-    )
-
-    if "cuda" in args.device:
-        torch.cuda.empty_cache()
-        e2e_times = []
-        gpu_times = []
-
-        for i in range(args.trials):
-            # End-to-end timing (naive_timer)
-            duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
-                # GPU-only timing (CUDA Events)
-                start_event = torch.cuda.Event(enable_timing=True)
-                end_event = torch.cuda.Event(enable_timing=True)
-                start_event.record()
-
-                model_call()
-
-                end_event.record()
-                compiler.synchronize()
-
-            gpu_time_ms = start_event.elapsed_time(end_event)
-            e2e_times.append(duration_box.value)
-            gpu_times.append(gpu_time_ms)
-            print(
-                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms",
-                file=sys.stderr,
-                flush=True,
-            )
-
-        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
-        stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times)
-
-    else:  # CPU or other devices
-        e2e_times = []
-        for i in range(args.trials):
-            duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
-                model_call()
-            print(
-                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms",
-                file=sys.stderr,
-                flush=True,
-            )
-            e2e_times.append(duration_box.value)
-        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
-
-    return outs, stats
-
-
 def eval_single_model_with_single_backend(args):
     check_and_complete_args(args)
     set_seed(args.seed)
diff --git a/graph_net_bench/torch/util/eval_backend_perf.py b/graph_net_bench/torch/util/eval_backend_perf.py