@@ -85,85 +85,6 @@ def naive_timer(duration_box, synchronizer_func):
         duration_box.value = (end - start) * 1000  # Store in milliseconds


-def time_execution_with_cuda_event(
-    kernel_fn: Callable,
-    *args,
-    num_warmup: int = 3,
-    num_trials: int = 10,
-    verbose: bool = True,
-    device: torch.device = None,
-) -> List[float]:
-    """
-    Acknowledgement: the evaluation method is adapted from https://github.com/ScalingIntelligence/KernelBench.
-
-    Time a CUDA kernel function over multiple trials using torch.cuda.Event
-
-    Args:
-        kernel_fn: Function to time
-        *args: Arguments to pass to kernel_fn
-        num_warmup: Number of warm-up runs before timing
-        num_trials: Number of timing trials to run
-        verbose: Whether to print per-trial timing info
-        device: CUDA device to use; if None, use the current device
-
-    Returns:
-        List of elapsed times in milliseconds
-    """
-    if device is None:
-        if verbose:
-            print(f"Using current device: {torch.cuda.current_device()}")
-        device = torch.cuda.current_device()
-
-    # Warm-up runs
-    for _ in range(num_warmup):
-        kernel_fn(*args)
-        torch.cuda.synchronize(device=device)
-
-    print(
-        f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}"
-    )
-    elapsed_times = []
-
-    # Actual trials
-    for trial in range(num_trials):
-        # Create event markers (timing enabled; not interprocess by default)
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-
-        start_event.record()
-        kernel_fn(*args)
-        end_event.record()
-
-        # Synchronize to ensure the events have completed
-        torch.cuda.synchronize(device=device)
-
-        # Calculate the elapsed time in milliseconds
-        elapsed_time_ms = start_event.elapsed_time(end_event)
-        if verbose:
-            print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms")
-        elapsed_times.append(elapsed_time_ms)
-
-    return elapsed_times
-
-
-def time_execution_naive(
-    model_call, synchronizer_func, num_warmup: int = 3, num_trials: int = 10
-):
-    print(
-        f"[Profiling] Using device: {args.device} {platform.processor()}, warm up {num_warmup}, trials {num_trials}"
-    )
-    for _ in range(num_warmup):
-        model_call()
-
-    times = []
-    for i in range(num_trials):
-        duration_box = DurationBox(-1)
-        with naive_timer(duration_box, synchronizer_func):
-            model_call()
-        print(f"Trial {i + 1}: {duration_box.value:.2f} ms")
-        times.append(duration_box.value)
-    return times
-
-
 def get_timing_stats(elapsed_times: List[float]):
     stats = {
         "mean": float(f"{np.mean(elapsed_times):.3g}"),
@@ -189,7 +110,7 @@ def measure_performance(model_call, args, compiler):
     device = torch.device(args.device)
     hardware_name = torch.cuda.get_device_name(device)
     print(
-        f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}"
+        f"{args.log_prompt} [Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}"
     )

     e2e_times = []
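The e2e numbers collected here come from wall-clock timing in the `naive_timer`/`DurationBox` style seen at the top of this diff. A rough sketch of that pattern, assuming `DurationBox` is a simple mutable holder (only the `(end - start) * 1000` line is taken verbatim from the source; the rest is a reconstruction):

```python
import time
from contextlib import contextmanager

class DurationBox:
    """Mutable holder so the timer can write its result back to the caller."""
    def __init__(self, value):
        self.value = value

@contextmanager
def naive_timer(duration_box, synchronizer_func):
    synchronizer_func()      # flush pending device work before starting the clock
    start = time.perf_counter()
    try:
        yield
    finally:
        synchronizer_func()  # ensure the timed work has actually finished
        end = time.perf_counter()
        duration_box.value = (end - start) * 1000  # Store in milliseconds

# Usage, mirroring the removed time_execution_naive loop:
# box = DurationBox(-1)
# with naive_timer(box, torch.cuda.synchronize):
#     model_call()
# print(f"{box.value:.2f} ms")
```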
@@ -292,11 +213,6 @@ def test_single_model(args):
     result_data["performance"]["eager"] = eager_stats
     result_data["performance"]["compiled"] = compiled_stats

-    eager_time_ms = eager_stats.get("e2e", {}).get("mean", 0)
-    compiled_time_ms = compiled_stats.get("e2e", {}).get("mean", 0)
-    # Using e2e time to calculate speedup
-    result_data["performance"]["speedup"] = eager_time_ms / compiled_time_ms
-
     expected_out = eager_model_call()
     compiled_out = compiled_model_call()
@@ -313,44 +229,68 @@ def print_and_store_cmp(key, func, **kwargs):
             file=sys.stderr,
         )

-    print_and_store_cmp("equal", get_cmp_equal)
+    print_and_store_cmp("[equal]", get_cmp_equal)
     print_and_store_cmp(
-        "all_close_atol8_rtol8", get_cmp_all_close, atol=1e-8, rtol=1e-8
+        "[all_close_atol8_rtol8]", get_cmp_all_close, atol=1e-8, rtol=1e-8
     )
     print_and_store_cmp(
-        "all_close_atol8_rtol5", get_cmp_all_close, atol=1e-8, rtol=1e-5
+        "[all_close_atol8_rtol5]", get_cmp_all_close, atol=1e-8, rtol=1e-5
     )
     print_and_store_cmp(
-        "all_close_atol5_rtol5", get_cmp_all_close, atol=1e-5, rtol=1e-5
+        "[all_close_atol5_rtol5]", get_cmp_all_close, atol=1e-5, rtol=1e-5
     )
     print_and_store_cmp(
-        "all_close_atol3_rtol2", get_cmp_all_close, atol=1e-3, rtol=1e-2
+        "[all_close_atol3_rtol2]", get_cmp_all_close, atol=1e-3, rtol=1e-2
     )
     print_and_store_cmp(
-        "all_close_atol2_rtol1", get_cmp_all_close, atol=1e-2, rtol=1e-1
+        "[all_close_atol2_rtol1]", get_cmp_all_close, atol=1e-2, rtol=1e-1
     )
-    print_and_store_cmp("max_diff", get_cmp_max_diff)
-    print_and_store_cmp("mean_diff", get_cmp_mean_diff)
+    print_and_store_cmp("[max_diff]", get_cmp_max_diff)
+    print_and_store_cmp("[mean_diff]", get_cmp_mean_diff)
     print_and_store_cmp(
-        "diff_count_atol8_rtol8", get_cmp_diff_count, atol=1e-8, rtol=1e-8
+        "[diff_count_atol8_rtol8]", get_cmp_diff_count, atol=1e-8, rtol=1e-8
     )
     print_and_store_cmp(
-        "diff_count_atol8_rtol5", get_cmp_diff_count, atol=1e-8, rtol=1e-5
+        "[diff_count_atol8_rtol5]", get_cmp_diff_count, atol=1e-8, rtol=1e-5
     )
     print_and_store_cmp(
-        "diff_count_atol5_rtol5", get_cmp_diff_count, atol=1e-5, rtol=1e-5
+        "[diff_count_atol5_rtol5]", get_cmp_diff_count, atol=1e-5, rtol=1e-5
     )
     print_and_store_cmp(
-        "diff_count_atol3_rtol2", get_cmp_diff_count, atol=1e-3, rtol=1e-2
+        "[diff_count_atol3_rtol2]", get_cmp_diff_count, atol=1e-3, rtol=1e-2
     )
     print_and_store_cmp(
-        "diff_count_atol2_rtol1", get_cmp_diff_count, atol=1e-2, rtol=1e-1
+        "[diff_count_atol2_rtol1]", get_cmp_diff_count, atol=1e-2, rtol=1e-1
     )

-    print(
-        f"{args.log_prompt} duration model_path:{args.model_path} eager:{eager_time_ms:.4f} compiled:{compiled_time_ms:.4f}",
-        file=sys.stderr,
+    eager_e2e_time_ms = eager_stats.get("e2e", {}).get("mean", 0)
+    compiled_e2e_time_ms = compiled_stats.get("e2e", {}).get("mean", 0)
+
+    e2e_speedup = 0
+    if eager_e2e_time_ms > 0 and compiled_e2e_time_ms > 0:
+        e2e_speedup = eager_e2e_time_ms / compiled_e2e_time_ms
+    result_data["performance"]["speedup"]["e2e"] = e2e_speedup
+
+    duration_log = (
+        f"{args.log_prompt} [Duration] "
+        f"eager_e2e:{eager_e2e_time_ms:.4f} compiled_e2e:{compiled_e2e_time_ms:.4f}"
     )
+    speedup_log = f"{args.log_prompt} [Speedup] " f"e2e_speedup:{e2e_speedup:.4f}"
+
+    if "cuda" in args.device:
+        eager_gpu_time_ms = eager_stats.get("gpu", {}).get("mean", 0)
+        compiled_gpu_time_ms = compiled_stats.get("gpu", {}).get("mean", 0)
+
+        gpu_speedup = 0
+        if eager_gpu_time_ms > 0 and compiled_gpu_time_ms > 0:
+            gpu_speedup = eager_gpu_time_ms / compiled_gpu_time_ms
+        result_data["performance"]["speedup"]["gpu"] = gpu_speedup
+
+        duration_log += f" eager_gpu:{eager_gpu_time_ms:.4f} compiled_gpu:{compiled_gpu_time_ms:.4f}"
+        speedup_log += f" gpu_speedup:{gpu_speedup:.4f}"
+
+    print(duration_log, file=sys.stderr)
+    print(speedup_log, file=sys.stderr)

     if args.output_dir:
         os.makedirs(args.output_dir, exist_ok=True)
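The `get_cmp_*` helper bodies are outside this hunk. Assuming from their names and atol/rtol parameters that they wrap `torch.allclose`/`torch.isclose`, the tolerance sweep above amounts to something like this hypothetical reconstruction:

```python
import torch

def get_cmp_all_close(expected, actual, atol, rtol):
    # True iff |expected - actual| <= atol + rtol * |actual| holds element-wise.
    return torch.allclose(expected, actual, atol=atol, rtol=rtol)

def get_cmp_diff_count(expected, actual, atol, rtol):
    # Number of elements falling outside the tolerance envelope.
    return int((~torch.isclose(expected, actual, atol=atol, rtol=rtol)).sum())
```

Sweeping from (atol=1e-8, rtol=1e-8) up to (atol=1e-2, rtol=1e-1) shows at which tolerance the compiled output starts matching the eager output, which is more informative than a single pass/fail check.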