@@ -43,6 +43,24 @@ def set_seed(random_seed):
4343 torch .cuda .manual_seed_all (random_seed )
4444
4545
46+ def get_hardward_name (args ):
47+ hardware_name = "unknown"
48+ if "cuda" in args .device :
49+ hardware_name = torch .cuda .get_device_name (args .device )
50+ elif args .device == "cpu" :
51+ hardware_name = platform .processor ()
52+ return hardware_name
53+
54+
55+ def get_compile_framework_version (args ):
56+ if args .compiler in ["inductor" , "nope" ]:
57+ return torch .__version__
58+ elif args .compiler in ["tvm" , "xla" , "tensorrt" , "bladedisc" ]:
59+ # Assuming compiler object has a version attribute
60+ return f"{ args .compiler .capitalize ()} { get_compiler_backend (args ).version } "
61+ return "unknown"
62+
63+
4664def load_class_from_file (
4765 args : argparse .Namespace , class_name : str , device : str
4866) -> Type [torch .nn .Module ]:
@@ -87,31 +105,6 @@ def get_input_dict(args):
87105 }
88106
89107
90- @dataclass
91- class DurationBox :
92- value : float
93-
94-
95- @contextmanager
96- def naive_timer (duration_box , synchronizer_func ):
97- synchronizer_func ()
98- start = time .time ()
99- yield
100- synchronizer_func ()
101- end = time .time ()
102- duration_box .value = (end - start ) * 1000 # Store in milliseconds
103-
104-
105- def get_timing_stats (elapsed_times : List [float ]):
106- stats = {
107- "mean" : float (f"{ np .mean (elapsed_times ):.3g} " ),
108- "std" : float (f"{ np .std (elapsed_times ):.3g} " ),
109- "min" : float (f"{ np .min (elapsed_times ):.3g} " ),
110- "max" : float (f"{ np .max (elapsed_times ):.3g} " ),
111- }
112- return stats
113-
114-
115108def measure_performance (model_call , args , compiler ):
116109 stats = {}
117110
@@ -120,27 +113,26 @@ def measure_performance(model_call, args, compiler):
120113 model_call ()
121114 compiler .synchronize ()
122115
116+ hardware_name = get_hardward_name (args )
117+ print (
118+ f"[Profiling] Using device: { args .device } { hardware_name } , warm up { args .warmup } , trials { args .trials } " ,
119+ file = sys .stderr ,
120+ flush = True ,
121+ )
122+
123123 if "cuda" in args .device :
124124 """
125125 Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
126126 With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench
127127 """
128128
129- device = torch .device (args .device )
130- hardware_name = torch .cuda .get_device_name (device )
131- print (
132- f"{ args .log_prompt } [Profiling] Using device: { args .device } { hardware_name } , warm up { args .warmup } , trials { args .trials } " ,
133- file = sys .stderr ,
134- flush = True ,
135- )
136-
137129 e2e_times = []
138130 gpu_times = []
139131
140132 for i in range (args .trials ):
141133 # End-to-end timing (naive_timer)
142- duration_box = DurationBox (- 1 )
143- with naive_timer (duration_box , compiler .synchronize ):
134+ duration_box = test_compiler_util . DurationBox (- 1 )
135+ with test_compiler_util . naive_timer (duration_box , compiler .synchronize ):
144136 # GPU-only timing (CUDA Events)
145137 start_event = torch .cuda .Event (enable_timing = True )
146138 end_event = torch .cuda .Event (enable_timing = True )
@@ -149,7 +141,7 @@ def measure_performance(model_call, args, compiler):
149141 model_call ()
150142
151143 end_event .record ()
152- torch . cuda . synchronize (device = device )
144+ compiler . synchronize ()
153145
154146 gpu_time_ms = start_event .elapsed_time (end_event )
155147 e2e_times .append (duration_box .value )
@@ -160,29 +152,22 @@ def measure_performance(model_call, args, compiler):
160152 flush = True ,
161153 )
162154
163- stats ["e2e" ] = get_timing_stats (e2e_times )
164- stats ["gpu" ] = get_timing_stats (gpu_times )
155+ stats ["e2e" ] = test_compiler_util . get_timing_stats (e2e_times )
156+ stats ["gpu" ] = test_compiler_util . get_timing_stats (gpu_times )
165157
166158 else : # CPU or other devices
167- hardware_name = platform .processor ()
168- print (
169- f"[Profiling] Using device: { args .device } { hardware_name } , warm up { args .warmup } , trials { args .trials } " ,
170- file = sys .stderr ,
171- flush = True ,
172- )
173-
174159 e2e_times = []
175160 for i in range (args .trials ):
176- duration_box = DurationBox (- 1 )
177- with naive_timer (duration_box , compiler .synchronize ):
161+ duration_box = test_compiler_util . DurationBox (- 1 )
162+ with test_compiler_util . naive_timer (duration_box , compiler .synchronize ):
178163 model_call ()
179164 print (
180165 f"Trial { i + 1 } : e2e={ duration_box .value :.5f} ms" ,
181166 file = sys .stderr ,
182167 flush = True ,
183168 )
184169 e2e_times .append (duration_box .value )
185- stats ["e2e" ] = get_timing_stats (e2e_times )
185- stats ["e2e" ] = get_timing_stats (e2e_times )
170+ stats ["e2e" ] = test_compiler_util .get_timing_stats (e2e_times )
186171
187172 return stats
188173
@@ -191,49 +176,9 @@ def test_single_model(args):
191176 compiler = get_compiler_backend (args )
192177 input_dict = get_input_dict (args )
193178 model = get_model (args , args .device )
194- model_path = os .path .normpath (args .model_path )
195- print (f"{ args .log_prompt } [Processing] { model_path } " , file = sys .stderr , flush = True )
196- model_name = os .path .basename (model_path )
197- print (
198- f"{ args .log_prompt } [Config] model: { model_name } " , file = sys .stderr , flush = True
199- )
200- print (
201- f"{ args .log_prompt } [Config] device: { args .device } " , file = sys .stderr , flush = True
202- )
203-
204- hardware_name = "unknown"
205- if "cuda" in args .device :
206- hardware_name = torch .cuda .get_device_name (args .device )
207- elif args .device == "cpu" :
208- hardware_name = platform .processor ()
209- print (
210- f"{ args .log_prompt } [Config] hardware: { hardware_name } " ,
211- file = sys .stderr ,
212- flush = True ,
213- )
214-
215- print (
216- f"{ args .log_prompt } [Config] compiler: { args .compiler } " ,
217- file = sys .stderr ,
218- flush = True ,
219- )
220- print (
221- f"{ args .log_prompt } [Config] warmup: { args .warmup } " , file = sys .stderr , flush = True
222- )
223- print (
224- f"{ args .log_prompt } [Config] trials: { args .trials } " , file = sys .stderr , flush = True
225- )
226179
227- version_str = "unknown"
228- if args .compiler == "inductor" :
229- version_str = torch .__version__
230- elif args .compiler in ["tvm" , "xla" , "tensorrt" , "bladedisc" ]:
231- # Assuming compiler object has a version attribute
232- version_str = f"{ args .compiler .capitalize ()} { compiler .version } "
233- print (
234- f"{ args .log_prompt } [Config] compile_framework_version: { version_str } " ,
235- file = sys .stderr ,
236- flush = True ,
180+ test_compiler_util .print_basic_config (
181+ args , get_hardward_name (args ), get_compile_framework_version (args )
237182 )
238183
239184 runtime_seed = 1024
@@ -245,28 +190,11 @@ def test_single_model(args):
245190 try :
246191 eager_model_call = lambda : model (** input_dict )
247192 eager_stats = measure_performance (eager_model_call , args , compiler )
248- print (
249- f"{ args .log_prompt } [Performance][eager]: { json .dumps (eager_stats )} " ,
250- file = sys .stderr ,
251- flush = True ,
252- )
253193
254194 torch .manual_seed (runtime_seed )
255195 expected_out = eager_model_call ()
256196 if not isinstance (expected_out , tuple ):
257197 expected_out = (expected_out ,)
258-
259- eager_types = [
260- str (x .dtype ).replace ("torch." , "" )
261- if isinstance (x , torch .Tensor )
262- else type (x ).__name__
263- for x in expected_out
264- ]
265- print (
266- f"{ args .log_prompt } [Datatype][eager]: { ' ' .join (eager_types )} " ,
267- file = sys .stderr ,
268- flush = True ,
269- )
270198 except (TypeError , RuntimeError ) as e :
271199 print (f"Eager model execution failed: { str (e )} " , file = sys .stderr )
272200 eager_failure = True
@@ -286,43 +214,12 @@ def test_single_model(args):
286214 torch .manual_seed (runtime_seed )
287215 compiled_model_call = lambda : compiled_model (** input_dict )
288216 compiled_stats = measure_performance (compiled_model_call , args , compiler )
289- print (
290- f"{ args .log_prompt } [Performance][compiled]: { json .dumps (compiled_stats )} " ,
291- file = sys .stderr ,
292- flush = True ,
293- )
294217
295218 compiled_out = compiled_model_call ()
296219 if not isinstance (compiled_out , tuple ):
297220 compiled_out = (compiled_out ,)
298221 if args .compiler == "xla" :
299222 compiled_out = tuple (item .to ("cpu" ).to ("cuda" ) for item in compiled_out )
300-
301- compiled_types = [
302- str (x .dtype ).replace ("torch." , "" )
303- if isinstance (x , torch .Tensor )
304- else type (x ).__name__
305- for x in compiled_out
306- ]
307- print (
308- f"{ args .log_prompt } [Datatype][compiled]: { ' ' .join (compiled_types )} " ,
309- file = sys .stderr ,
310- flush = True ,
311- )
312-
313- # datatype check
314- type_match = all (
315- eager == compiled for eager , compiled in zip (eager_types , compiled_types )
316- )
317- print (
318- f"{ args .log_prompt } [DataType] eager:{ eager_types } compiled:{ compiled_types } match:{ type_match } " ,
319- file = sys .stderr ,
320- )
321- # "datatype not match" is recognized as a large loss in analysis process later,
322- # and is not recognized as a failure here.
323-
324- compare_correctness (expected_out , compiled_out , args )
325-
326223 except (TypeError , RuntimeError ) as e :
327224 print (f"Compiled model execution failed: { str (e )} " , file = sys .stderr )
328225 compiled_failure = True
@@ -342,39 +239,13 @@ def test_single_model(args):
342239 flush = True ,
343240 )
344241 else :
242+ compare_correctness (expected_out , compiled_out , args )
243+
345244 print (
346245 f"{ args .log_prompt } [Result] status: success" , file = sys .stderr , flush = True
347246 )
348247
349- e2e_speedup = 0
350- gpu_speedup = 0
351-
352- eager_e2e_time_ms = eager_stats .get ("e2e" , {}).get ("mean" , 0 )
353- compiled_e2e_time_ms = compiled_stats .get ("e2e" , {}).get ("mean" , 0 )
354-
355- if eager_e2e_time_ms > 0 and compiled_e2e_time_ms > 0 :
356- e2e_speedup = eager_e2e_time_ms / compiled_e2e_time_ms
357-
358- if "cuda" in args .device :
359- eager_gpu_time_ms = eager_stats .get ("gpu" , {}).get ("mean" , 0 )
360- compiled_gpu_time_ms = compiled_stats .get ("gpu" , {}).get ("mean" , 0 )
361-
362- if eager_gpu_time_ms > 0 and compiled_gpu_time_ms > 0 :
363- gpu_speedup = eager_gpu_time_ms / compiled_gpu_time_ms
364-
365- if e2e_speedup > 0 :
366- print (
367- f"{ args .log_prompt } [Speedup][e2e]: { e2e_speedup :.4f} " ,
368- file = sys .stderr ,
369- flush = True ,
370- )
371-
372- if "cuda" in args .device and gpu_speedup > 0 :
373- print (
374- f"{ args .log_prompt } [Speedup][gpu]: { gpu_speedup :.4f} " ,
375- file = sys .stderr ,
376- flush = True ,
377- )
248+ test_compiler_util .print_times_and_speedup (args , eager_stats , compiled_stats )
378249
379250
380251def print_and_store_cmp (key , cmp_func , args , expected_out , compiled_out , ** kwargs ):
@@ -388,22 +259,41 @@ def print_and_store_cmp(key, cmp_func, args, expected_out, compiled_out, **kwarg
388259
389260
390261def compare_correctness (expected_out , compiled_out , args ):
391- test_compiler_util .check_equal (
392- args ,
393- expected_out ,
394- compiled_out ,
395- cmp_equal_func = get_cmp_equal ,
396- )
262+ eager_dtypes = [
263+ str (x .dtype ).replace ("torch." , "" )
264+ if isinstance (x , torch .Tensor )
265+ else type (x ).__name__
266+ for x in expected_out
267+ ]
268+ compiled_dtypes = [
269+ str (x .dtype ).replace ("torch." , "" )
270+ if isinstance (x , torch .Tensor )
271+ else type (x ).__name__
272+ for x in compiled_out
273+ ]
397274
398- test_compiler_util .check_allclose (
399- args ,
400- expected_out ,
401- compiled_out ,
402- cmp_all_close_func = get_cmp_all_close ,
403- cmp_max_diff_func = get_cmp_max_diff ,
404- cmp_mean_diff_func = get_cmp_mean_diff ,
275+ # datatype check
276+ type_match = test_compiler_util .check_output_datatype (
277+ args , eager_dtypes , compiled_dtypes
405278 )
406279
280+ if type_match :
281+ test_compiler_util .check_equal (
282+ args ,
283+ expected_out ,
284+ compiled_out ,
285+ cmp_equal_func = get_cmp_equal ,
286+ )
287+
288+ test_compiler_util .check_allclose (
289+ args ,
290+ expected_out ,
291+ compiled_out ,
292+ cmp_all_close_func = get_cmp_all_close ,
293+ cmp_max_diff_func = get_cmp_max_diff ,
294+ cmp_mean_diff_func = get_cmp_mean_diff ,
295+ )
296+
407297
408298def get_cmp_equal (expected_out , compiled_out ):
409299 return " " .join (
0 commit comments