
Commit dcd63a7

test_compiler support dcu.
1 parent bdc8d03 commit dcd63a7

2 files changed (+35, -19 lines)

graph_net/paddle/test_compiler.py

Lines changed: 7 additions & 5 deletions
```diff
@@ -25,7 +25,7 @@ def set_seed(random_seed):
 
 
 def get_hardward_name(args):
-    if args.device == "cuda":
+    if test_compiler_util.is_gpu_device(args.device):
         hardware = paddle.device.cuda.get_device_name(0)
     elif args.device == "cpu":
         hardware = platform.processor()
```
```diff
@@ -138,7 +138,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         flush=True,
     )
 
-    if "cuda" in args.device:
+    if test_compiler_util.is_gpu_device(args.device):
         """
         Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
         With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench
```
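For context, the guarded branch collects GPU-only timings with device events in the KernelBench style, while the end-to-end numbers come from wall-clock timing around a device sync, which mirrors what `naive_timer` in `test_compiler_util.py` does. A minimal sketch of that end-to-end half, with `model_call` and `synchronizer_func` as stand-ins for the real arguments:

```python
import time

def measure_e2e_ms(model_call, synchronizer_func, iters=10):
    """Mean wall-clock milliseconds per call: a sketch of the end-to-end timing.

    synchronizer_func (e.g. a device synchronize) runs before starting and
    before stopping the clock, so asynchronous GPU kernels are fully drained
    and are not mistaken for fast host-side returns.
    """
    synchronizer_func()
    start = time.perf_counter()
    for _ in range(iters):
        model_call()
    synchronizer_func()
    return (time.perf_counter() - start) * 1000 / iters
```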
```diff
@@ -259,6 +259,7 @@ def test_single_model(args):
 
     # Run on eager mode
     eager_success = False
+    eager_time_stats = {}
     try:
         print("Run model in eager mode.", file=sys.stderr, flush=True)
         # static_model = get_static_model(args, model)
```
```diff
@@ -275,6 +276,7 @@ def test_single_model(args):
 
     # Run on compiling mode
     compiled_success = False
+    compiled_time_stats = {}
     try:
         print("Run model in compiled mode.", file=sys.stderr, flush=True)
         compiled_model = get_compiled_model(args, model)
```
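The two new `{}` defaults matter because the `try` bodies can fail before assigning the stats: with the defaults in place, `print_times_and_speedup` later receives an empty dict instead of hitting a `NameError`. A minimal sketch of that failure path (`run_mode` is a hypothetical stand-in for the measured model run):

```python
def run_mode(fail):
    # Stand-in for an eager/compiled measurement that may raise.
    if fail:
        raise RuntimeError("mode failed")
    return {"e2e": {"mean": 1.0}}

eager_time_stats = {}  # default added by this commit
try:
    eager_time_stats = run_mode(fail=True)
except RuntimeError:
    pass

print(eager_time_stats)  # {} -- safe to pass along, rather than an unbound name
```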
```diff
@@ -293,9 +295,9 @@ def test_single_model(args):
     if eager_success and compiled_success:
         check_outputs(args, expected_out, compiled_out)
 
-        test_compiler_util.print_times_and_speedup(
-            args, eager_time_stats, compiled_time_stats
-        )
+    test_compiler_util.print_times_and_speedup(
+        args, eager_time_stats, compiled_time_stats
+    )
 
 
 def get_cmp_equal(expected_out, compiled_out):
```

graph_net/test_compiler_util.py

Lines changed: 28 additions & 14 deletions
```diff
@@ -25,6 +25,10 @@ def naive_timer(duration_box, synchronizer_func):
     duration_box.value = (end - start) * 1000  # Store in milliseconds
 
 
+def is_gpu_device(device):
+    return "cuda" in device or "dcu" in device
+
+
 def get_device_utilization(device_id, device_count, synchronizer_func):
     current_pid = os.getpid()
 
```
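The helper is a plain substring test, so it classifies bare and indexed device strings alike: anything mentioning `cuda` or `dcu` takes the GPU path. A quick check of the behavior (the helper body is copied from the diff above; the device strings are illustrative):

```python
def is_gpu_device(device):
    # Any device string mentioning cuda or dcu counts as a GPU device.
    return "cuda" in device or "dcu" in device

assert is_gpu_device("cuda") and is_gpu_device("cuda:0")
assert is_gpu_device("dcu") and is_gpu_device("dcu:1")
assert not is_gpu_device("cpu") and not is_gpu_device("xpu")
```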
```diff
@@ -98,6 +102,7 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
             gpu_uuid, pid, used_memory = line.split(", ")
             if gpu_uuid == selected_gpu_uuid and int(pid) != current_pid:
                 other_tasks.append(line)
+    # Note: in a docker container, current_pid may be different from the PID captured by nvidia-smi.
     print(
         f"Note: There are {len(other_tasks)} tasks running on GPU {selected_gpu_id} (current_pid:{current_pid}).",
         file=sys.stderr,
```
```diff
@@ -169,24 +174,33 @@ def print_basic_config(args, hardware_name, compile_framework_version):
     )
 
 
-def print_running_status(args, eager_success, compiled_success):
+def print_running_status(args, eager_success, compiled_success=None):
     def convert_to_str(b):
         return "success" if b else "failed"
 
-    print_with_log_prompt(
-        "[Result][status]",
-        f"eager:{convert_to_str(eager_success)} compiled:{convert_to_str(compiled_success)}",
-        args.log_prompt,
-    )
+    if compiled_success is not None:
+        print_with_log_prompt(
+            "[Result][status]",
+            f"eager:{convert_to_str(eager_success)} compiled:{convert_to_str(compiled_success)}",
+            args.log_prompt,
+        )
+    else:
+        print_with_log_prompt(
+            "[Result][status]",
+            f"eager:{convert_to_str(eager_success)}",
+            args.log_prompt,
+        )
 
 
 def print_times_and_speedup(args, eager_stats, compiled_stats):
-    print_with_log_prompt(
-        "[Performance][eager]:", json.dumps(eager_stats), args.log_prompt
-    )
-    print_with_log_prompt(
-        "[Performance][compiled]:", json.dumps(compiled_stats), args.log_prompt
-    )
+    if not eager_stats:
+        print_with_log_prompt(
+            "[Performance][eager]:", json.dumps(eager_stats), args.log_prompt
+        )
+    if not compiled_stats:
+        print_with_log_prompt(
+            "[Performance][compiled]:", json.dumps(compiled_stats), args.log_prompt
+        )
 
     e2e_speedup = 0
     gpu_speedup = 0
```
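Defaulting `compiled_success` to `None` lets callers report an eager-only run without faking a compiled result. A sketch of the two call shapes (`print_with_log_prompt` is stubbed with `print` here, and the stub takes the log prompt directly rather than via `args`):

```python
def print_running_status(log_prompt, eager_success, compiled_success=None):
    # Stubbed version: one status line, with the compiled field included
    # only when the caller actually ran a compiled mode.
    status = f"eager:{'success' if eager_success else 'failed'}"
    if compiled_success is not None:
        status += f" compiled:{'success' if compiled_success else 'failed'}"
    print(log_prompt, "[Result][status]", status)

print_running_status("[graph-net]", True, False)  # eager:success compiled:failed
print_running_status("[graph-net]", True)         # eager:success
```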
```diff
@@ -197,7 +211,7 @@ def print_times_and_speedup(args, eager_stats, compiled_stats):
     if eager_e2e_time_ms > 0 and compiled_e2e_time_ms > 0:
         e2e_speedup = eager_e2e_time_ms / compiled_e2e_time_ms
 
-    if "cuda" in args.device:
+    if is_gpu_device(args.device):
         eager_gpu_time_ms = eager_stats.get("gpu", {}).get("mean", 0)
         compiled_gpu_time_ms = compiled_stats.get("gpu", {}).get("mean", 0)
 
```
```diff
@@ -207,7 +221,7 @@ def print_times_and_speedup(args, eager_stats, compiled_stats):
     if e2e_speedup > 0:
         print_with_log_prompt("[Speedup][e2e]:", f"{e2e_speedup:.5f}", args.log_prompt)
 
-    if "cuda" in args.device and gpu_speedup > 0:
+    if is_gpu_device(args.device) and gpu_speedup > 0:
         print_with_log_prompt("[Speedup][gpu]:", f"{gpu_speedup:.5f}", args.log_prompt)
 
 
```
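The speedups are plain ratios of mean times, and each guard skips the line when either measurement is missing or zero. A worked example with the stats shape the code expects (numbers are illustrative):

```python
eager_stats = {"e2e": {"mean": 12.0}, "gpu": {"mean": 8.0}}    # milliseconds
compiled_stats = {"e2e": {"mean": 6.0}, "gpu": {"mean": 3.2}}

e2e_speedup = gpu_speedup = 0
eager_e2e = eager_stats.get("e2e", {}).get("mean", 0)
compiled_e2e = compiled_stats.get("e2e", {}).get("mean", 0)
if eager_e2e > 0 and compiled_e2e > 0:
    e2e_speedup = eager_e2e / compiled_e2e      # 12.0 / 6.0 = 2.0

eager_gpu = eager_stats.get("gpu", {}).get("mean", 0)
compiled_gpu = compiled_stats.get("gpu", {}).get("mean", 0)
if eager_gpu > 0 and compiled_gpu > 0:
    gpu_speedup = eager_gpu / compiled_gpu      # 8.0 / 3.2 = 2.5

print(f"{e2e_speedup:.5f} {gpu_speedup:.5f}")   # 2.00000 2.50000
```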
