[Feature Enhancement] test_compiler supports XPU and supports using numpy to initialize tensors. (#329)

Xreki · web-flow · commit f5ca9e40f22c · 2025-11-06T13:46:28.000+08:00
* test_compiler supports XPU.

* Format code.

* Optimize some code and support using numpy to initialize tensors.
diff --git a/graph_net/paddle/backend/cinn_backend.py b/graph_net/paddle/backend/cinn_backend.py
@@ -19,5 +19,6 @@ def synchronize(self):
         if (
             paddle.device.is_compiled_with_cuda()
             or paddle.device.is_compiled_with_rocm()
+            or paddle.device.is_compiled_with_xpu()
         ):
             paddle.device.synchronize()
diff --git a/graph_net/paddle/backend/nope_backend.py b/graph_net/paddle/backend/nope_backend.py
@@ -10,5 +10,6 @@ def synchronize(self):
         if (
             paddle.device.is_compiled_with_cuda()
             or paddle.device.is_compiled_with_rocm()
+            or paddle.device.is_compiled_with_xpu()
         ):
             paddle.device.synchronize()
diff --git a/graph_net/paddle/test_compiler.py b/graph_net/paddle/test_compiler.py
@@ -12,6 +12,8 @@
 import random
 import platform
 import traceback
+import subprocess
+import re
 
 from graph_net.paddle import utils
 from graph_net import path_utils
@@ -40,12 +42,25 @@ def set_seed(random_seed):
 
 
 def get_hardward_name(args):
+    hardware = "unknown"
     if test_compiler_util.is_gpu_device(args.device):
         hardware = paddle.device.cuda.get_device_name(0)
+    elif args.device == "xpu":
+        try:
+            output = subprocess.check_output(["xpu-smi", "-L"], text=True)
+            hardware = next(
+                match.group(2)
+                for line in output.splitlines()
+                if (
+                    match := re.match(
+                        r"XPU\s+(\d+):\s+(.+?)\s+\(UUID:\s*([^)]+)\)", line
+                    )
+                )
+            )
+        except Exception as e:
+            pass
     elif args.device == "cpu":
         hardware = platform.processor()
-    else:
-        hardware = "unknown"
     return hardware
 
 
@@ -422,7 +437,7 @@ def test_multi_models(args):
 def main(args):
     assert os.path.isdir(args.model_path)
     assert args.compiler in {"cinn", "nope"}
-    assert args.device in ["cuda", "dcu", "cpu"]
+    assert args.device in ["cuda", "dcu", "xpu", "cpu"]
 
     initalize_seed = 123
     set_seed(random_seed=initalize_seed)
diff --git a/graph_net/paddle/utils.py b/graph_net/paddle/utils.py
@@ -8,6 +8,7 @@
 import inspect
 import ast
 import math
+import numpy as np
 import paddle
 
 kLiteralTensorSize = 64
@@ -139,6 +140,7 @@ def convert_to_valid_number(data_type, value):
 
 
 def convert_meta_classes_to_tensors(file_path):
+    current_device = paddle.device.get_device()
     for name, cls in _get_classes(file_path):
         attrs = {
             k: v
@@ -159,7 +161,7 @@ def convert_meta_classes_to_tensors(file_path):
             "info": {
                 "shape": attrs.get("shape", []),
                 "dtype": data_type,
-                "device": attrs.get("device", "gpu"),
+                "device": attrs.get("device", current_device),
                 "mean": convert_to_valid_number(data_type, attrs.get("mean", None)),
                 "std": convert_to_valid_number(data_type, attrs.get("std", None)),
                 "min_val": convert_to_valid_number(data_type, attrs.get("min_val", 0)),
@@ -188,7 +190,43 @@ def extract_dynamic_shapes(example_inputs):
     pass
 
 
-def replay_tensor(info):
+def init_integer_tensor(dtype, shape, min_val, max_val, use_numpy):
+    if use_numpy:
+        array = np.random.randint(
+            low=min_val, high=max_val + 1, size=shape, dtype=dtype
+        )
+        return paddle.to_tensor(array)
+    else:
+        return paddle.randint(low=min_val, high=max_val + 1, shape=shape, dtype=dtype)
+
+
+def init_float_tensor(shape, mean, std, min_val, max_val, use_numpy):
+    tensor = None
+    if use_numpy:
+        if mean is not None and std is not None:
+            array = np.random.normal(mean, std, shape)
+            mask = (array < min_val) | (array > max_val)
+            while np.any(mask):
+                array[mask] = np.random.normal(mean, std, mask.sum())
+                mask = (array < min_val) | (array > max_val)
+        else:
+            array = np.random.uniform(low=min_val, high=max_val, size=shape)
+        tensor = paddle.to_tensor(array)
+    else:
+        if mean is not None and std is not None:
+            tensor = paddle.empty(shape=shape, dtype="float32")
+            initializer = paddle.nn.initializer.TruncatedNormal(
+                mean=mean, std=std, a=min_val, b=max_val
+            )
+            initializer(tensor)
+        else:
+            tensor = paddle.uniform(
+                shape=shape, dtype="float32", min=min_val, max=max_val
+            )
+    return tensor
+
+
+def replay_tensor(info, use_numpy=True):
     device = info["info"]["device"]
     dtype = info["info"]["dtype"]
     shape = info["info"]["shape"]
@@ -201,27 +239,14 @@ def replay_tensor(info):
         shape = list(map(lambda i: i if i is not None else 1, shape))
     if "data" in info and info["data"] is not None:
         return paddle.reshape(info["data"], shape).to(dtype).to(device)
-    elif dtype == paddle.int32 or dtype == paddle.int64:
-        return paddle.cast(
-            paddle.randint(low=min_val, high=max_val + 1, shape=shape, dtype="int64"),
-            dtype,
-        ).to(device)
-    elif dtype == paddle.bool:
-        return paddle.cast(
-            paddle.randint(low=0, high=2, shape=shape, dtype="int32"),
-            paddle.bool,
-        ).to(device)
+    elif dtype in [paddle.int32, paddle.int64, paddle.bool]:
+        # Bools are sampled as int32 values in {0, 1} and cast to bool afterwards.
+        # Both branches of the conditional must be parenthesized tuples: without
+        # the parentheses the right-hand side is a 3-tuple and unpacking fails.
+        init_dtype = "int32" if dtype == paddle.bool else "int64"
+        min_val, max_val = (0, 1) if dtype == paddle.bool else (min_val, max_val)
+        tensor = init_integer_tensor(init_dtype, shape, min_val, max_val, use_numpy)
+        return tensor.to(dtype).to(device)
     else:
-        if mean is not None and std is not None:
-            tensor = paddle.empty(shape=shape, dtype="float32")
-            initializer = paddle.nn.initializer.TruncatedNormal(
-                mean=mean, std=std, a=min_val, b=max_val
-            )
-            initializer(tensor)
-            return tensor.to(dtype).to(device)
-        else:
-            return (
-                paddle.uniform(shape=shape, dtype="float32", min=min_val, max=max_val)
-                .to(dtype)
-                .to(device)
-            )
+        tensor = init_float_tensor(shape, mean, std, min_val, max_val, use_numpy)
+        return tensor.to(dtype).to(device)
diff --git a/graph_net/test_compiler_util.py b/graph_net/test_compiler_util.py
@@ -49,20 +49,19 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
                 flush=True,
             )
             selected_gpu_uuid, max_gpu_util, max_mem_util = None, 0.0, 0.0
-            for i in range(5):
+            for i in range(3):
                 synchronizer_func()
                 time.sleep(1)
 
                 cmd = [
                     "nvidia-smi",
+                    f"--id={selected_gpu_id}",
                     f"--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
                     "--format=csv,noheader,nounits",
                 ]
                 output = subprocess.check_output(cmd).decode().strip()
                 _, selected_gpu_uuid, gpu_util, used_mem, mem_total = next(
-                    line.split(", ")
-                    for line in output.split("\n")
-                    if line.strip() and int(line.split(", ")[0]) == selected_gpu_id
+                    line.split(", ") for line in output.split("\n") if line.strip()
                 )
                 gpu_util = float(gpu_util)
                 mem_util = float(used_mem) * 100 / float(mem_total)
@@ -78,6 +77,7 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
             other_tasks = []
             cmd = [
                 "nvidia-smi",
+                f"--id={selected_gpu_id}",
                 f"--query-compute-apps=gpu_uuid,pid,used_memory",
                 "--format=csv,noheader,nounits",
             ]
@@ -86,8 +86,7 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
                 line
                 for line in output.split("\n")
                 if line.strip()
-                and (line.split(", ")[0] == selected_gpu_uuid)
-                and (line.split(", ")[1] != current_pid)
+                if line.split(", ")[1] != current_pid
             ]
             # Note: in docker container, the current_pid maybe different from that captured by nvidia-smi.
             print(