
Commit c3c8599

[Other] Optimize codes using list comprehension, implement GraphCompilerBackend for paddle test_compiler. (#327)
* Optimize codes using list comprehension.
* Implement graph compiler backend for paddle.
* Fix the init dtype.
1 parent 7ace5ce commit c3c8599
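
The core of this change is the new GraphCompilerBackend interface: graph_net/paddle/test_compiler.py now resolves the --compiler argument through a small backend registry and uses the same object both to compile the model and to synchronize the device, replacing the old get_synchronizer_func/get_compiled_model pair. A minimal sketch of that flow, using a toy paddle.nn.Linear model and made-up inputs (only the backend classes and the registry shape come from this commit):

# Illustrative usage of the backend registry introduced in this commit.
# The toy model and inputs below are placeholders, not part of the change.
import paddle
from graph_net.paddle.backend.cinn_backend import CinnBackend
from graph_net.paddle.backend.nope_backend import NopeBackend

registry_backend = {"cinn": CinnBackend(), "nope": NopeBackend()}

model = paddle.nn.Linear(8, 8)  # stand-in for the loaded GraphModule
input_spec = [paddle.static.InputSpec([None, 8], "float32", "x")]
compiler = registry_backend["nope"]  # selected via --compiler in test_compiler.py
compiled_model = compiler(model, input_spec)  # CinnBackend would wrap paddle.jit.to_static(...)
out = compiled_model(paddle.rand([4, 8]))  # run the (possibly compiled) model
compiler.synchronize()  # device sync now goes through the backend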

6 files changed: 110 additions, 81 deletions

graph_net/paddle/backend/cinn_backend.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import paddle
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+
+
+class CinnBackend(GraphCompilerBackend):
+    def __call__(self, model, input_spec=None):
+        build_strategy = paddle.static.BuildStrategy()
+        compiled_model = paddle.jit.to_static(
+            model,
+            input_spec=input_spec,
+            build_strategy=build_strategy,
+            full_graph=True,
+        )
+        compiled_model.eval()
+        program = compiled_model.forward.concrete_program.main_program
+        return compiled_model
+
+    def synchronize(self):
+        if (
+            paddle.device.is_compiled_with_cuda()
+            or paddle.device.is_compiled_with_rocm()
+        ):
+            paddle.device.synchronize()
graph_net/paddle/backend/graph_compiler_backend.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+class GraphCompilerBackend:
+    def __call__(self, model, input_spec=None):
+        raise NotImplementedError()
+
+    def synchronize(self):
+        raise NotImplementedError()
graph_net/paddle/backend/nope_backend.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import paddle
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+
+
+class NopeBackend(GraphCompilerBackend):
+    def __call__(self, model, input_spec=None):
+        return model
+
+    def synchronize(self):
+        if (
+            paddle.device.is_compiled_with_cuda()
+            or paddle.device.is_compiled_with_rocm()
+        ):
+            paddle.device.synchronize()

graph_net/paddle/test_compiler.py

Lines changed: 41 additions & 40 deletions
@@ -17,6 +17,21 @@
 from graph_net import path_utils
 from graph_net import test_compiler_util
 
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+from graph_net.paddle.backend.cinn_backend import CinnBackend
+from graph_net.paddle.backend.nope_backend import NopeBackend
+
+
+registry_backend = {
+    "cinn": CinnBackend(),
+    "nope": NopeBackend(),
+}
+
+
+def get_compiler_backend(args) -> GraphCompilerBackend:
+    assert args.compiler in registry_backend, f"Unknown compiler: {args.compiler}"
+    return registry_backend[args.compiler]
+
 
 def set_seed(random_seed):
     paddle.seed(random_seed)
@@ -60,10 +75,6 @@ def load_class_from_file(file_path: str, class_name: str):
     return model_class
 
 
-def get_synchronizer_func(args):
-    return paddle.device.synchronize
-
-
 def get_model(model_path):
     model_class = load_class_from_file(
         f"{model_path}/model.py", class_name="GraphModule"
@@ -91,22 +102,6 @@ def get_input_spec(model_path):
     return input_spec
 
 
-def get_compiled_model(args, model):
-    if args.compiler == "nope":
-        return model
-    input_spec = get_input_spec(args.model_path)
-    build_strategy = paddle.static.BuildStrategy()
-    compiled_model = paddle.jit.to_static(
-        model,
-        input_spec=input_spec,
-        build_strategy=build_strategy,
-        full_graph=True,
-    )
-    compiled_model.eval()
-    program = compiled_model.forward.concrete_program.main_program
-    return compiled_model
-
-
 def get_static_model(args, model):
     static_model = paddle.jit.to_static(
         model,
@@ -119,7 +114,7 @@ def get_static_model(args, model):
     return static_model
 
 
-def measure_performance(model_call, args, synchronizer_func, profile=False):
+def measure_performance(model_call, args, compiler, profile=False):
     runtime_seed = 1024
     stats = {}
 
@@ -129,7 +124,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
     # Warmup runs
     for _ in range(args.warmup):
         model_call()
-        synchronizer_func()
+        compiler.synchronize()
 
     hardware_name = get_hardward_name(args)
     print(
@@ -152,7 +147,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         for i in range(args.trials):
             # End-to-end timing (naive_timer)
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 # GPU-only timing (CUDA Events)
                 start_event = paddle.device.Event(enable_timing=True)
                 end_event = paddle.device.Event(enable_timing=True)
@@ -178,7 +173,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         e2e_times = []
         for i in range(args.trials):
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 model_call()
             print(
                 f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
@@ -247,8 +242,25 @@ def transfer_to_float(origin_outputs):
     )
 
 
+def check_and_print_gpu_utilization(compiler):
+    if paddle.device.is_compiled_with_cuda():
+        device_id = int(paddle.device.get_device().split(":")[-1])
+        device_count = paddle.device.cuda.device_count()
+        gpu_util, mem_util = test_compiler_util.get_device_utilization(
+            device_id, device_count, compiler.synchronize
+        )
+        if gpu_util is not None and mem_util is not None:
+            print(
+                f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
+                file=sys.stderr,
+                flush=True,
+            )
+
+
 def test_single_model(args):
-    synchronizer_func = get_synchronizer_func(args)
+    compiler = get_compiler_backend(args)
+    check_and_print_gpu_utilization(compiler)
+
     input_dict = get_input_dict(args.model_path)
     model = get_model(args.model_path)
     model.eval()
@@ -264,7 +276,7 @@ def test_single_model(args):
         print("Run model in eager mode.", file=sys.stderr, flush=True)
         static_model = get_static_model(args, model)
         expected_out, eager_time_stats = measure_performance(
-            lambda: static_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: static_model(**input_dict), args, compiler, profile=False
         )
         eager_success = True
     except Exception as e:
@@ -279,9 +291,10 @@ def test_single_model(args):
     compiled_time_stats = {}
     try:
         print("Run model in compiled mode.", file=sys.stderr, flush=True)
-        compiled_model = get_compiled_model(args, model)
+        input_spec = get_input_spec(args.model_path)
+        compiled_model = compiler(model, input_spec)
         compiled_out, compiled_time_stats = measure_performance(
-            lambda: compiled_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: compiled_model(**input_dict), args, compiler, profile=False
         )
         compiled_success = True
     except Exception as e:
@@ -415,18 +428,6 @@ def main(args):
     set_seed(random_seed=initalize_seed)
 
     if path_utils.is_single_model_dir(args.model_path):
-        if paddle.device.is_compiled_with_cuda():
-            device_id = int(paddle.device.get_device().split(":")[-1])
-            device_count = paddle.device.cuda.device_count()
-            gpu_util, mem_util = test_compiler_util.get_device_utilization(
-                device_id, device_count, get_synchronizer_func(args)
-            )
-            if gpu_util is not None and mem_util is not None:
-                print(
-                    f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
-                    file=sys.stderr,
-                    flush=True,
-                )
         test_single_model(args)
     else:
         test_multi_models(args)

graph_net/paddle/utils.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def replay_tensor(info):
         ).to(device)
     else:
         if mean is not None and std is not None:
-            tensor = paddle.empty(shape=shape, dtype=dtype)
+            tensor = paddle.empty(shape=shape, dtype="float32")
             initializer = paddle.nn.initializer.TruncatedNormal(
                 mean=mean, std=std, a=min_val, b=max_val
             )

graph_net/test_compiler_util.py

Lines changed: 25 additions & 40 deletions
@@ -53,29 +53,17 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
         synchronizer_func()
         time.sleep(1)
 
-        output = (
-            subprocess.check_output(
-                [
-                    "nvidia-smi",
-                    f"--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
-                    "--format=csv,noheader,nounits",
-                ]
-            )
-            .decode()
-            .strip()
+        cmd = [
+            "nvidia-smi",
+            f"--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
+            "--format=csv,noheader,nounits",
+        ]
+        output = subprocess.check_output(cmd).decode().strip()
+        _, selected_gpu_uuid, gpu_util, used_mem, mem_total = next(
+            line.split(", ")
+            for line in output.split("\n")
+            if line.strip() and int(line.split(", ")[0]) == selected_gpu_id
         )
-        for line in output.split("\n"):
-            if line.strip():
-                (
-                    gpu_id,
-                    selected_gpu_uuid,
-                    gpu_util,
-                    used_mem,
-                    mem_total,
-                ) = line.split(", ")
-                if int(gpu_id) == selected_gpu_id:
-                    break
-
         gpu_util = float(gpu_util)
         mem_util = float(used_mem) * 100 / float(mem_total)
         print(
@@ -88,22 +76,19 @@ def get_device_utilization(device_id, device_count, synchronizer_func):
         max_mem_util = mem_util if mem_util > max_mem_util else max_mem_util
 
         other_tasks = []
-        output = (
-            subprocess.check_output(
-                [
-                    "nvidia-smi",
-                    f"--query-compute-apps=gpu_uuid,pid,used_memory",
-                    "--format=csv,noheader,nounits",
-                ]
-            )
-            .decode()
-            .strip()
-        )
-        for line in output.split("\n"):
-            if line.strip():
-                gpu_uuid, pid, used_memory = line.split(", ")
-                if gpu_uuid == selected_gpu_uuid and int(pid) != current_pid:
-                    other_tasks.append(line)
+        cmd = [
+            "nvidia-smi",
+            f"--query-compute-apps=gpu_uuid,pid,used_memory",
+            "--format=csv,noheader,nounits",
+        ]
+        output = subprocess.check_output(cmd).decode().strip()
+        other_tasks = [
+            line
+            for line in output.split("\n")
+            if line.strip()
+            and (line.split(", ")[0] == selected_gpu_uuid)
+            and (line.split(", ")[1] != current_pid)
+        ]
         # Note: in docker container, the current_pid maybe different from that captured by nvidia-smi.
         print(
             f"Note: There are {len(other_tasks)} tasks running on GPU {selected_gpu_id} (current_pid:{current_pid}).",
@@ -195,11 +180,11 @@ def convert_to_str(b):
 
 
 def print_times_and_speedup(args, eager_stats, compiled_stats):
-    if not eager_stats:
+    if eager_stats:
         print_with_log_prompt(
             "[Performance][eager]:", json.dumps(eager_stats), args.log_prompt
         )
-    if not compiled_stats:
+    if compiled_stats:
         print_with_log_prompt(
             "[Performance][compiled]:", json.dumps(compiled_stats), args.log_prompt
         )
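
For reference, the list-comprehension cleanup in get_device_utilization replaces the explicit parsing loops over nvidia-smi's CSV output with a next(...) expression and a list comprehension. A small standalone illustration of the same row-selection pattern on fabricated sample data:

# Fabricated sample rows in the "index, gpu_uuid, utilization.gpu, memory.used, memory.total" format.
output = "0, GPU-aaaa, 35, 1024, 16384\n1, GPU-bbbb, 0, 512, 16384"
selected_gpu_id = 1

# Pick the fields of the row whose first column matches the selected GPU id.
_, selected_gpu_uuid, gpu_util, used_mem, mem_total = next(
    line.split(", ")
    for line in output.split("\n")
    if line.strip() and int(line.split(", ")[0]) == selected_gpu_id
)
print(selected_gpu_uuid, gpu_util, used_mem, mem_total)  # GPU-bbbb 0 512 16384

One behavioral difference worth noting: next(...) raises StopIteration if no row matches the selected id, whereas the removed loop simply fell through with the last parsed row.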
