 from graph_net import path_utils
 from graph_net import test_compiler_util
 
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+from graph_net.paddle.backend.cinn_backend import CinnBackend
+from graph_net.paddle.backend.nope_backend import NopeBackend
+
+
+registry_backend = {
+    "cinn": CinnBackend(),
+    "nope": NopeBackend(),
+}
+
+
+def get_compiler_backend(args) -> GraphCompilerBackend:
+    assert args.compiler in registry_backend, f"Unknown compiler: {args.compiler}"
+    return registry_backend[args.compiler]
+
 
 def set_seed(random_seed):
     paddle.seed(random_seed)
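The registry above assumes each backend object is callable (it is invoked later as compiler(model, input_spec)) and exposes a synchronize() method. A minimal sketch of the implied interface, assuming only what the calls in this diff require; the class bodies are not part of the PR, so everything below is illustrative:

    # Hypothetical sketch of the backend interface implied by this diff.
    import paddle

    class GraphCompilerBackend:
        """Compiles a model and synchronizes its device."""

        def __call__(self, model, input_spec):
            raise NotImplementedError

        def synchronize(self):
            # Mirrors the removed get_synchronizer_func below, which
            # returned paddle.device.synchronize.
            paddle.device.synchronize()

    class NopeBackend(GraphCompilerBackend):
        """The "nope" backend skips compilation and returns the eager model."""

        def __call__(self, model, input_spec):
            return model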
@@ -25,7 +40,7 @@ def set_seed(random_seed):
 
 
 def get_hardward_name(args):
-    if args.device == "cuda":
+    if test_compiler_util.is_gpu_device(args.device):
         hardware = paddle.device.cuda.get_device_name(0)
     elif args.device == "cpu":
         hardware = platform.processor()
@@ -60,19 +75,15 @@ def load_class_from_file(file_path: str, class_name: str):
     return model_class
 
 
-def get_synchronizer_func(args):
-    return paddle.device.synchronize
-
-
-def get_model(args):
+def get_model(model_path):
     model_class = load_class_from_file(
-        f"{args.model_path}/model.py", class_name="GraphModule"
+        f"{model_path}/model.py", class_name="GraphModule"
     )
     return model_class()
 
 
-def get_input_dict(args):
-    inputs_params = utils.load_converted_from_text(f"{args.model_path}")
+def get_input_dict(model_path):
+    inputs_params = utils.load_converted_from_text(f"{model_path}")
     params = inputs_params["weight_info"]
     inputs = inputs_params["input_info"]
 
@@ -81,8 +92,8 @@ def get_input_dict(args):
     return state_dict
 
 
-def get_input_spec(args):
-    inputs_params_list = utils.load_converted_list_from_text(f"{args.model_path}")
+def get_input_spec(model_path):
+    inputs_params_list = utils.load_converted_list_from_text(f"{model_path}")
     input_spec = [None] * len(inputs_params_list)
     for i, v in enumerate(inputs_params_list):
         dtype = v["info"]["dtype"]
@@ -91,26 +102,10 @@ def get_input_spec(args):
     return input_spec
 
 
-def get_compiled_model(args, model):
-    if args.compiler == "nope":
-        return model
-    input_spec = get_input_spec(args)
-    build_strategy = paddle.static.BuildStrategy()
-    compiled_model = paddle.jit.to_static(
-        model,
-        input_spec=input_spec,
-        build_strategy=build_strategy,
-        full_graph=True,
-    )
-    compiled_model.eval()
-    program = compiled_model.forward.concrete_program.main_program
-    return compiled_model
-
-
 def get_static_model(args, model):
     static_model = paddle.jit.to_static(
         model,
-        input_spec=get_input_spec(args),
+        input_spec=get_input_spec(args.model_path),
         full_graph=True,
         backend=None,
     )
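The deleted get_compiled_model presumably moves into the CinnBackend class. A plausible sketch, assuming it reuses the same paddle.jit.to_static call that was removed above (the actual class body is not part of this diff):

    # Hypothetical CinnBackend, reconstructed from the removed get_compiled_model.
    class CinnBackend(GraphCompilerBackend):
        def __call__(self, model, input_spec):
            build_strategy = paddle.static.BuildStrategy()
            compiled_model = paddle.jit.to_static(
                model,
                input_spec=input_spec,
                build_strategy=build_strategy,
                full_graph=True,
            )
            compiled_model.eval()
            return compiled_model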
@@ -119,7 +114,7 @@ def get_static_model(args, model):
     return static_model
 
 
-def measure_performance(model_call, args, synchronizer_func, profile=False):
+def measure_performance(model_call, args, compiler, profile=False):
     runtime_seed = 1024
     stats = {}
 
@@ -129,7 +124,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
     # Warmup runs
     for _ in range(args.warmup):
         model_call()
-        synchronizer_func()
+        compiler.synchronize()
 
     hardware_name = get_hardward_name(args)
     print(
@@ -138,7 +133,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         flush=True,
     )
 
-    if "cuda" in args.device:
+    if test_compiler_util.is_gpu_device(args.device):
         """
         Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
         With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench
@@ -152,7 +147,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         for i in range(args.trials):
             # End-to-end timing (naive_timer)
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 # GPU-only timing (CUDA Events)
                 start_event = paddle.device.Event(enable_timing=True)
                 end_event = paddle.device.Event(enable_timing=True)
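For reference, the event-based GPU timing pattern used in this hunk typically looks like the following. This is a sketch: the record/elapsed_time calls mirror the usual CUDA-event API, and the trial loop and statistics bookkeeping are elided.

    # Sketch of GPU-only timing with CUDA events (names assumed from the hunk above).
    start_event = paddle.device.Event(enable_timing=True)
    end_event = paddle.device.Event(enable_timing=True)
    start_event.record()
    model_call()                  # the forward pass being measured
    end_event.record()
    end_event.synchronize()       # wait until the GPU has finished
    gpu_ms = start_event.elapsed_time(end_event)  # on-device time in ms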
@@ -178,7 +173,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         e2e_times = []
         for i in range(args.trials):
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 model_call()
             print(
                 f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
@@ -247,10 +242,27 @@ def transfer_to_float(origin_outputs):
     )
 
 
+def check_and_print_gpu_utilization(compiler):
+    if paddle.device.is_compiled_with_cuda():
+        device_id = int(paddle.device.get_device().split(":")[-1])
+        device_count = paddle.device.cuda.device_count()
+        gpu_util, mem_util = test_compiler_util.get_device_utilization(
+            device_id, device_count, compiler.synchronize
+        )
+        if gpu_util is not None and mem_util is not None:
+            print(
+                f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
+                file=sys.stderr,
+                flush=True,
+            )
+
+
 def test_single_model(args):
-    synchronizer_func = get_synchronizer_func(args)
-    input_dict = get_input_dict(args)
-    model = get_model(args)
+    compiler = get_compiler_backend(args)
+    check_and_print_gpu_utilization(compiler)
+
+    input_dict = get_input_dict(args.model_path)
+    model = get_model(args.model_path)
     model.eval()
 
     test_compiler_util.print_basic_config(
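The new check_and_print_gpu_utilization helper depends on test_compiler_util.get_device_utilization, which is not shown in this diff. One plausible implementation is via pynvml; this is purely an assumption for illustration, and the real helper may query the device differently:

    # Hypothetical get_device_utilization based on pynvml; the actual
    # test_compiler_util helper may differ.
    import pynvml

    def get_device_utilization(device_id, device_count, synchronize):
        if device_id < 0 or device_id >= device_count:
            return None, None
        synchronize()  # flush pending work so the reading is meaningful
        pynvml.nvmlInit()
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            return float(util.gpu), float(util.memory)
        finally:
            pynvml.nvmlShutdown()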
@@ -259,11 +271,12 @@ def test_single_model(args):
     )
 
     # Run on eager mode
     eager_success = False
+    eager_time_stats = {}
     try:
         print("Run model in eager mode.", file=sys.stderr, flush=True)
         static_model = get_static_model(args, model)
         expected_out, eager_time_stats = measure_performance(
-            lambda: static_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: static_model(**input_dict), args, compiler, profile=False
         )
         eager_success = True
@@ -275,11 +288,13 @@ def test_single_model(args):
 
     # Run on compiling mode
     compiled_success = False
+    compiled_time_stats = {}
     try:
         print("Run model in compiled mode.", file=sys.stderr, flush=True)
-        compiled_model = get_compiled_model(args, model)
+        input_spec = get_input_spec(args.model_path)
+        compiled_model = compiler(model, input_spec)
         compiled_out, compiled_time_stats = measure_performance(
-            lambda: compiled_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: compiled_model(**input_dict), args, compiler, profile=False
         )
         compiled_success = True
     except Exception as e:
@@ -293,9 +308,9 @@ def test_single_model(args):
     if eager_success and compiled_success:
         check_outputs(args, expected_out, compiled_out)
 
-        test_compiler_util.print_times_and_speedup(
-            args, eager_time_stats, compiled_time_stats
-        )
+    test_compiler_util.print_times_and_speedup(
+        args, eager_time_stats, compiled_time_stats
+    )
 
 
 def get_cmp_equal(expected_out, compiled_out):
@@ -366,20 +381,12 @@ def get_cmp_diff_count(expected_out, compiled_out, atol, rtol):
 
 
 def test_multi_models(args):
-    test_samples = None
-    if args.allow_list is not None:
-        assert os.path.isfile(args.allow_list)
-        graphnet_root = path_utils.get_graphnet_root()
-        print(f"graphnet_root: {graphnet_root}", file=sys.stderr, flush=True)
-        verified_samples = []
-        with open(args.verified_samples_list_path, "r") as f:
-            for line in f.readlines():
-                test_samples.append(os.path.join(graphnet_root, line.strip()))
+    test_samples = test_compiler_util.get_allow_samples(args.allow_list)
 
     sample_idx = 0
     failed_samples = []
     for model_path in path_utils.get_recursively_model_path(args.model_path):
-        if verified_samples is None or os.path.abspath(model_path) in verified_samples:
+        if test_samples is None or os.path.abspath(model_path) in test_samples:
             print(
                 f"[{sample_idx}] test_compiler, model_path: {model_path}",
                 file=sys.stderr,
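test_compiler_util.get_allow_samples is likewise not shown in this diff. Judging from the inline logic it replaces (which had two apparent bugs: it appended to the None-valued test_samples and read args.verified_samples_list_path instead of args.allow_list), it presumably does roughly the following:

    # Hypothetical get_allow_samples, reconstructed from the removed inline code.
    import os

    from graph_net import path_utils

    def get_allow_samples(allow_list):
        if allow_list is None:
            return None
        assert os.path.isfile(allow_list)
        graphnet_root = path_utils.get_graphnet_root()
        with open(allow_list, "r") as f:
            return [os.path.join(graphnet_root, line.strip()) for line in f]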
@@ -415,6 +422,7 @@ def test_multi_models(args):
 def main(args):
     assert os.path.isdir(args.model_path)
     assert args.compiler in {"cinn", "nope"}
+    assert args.device in ["cuda", "dcu", "cpu"]
 
     initalize_seed = 123
     set_seed(random_seed=initalize_seed)