@@ -41,6 +41,11 @@ def set_seed(random_seed):
     np.random.seed(random_seed)


+def init_env(args):
+    if test_compiler_util.is_gpu_device(args.device):
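+        # FLAGS_cudnn_exhaustive_search=1 makes cuDNN benchmark candidate
+        # algorithms and cache the fastest one, stabilizing later GPU timings.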
+        paddle.set_flags({"FLAGS_cudnn_exhaustive_search": 1})
+
+
 def get_hardward_name(args):
     hardware = "unknown"
     if test_compiler_util.is_gpu_device(args.device):
@@ -65,10 +70,8 @@ def get_hardward_name(args):


 def get_compile_framework_version(args):
-    if args.compiler == "cinn":
+    if args.compiler in ["cinn", "nope"]:
         return paddle.__version__
-    if args.compiler == "nope":
-        return "nope-baseline"
     return "unknown"


@@ -137,17 +140,31 @@ def measure_performance(model_call, args, compiler, profile=False):
     outs = model_call()

     # Warmup runs
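+    # Warmup durations are recorded to size the measured trial count below.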
+    warmup_e2e_times = []
     for _ in range(args.warmup):
-        model_call()
+        duration_box = test_compiler_util.DurationBox(-1)
+        with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
+            model_call()
+        warmup_e2e_times.append(duration_box.value)
     compiler.synchronize()

+    # Ensure the measuring time is not less than 100ms.
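+    # Warmup times are in milliseconds; the first iteration is skipped since
+    # it typically includes one-off compilation overhead.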
+    min_trials = int(100 / np.mean(warmup_e2e_times[1:]))
+    trials = max(args.trials, min_trials)
+
     hardware_name = get_hardward_name(args)
     print(
-        f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}",
+        f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {trials}",
         file=sys.stderr,
         flush=True,
     )

+    if profile:
+        import paddle.profiler as profiler
+
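+        # Paddle's built-in profiler: step() marks iteration boundaries and
+        # summary() prints aggregated statistics after stop().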
+        p = profiler.Profiler()
+        p.start()
+
     if test_compiler_util.is_gpu_device(args.device):
         """
         Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
@@ -157,9 +174,7 @@ def measure_performance(model_call, args, compiler, profile=False):
         e2e_times = []
         gpu_times = []

-        if profile:
-            paddle.base.core.nvprof_start()
-        for i in range(args.trials):
+        for i in range(trials):
             # End-to-end timing (naive_timer)
             duration_box = test_compiler_util.DurationBox(-1)
             with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
@@ -171,6 +186,9 @@ def measure_performance(model_call, args, compiler, profile=False):
                 model_call()
                 end_event.record()

+            if profile:
+                p.step()
+
             gpu_time_ms = start_event.elapsed_time(end_event)
             e2e_times.append(duration_box.value)
             gpu_times.append(gpu_time_ms)
@@ -179,25 +197,30 @@ def measure_performance(model_call, args, compiler, profile=False):
                 file=sys.stderr,
                 flush=True,
             )
-        if profile:
-            paddle.base.core.nvprof_stop()
-
         stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
         stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times)
     else:  # CPU or other devices
         e2e_times = []
-        for i in range(args.trials):
+        for i in range(trials):
             duration_box = test_compiler_util.DurationBox(-1)
             with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 model_call()
+
+            if profile:
+                p.step()
+
+            e2e_times.append(duration_box.value)
             print(
                 f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
                 file=sys.stderr,
                 flush=True,
             )
-            e2e_times.append(duration_box.value)
         stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)

+    if profile:
+        p.stop()
+        p.summary()
+
     return outs, stats


@@ -210,19 +233,31 @@ def check_outputs(args, expected_out, compiled_out):
     eager_dtypes = [None] * len(expected_out)
     for i, tensor in enumerate(expected_out):
         eager_dtypes[i] = (
-            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "none"
+            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "None"
         )

     compiled_dtypes = [None] * len(compiled_out)
     for i, tensor in enumerate(compiled_out):
         compiled_dtypes[i] = (
-            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "none"
+            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "None"
         )

     type_match = test_compiler_util.check_output_datatype(
         args, eager_dtypes, compiled_dtypes
     )

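+    # Output shapes are checked alongside dtypes; the numeric comparison
+    # below only runs when both match.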
+    eager_shapes = [None] * len(expected_out)
+    for i, tensor in enumerate(expected_out):
+        eager_shapes[i] = tensor.shape if tensor is not None else None
+
+    compiled_shapes = [None] * len(compiled_out)
+    for i, tensor in enumerate(compiled_out):
+        compiled_shapes[i] = tensor.shape if tensor is not None else None
+
+    shape_match = test_compiler_util.check_output_shape(
+        args, eager_shapes, compiled_shapes
+    )
+
     def transfer_to_float(origin_outputs):
         outputs = []
         for item in origin_outputs:
@@ -235,7 +270,7 @@ def transfer_to_float(origin_outputs):
                 outputs.append(item)
         return outputs

-    if type_match:
+    if type_match and shape_match:
         test_compiler_util.check_equal(
             args,
             expected_out,
@@ -400,17 +435,18 @@ def test_multi_models(args):

     sample_idx = 0
     failed_samples = []
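+    # Derive the module name from this file so the subprocess command below
+    # stays correct if the script is copied or renamed.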
+    module_name = os.path.splitext(os.path.basename(__file__))[0]
     for model_path in path_utils.get_recursively_model_path(args.model_path):
         if test_samples is None or os.path.abspath(model_path) in test_samples:
             print(
-                f"[{sample_idx}] test_compiler, model_path: {model_path}",
+                f"[{sample_idx}] {module_name}, model_path: {model_path}",
                 file=sys.stderr,
                 flush=True,
             )
             cmd = " ".join(
                 [
                     sys.executable,
-                    "-m graph_net.paddle.test_compiler",
+                    f"-m graph_net.paddle.{module_name}",
                     f"--model-path {model_path}",
                     f"--compiler {args.compiler}",
                     f"--device {args.device}",