Polish the timing codes of test_compiler of paddle.

Xreki · Xreki · commit 413742eac144 · 2025-09-01T16:04:54.000+08:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
   rev: 23.1.0
   hooks:
   - id: black
-    language_version: python3
+    language_version: python
diff --git a/graph_net/paddle/test_compiler.py b/graph_net/paddle/test_compiler.py
@@ -1,17 +1,16 @@
-from . import utils
 import argparse
 import importlib.util
-import inspect
 import paddle
 from pathlib import Path
-from typing import Type, Any
 import sys
 import os
-import os.path
 from dataclasses import dataclass
 from contextlib import contextmanager
 import time
 import numpy as np
+import random
+
+from . import utils
 
 
 def load_class_from_file(file_path: str, class_name: str):
@@ -33,7 +32,6 @@ def load_class_from_file(file_path: str, class_name: str):
 
 
 def get_synchronizer_func(args):
-    assert args.compiler == "default"
     return paddle.device.synchronize
 
 
@@ -49,37 +47,46 @@ def get_input_dict(args):
     params = inputs_params["weight_info"]
     inputs = inputs_params["input_info"]
 
-    params.update(inputs)
-    state_dict = {k: utils.replay_tensor(v) for k, v in params.items()}
-    return state_dict
-
-
-@dataclass
-class DurationBox:
-    value: int
+    param_dtypes = set()
+    for name, info in params.items():
+        dtype = str(info["info"]["dtype"])
+        if dtype not in param_dtypes:
+            param_dtypes.add(dtype)
 
+    input_dtypes = set()
+    for name, info in inputs.items():
+        dtype = str(info["info"]["dtype"])
+        if dtype not in input_dtypes:
+            input_dtypes.add(dtype)
 
-@contextmanager
-def naive_timer(duration_box, get_synchronizer_func):
-    get_synchronizer_func()
-    start = time.time()
-    yield
-    get_synchronizer_func()
-    end = time.time()
-    duration_box.value = end - start
+    params.update(inputs)
+    state_dict = {k: utils.replay_tensor(v) for k, v in params.items()}
+    return state_dict, list(input_dtypes), list(param_dtypes)
 
 
 def get_input_spec(args):
     inputs_params_list = utils.load_converted_list_from_text(f"{args.model_path}")
     input_spec = [None] * len(inputs_params_list)
     for i, v in enumerate(inputs_params_list):
-        name = v["name"]
         dtype = v["info"]["dtype"]
         shape = v["info"]["shape"]
         input_spec[i] = paddle.static.InputSpec(shape, dtype)
     return input_spec
 
 
+def get_compiled_model(args, model):
+    input_spec = get_input_spec(args)
+    build_strategy = paddle.static.BuildStrategy()
+    compiled_model = paddle.jit.to_static(
+        model,
+        input_spec=input_spec,
+        build_strategy=build_strategy,
+        full_graph=True,
+    )
+    compiled_model.eval()
+    return compiled_model
+
+
 def regular_item(item):
     if isinstance(item, paddle.Tensor) and (item.dtype == paddle.bfloat16):
         item = np.array(item.astype("float32"))
@@ -90,37 +97,129 @@ def regular_item(item):
     return item
 
 
+def count_number_of_ops(args, model):
+    static_model = paddle.jit.to_static(
+        model,
+        input_spec=get_input_spec(args),
+        full_graph=True,
+        backend=None,
+    )
+    static_model.eval()
+    program = model.forward.concrete_program.main_program
+    # print(program)
+
+    num_ops = 0
+    for block in program.blocks:
+        for op in block.ops:
+            if op.name() != "pd_op.data" and not op.name().startswith("builtin."):
+                num_ops += 1
+    print(f"Totally {num_ops} ops.")
+    print("")
+    return num_ops
+
+
+@dataclass
+class DurationBox:
+    value: int
+
+
+@contextmanager
+def naive_timer(duration_box, synchronizer_func):
+    synchronizer_func()
+    start = time.time()
+    yield
+    synchronizer_func()
+    end = time.time()
+    duration_box.value = end - start
+
+
+def time_execution_with_cuda_event(
+    model_call, synchronizer_func, num_warmup=3, num_trials=10, profile=False
+):
+    outs = None
+
+    # warmups
+    for _ in range(num_warmup):
+        outs = model_call()
+    synchronizer_func()
+
+    elapsed_times = []
+    if profile:
+        paddle.base.core.nvprof_start()
+
+    # actual trials
+    for trial in range(num_trials):
+        # create event marker default is not interprocess
+        start_event = paddle.device.Event(enable_timing=True)
+        end_event = paddle.device.Event(enable_timing=True)
+
+        start_event.record()
+        outs = model_call()
+        end_event.record()
+        synchronizer_func()
+
+        # Calculate the elapsed time in milliseconds
+        elapsed_time_ms = start_event.elapsed_time(end_event)
+        elapsed_times.append(elapsed_time_ms)
+    if profile:
+        paddle.base.core.nvprof_stop()
+    elapsed_times = elapsed_times[num_trials // 2 :]
+    return outs, np.mean(elapsed_times)
+
+
+def time_execution_naive(model_call, synchronizer_func, num_warmup=3, num_trials=10):
+    outs = None
+
+    # warmups
+    for _ in range(num_warmup):
+        outs = model_call()
+
+    # actual trials
+    duration_box = DurationBox(-1)
+    with naive_timer(duration_box, synchronizer_func):
+        for i in range(num_trials):
+            outs = model_call()
+    return outs, duration_box.value * 1000 / float(num_trials)
+
+
+def measure_performance(model_call, synchronizer_func, args, profile=False):
+    if not args.use_naive_timer:
+        outs, times = time_execution_with_cuda_event(
+            model_call,
+            synchronizer_func=synchronizer_func,
+            num_warmup=args.warmup,
+            num_trials=args.trials,
+            profile=profile,
+        )
+    else:
+        outs, times = time_execution_naive(
+            model_call,
+            synchronizer_func=synchronizer_func,
+            num_warmup=args.warmup,
+            num_trials=args.trials,
+        )
+    return outs, times
+
+
 def test_single_model(args):
     synchronizer_func = get_synchronizer_func(args)
-    input_dict = get_input_dict(args)
-    model_dy = get_model(args)
-
-    # eager
-    print("-- Run with eager mode")
-    model_dy.eval()
-    for _ in range(args.warmup if args.warmup > 0 else 0):
-        model_dy(**input_dict)
-    eager_duration_box = DurationBox(-1)
-    with naive_timer(eager_duration_box, synchronizer_func):
-        expected_out = model_dy(**input_dict)
-
-    # compiled
-    print("-- Run with compiled mode")
-    input_spec = get_input_spec(args)
-    build_strategy = paddle.static.BuildStrategy()
-    # build_strategy.build_cinn_pass = True
-    compiled_model = paddle.jit.to_static(
-        model_dy,
-        input_spec=input_spec,
-        build_strategy=build_strategy,
-        full_graph=True,
+    input_dict, input_dtypes, param_dtypes = get_input_dict(args)
+    model = get_model(args)
+    model.eval()
+
+    # Collect model information
+    num_ops = count_number_of_ops(args, model)
+
+    print("Run on eager mode")
+    expected_out, eager_time_ms = measure_performance(
+        lambda: model(**input_dict), synchronizer_func, args, profile=False
+    )
+
+    print("Run on compiling mode")
+    compiled_model = get_compiled_model(args, model)
+    compiled_out, compiled_time_ms = measure_performance(
+        lambda: compiled_model(**input_dict), synchronizer_func, args, profile=False
     )
-    compiled_model.eval()
-    for _ in range(args.warmup if args.warmup > 0 else 0):
-        compiled_model(**input_dict)
-    compiled_duration_box = DurationBox(-1)
-    with naive_timer(compiled_duration_box, synchronizer_func):
-        compiled_out = compiled_model(**input_dict)
 
     if isinstance(expected_out, paddle.Tensor):
         expected_out = [expected_out]
@@ -164,7 +263,11 @@ def print_cmp(key, func, **kwargs):
     print_cmp("cmp.diff_count_atol2_rtol1", get_cmp_diff_count, atol=1e-2, rtol=1e-1)
 
     print(
-        f"{args.log_prompt} duration model_path:{args.model_path} eager:{eager_duration_box.value} compiled:{compiled_duration_box.value}",
+        f"{args.log_prompt} information model_path:{args.model_path} {num_ops} ops, param_dtypes:{param_dtypes}, input_dtypes:{input_dtypes}",
+        file=sys.stderr,
+    )
+    print(
+        f"{args.log_prompt} duration model_path:{args.model_path} eager:{eager_time_ms:.5f} ms, compiled:{compiled_time_ms:.5f} ms, speedup:{eager_time_ms / compiled_time_ms:.3f}",
         file=sys.stderr,
     )
 
@@ -210,6 +313,7 @@ def test_multi_models(args):
                 f"--model-path {model_path}",
                 f"--compiler {args.compiler}",
                 f"--warmup {args.warmup}",
+                f"--trials {args.trials}",
                 f"--log-prompt {args.log_prompt}",
             ]
         )
@@ -240,6 +344,13 @@ def is_single_model_dir(model_dir):
 
 def main(args):
     assert os.path.isdir(args.model_path)
+    assert args.compiler == "CINN"
+
+    random_seed = 123
+    paddle.seed(random_seed)
+    random.seed(random_seed)
+    np.random.seed(random_seed)
+
     if is_single_model_dir(args.model_path):
         test_single_model(args)
     else:
@@ -258,12 +369,21 @@ def main(args):
         "--compiler",
         type=str,
         required=False,
-        default="default",
+        default="CINN",
         help="Path to customized compiler python file",
     )
     parser.add_argument(
         "--warmup", type=int, required=False, default=5, help="Number of warmup steps"
     )
+    parser.add_argument(
+        "--trials", type=int, required=False, default=10, help="Number of timing trials"
+    )
+    parser.add_argument(
+        "--use-naive-timer",
+        action="store_true",
+        default=False,
+        help="Use naive timer for permance measuring.",
+    )
     parser.add_argument(
         "--log-prompt",
         type=str,