 from graph_net import path_utils
 from graph_net import test_compiler_util
 
+from graph_net.paddle.backend.graph_compiler_backend import GraphCompilerBackend
+from graph_net.paddle.backend.cinn_backend import CinnBackend
+from graph_net.paddle.backend.nope_backend import NopeBackend
+
+
+registry_backend = {
+    "cinn": CinnBackend(),
+    "nope": NopeBackend(),
+}
+
+
+def get_compiler_backend(args) -> GraphCompilerBackend:
+    assert args.compiler in registry_backend, f"Unknown compiler: {args.compiler}"
+    return registry_backend[args.compiler]
+
 
 def set_seed(random_seed):
     paddle.seed(random_seed)
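The registry above assumes each backend object is callable (it is invoked later as compiler(model, input_spec)) and exposes a synchronize() method. A minimal sketch of the implied interface, assuming only what the calls in this diff require; the class bodies are not part of the PR, so everything below is illustrative:

    # Hypothetical sketch of the backend interface implied by this diff.
    import paddle

    class GraphCompilerBackend:
        """Compiles a model and synchronizes its device."""

        def __call__(self, model, input_spec):
            raise NotImplementedError

        def synchronize(self):
            # Mirrors the removed get_synchronizer_func below, which
            # returned paddle.device.synchronize.
            paddle.device.synchronize()

    class NopeBackend(GraphCompilerBackend):
        """The "nope" backend skips compilation and returns the eager model."""

        def __call__(self, model, input_spec):
            return model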
@@ -25,7 +40,7 @@ def set_seed(random_seed):
 
 
 def get_hardward_name(args):
-    if args.device == "cuda":
+    if test_compiler_util.is_gpu_device(args.device):
         hardware = paddle.device.cuda.get_device_name(0)
     elif args.device == "cpu":
         hardware = platform.processor()
@@ -60,19 +75,15 @@ def load_class_from_file(file_path: str, class_name: str):
     return model_class
 
 
-def get_synchronizer_func(args):
-    return paddle.device.synchronize
-
-
-def get_model(args):
+def get_model(model_path):
     model_class = load_class_from_file(
-        f"{args.model_path}/model.py", class_name="GraphModule"
+        f"{model_path}/model.py", class_name="GraphModule"
     )
     return model_class()
 
 
-def get_input_dict(args):
-    inputs_params = utils.load_converted_from_text(f"{args.model_path}")
+def get_input_dict(model_path):
+    inputs_params = utils.load_converted_from_text(f"{model_path}")
     params = inputs_params["weight_info"]
     inputs = inputs_params["input_info"]
 
@@ -81,8 +92,8 @@ def get_input_dict(args):
     return state_dict
 
 
-def get_input_spec(args):
-    inputs_params_list = utils.load_converted_list_from_text(f"{args.model_path}")
+def get_input_spec(model_path):
+    inputs_params_list = utils.load_converted_list_from_text(f"{model_path}")
     input_spec = [None] * len(inputs_params_list)
     for i, v in enumerate(inputs_params_list):
         dtype = v["info"]["dtype"]
@@ -91,26 +102,10 @@ def get_input_spec(args):
     return input_spec
 
 
-def get_compiled_model(args, model):
-    if args.compiler == "nope":
-        return model
-    input_spec = get_input_spec(args)
-    build_strategy = paddle.static.BuildStrategy()
-    compiled_model = paddle.jit.to_static(
-        model,
-        input_spec=input_spec,
-        build_strategy=build_strategy,
-        full_graph=True,
-    )
-    compiled_model.eval()
-    program = compiled_model.forward.concrete_program.main_program
-    return compiled_model
-
-
 def get_static_model(args, model):
     static_model = paddle.jit.to_static(
         model,
-        input_spec=get_input_spec(args),
+        input_spec=get_input_spec(args.model_path),
         full_graph=True,
         backend=None,
     )
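The deleted get_compiled_model presumably moves into the CinnBackend class. A plausible sketch, assuming it reuses the same paddle.jit.to_static call that was removed above (the actual class body is not part of this diff):

    # Hypothetical CinnBackend, reconstructed from the removed get_compiled_model.
    class CinnBackend(GraphCompilerBackend):
        def __call__(self, model, input_spec):
            build_strategy = paddle.static.BuildStrategy()
            compiled_model = paddle.jit.to_static(
                model,
                input_spec=input_spec,
                build_strategy=build_strategy,
                full_graph=True,
            )
            compiled_model.eval()
            return compiled_model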
@@ -119,7 +114,7 @@ def get_static_model(args, model):
     return static_model
 
 
-def measure_performance(model_call, args, synchronizer_func, profile=False):
+def measure_performance(model_call, args, compiler, profile=False):
     runtime_seed = 1024
     stats = {}
 
@@ -129,7 +124,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
     # Warmup runs
     for _ in range(args.warmup):
         model_call()
-        synchronizer_func()
+        compiler.synchronize()
 
     hardware_name = get_hardward_name(args)
     print(
@@ -138,7 +133,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         flush=True,
     )
 
-    if "cuda" in args.device:
+    if test_compiler_util.is_gpu_device(args.device):
         """
         Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
         With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench
@@ -152,7 +147,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         for i in range(args.trials):
             # End-to-end timing (naive_timer)
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 # GPU-only timing (CUDA Events)
                 start_event = paddle.device.Event(enable_timing=True)
                 end_event = paddle.device.Event(enable_timing=True)
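For reference, the event-based GPU timing pattern used in this hunk typically looks like the following. This is a sketch: the record/elapsed_time calls mirror the usual CUDA-event API, and the trial loop and statistics bookkeeping are elided.

    # Sketch of GPU-only timing with CUDA events (names assumed from the hunk above).
    start_event = paddle.device.Event(enable_timing=True)
    end_event = paddle.device.Event(enable_timing=True)
    start_event.record()
    model_call()                  # the forward pass being measured
    end_event.record()
    end_event.synchronize()       # wait until the GPU has finished
    gpu_ms = start_event.elapsed_time(end_event)  # on-device time in ms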
@@ -178,7 +173,7 @@ def measure_performance(model_call, args, synchronizer_func, profile=False):
         e2e_times = []
         for i in range(args.trials):
             duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, synchronizer_func):
+            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 model_call()
             print(
                 f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
@@ -247,10 +242,27 @@ def transfer_to_float(origin_outputs):
     )
 
 
+def check_and_print_gpu_utilization(compiler):
+    if paddle.device.is_compiled_with_cuda():
+        device_id = int(paddle.device.get_device().split(":")[-1])
+        device_count = paddle.device.cuda.device_count()
+        gpu_util, mem_util = test_compiler_util.get_device_utilization(
+            device_id, device_count, compiler.synchronize
+        )
+        if gpu_util is not None and mem_util is not None:
+            print(
+                f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
+                file=sys.stderr,
+                flush=True,
+            )
+
+
 def test_single_model(args):
-    synchronizer_func = get_synchronizer_func(args)
-    input_dict = get_input_dict(args)
-    model = get_model(args)
+    compiler = get_compiler_backend(args)
+    check_and_print_gpu_utilization(compiler)
+
+    input_dict = get_input_dict(args.model_path)
+    model = get_model(args.model_path)
     model.eval()
 
     test_compiler_util.print_basic_config(
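The new check_and_print_gpu_utilization helper depends on test_compiler_util.get_device_utilization, which is not shown in this diff. One plausible implementation is via pynvml; this is purely an assumption for illustration, and the real helper may query the device differently:

    # Hypothetical get_device_utilization based on pynvml; the actual
    # test_compiler_util helper may differ.
    import pynvml

    def get_device_utilization(device_id, device_count, synchronize):
        if device_id < 0 or device_id >= device_count:
            return None, None
        synchronize()  # flush pending work so the reading is meaningful
        pynvml.nvmlInit()
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            return float(util.gpu), float(util.memory)
        finally:
            pynvml.nvmlShutdown()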
@@ -259,11 +271,12 @@ def test_single_model(args):
     )
 
     # Run on eager mode
     eager_success = False
+    eager_time_stats = {}
     try:
         print("Run model in eager mode.", file=sys.stderr, flush=True)
         static_model = get_static_model(args, model)
         expected_out, eager_time_stats = measure_performance(
-            lambda: static_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: static_model(**input_dict), args, compiler, profile=False
         )
         eager_success = True
@@ -275,11 +288,13 @@ def test_single_model(args):
 
     # Run on compiling mode
     compiled_success = False
+    compiled_time_stats = {}
     try:
         print("Run model in compiled mode.", file=sys.stderr, flush=True)
-        compiled_model = get_compiled_model(args, model)
+        input_spec = get_input_spec(args.model_path)
+        compiled_model = compiler(model, input_spec)
         compiled_out, compiled_time_stats = measure_performance(
-            lambda: compiled_model(**input_dict), args, synchronizer_func, profile=False
+            lambda: compiled_model(**input_dict), args, compiler, profile=False
         )
         compiled_success = True
     except Exception as e:
@@ -293,9 +308,9 @@ def test_single_model(args):
     if eager_success and compiled_success:
         check_outputs(args, expected_out, compiled_out)
 
-        test_compiler_util.print_times_and_speedup(
-            args, eager_time_stats, compiled_time_stats
-        )
+    test_compiler_util.print_times_and_speedup(
+        args, eager_time_stats, compiled_time_stats
+    )
 
 
 def get_cmp_equal(expected_out, compiled_out):
@@ -366,20 +381,12 @@ def get_cmp_diff_count(expected_out, compiled_out, atol, rtol):
 
 
 def test_multi_models(args):
-    test_samples = None
-    if args.allow_list is not None:
-        assert os.path.isfile(args.allow_list)
-        graphnet_root = path_utils.get_graphnet_root()
-        print(f"graphnet_root: {graphnet_root}", file=sys.stderr, flush=True)
-        verified_samples = []
-        with open(args.verified_samples_list_path, "r") as f:
-            for line in f.readlines():
-                test_samples.append(os.path.join(graphnet_root, line.strip()))
+    test_samples = test_compiler_util.get_allow_samples(args.allow_list)
 
     sample_idx = 0
     failed_samples = []
     for model_path in path_utils.get_recursively_model_path(args.model_path):
-        if verified_samples is None or os.path.abspath(model_path) in verified_samples:
+        if test_samples is None or os.path.abspath(model_path) in test_samples:
             print(
                 f"[{sample_idx}] test_compiler, model_path: {model_path}",
                 file=sys.stderr,
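test_compiler_util.get_allow_samples is likewise not shown in this diff. Judging from the inline logic it replaces (which had two apparent bugs: it appended to the None-valued test_samples and read args.verified_samples_list_path instead of args.allow_list), it presumably does roughly the following:

    # Hypothetical get_allow_samples, reconstructed from the removed inline code.
    import os

    from graph_net import path_utils

    def get_allow_samples(allow_list):
        if allow_list is None:
            return None
        assert os.path.isfile(allow_list)
        graphnet_root = path_utils.get_graphnet_root()
        with open(allow_list, "r") as f:
            return [os.path.join(graphnet_root, line.strip()) for line in f]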
@@ -415,6 +422,7 @@ def test_multi_models(args):
 def main(args):
     assert os.path.isdir(args.model_path)
     assert args.compiler in {"cinn", "nope"}
+    assert args.device in ["cuda", "dcu", "cpu"]
 
     initalize_seed = 123
     set_seed(random_seed=initalize_seed)