@@ -41,6 +41,11 @@ def set_seed(random_seed):
     np.random.seed(random_seed)


+def init_env(args):
+    if test_compiler_util.is_gpu_device(args.device):
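+        # FLAGS_cudnn_exhaustive_search=1 makes cuDNN benchmark candidate
+        # algorithms and cache the fastest one, stabilizing later GPU timings.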
+        paddle.set_flags({"FLAGS_cudnn_exhaustive_search": 1})
+
+
 def get_hardward_name(args):
     hardware = "unknown"
     if test_compiler_util.is_gpu_device(args.device):
@@ -65,10 +70,8 @@ def get_hardward_name(args):


 def get_compile_framework_version(args):
-    if args.compiler == "cinn":
+    if args.compiler in ["cinn", "nope"]:
         return paddle.__version__
-    if args.compiler == "nope":
-        return "nope-baseline"
     return "unknown"


@@ -137,17 +140,31 @@ def measure_performance(model_call, args, compiler, profile=False):
     outs = model_call()

     # Warmup runs
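+    # Warmup durations are recorded to size the measured trial count below.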
+    warmup_e2e_times = []
     for _ in range(args.warmup):
-        model_call()
+        duration_box = test_compiler_util.DurationBox(-1)
+        with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
+            model_call()
+        warmup_e2e_times.append(duration_box.value)
     compiler.synchronize()

+    # Ensure the measuring time is not less than 100ms.
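+    # Warmup times are in milliseconds; the first iteration is skipped since
+    # it typically includes one-off compilation overhead.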
+    min_trials = int(100 / np.mean(warmup_e2e_times[1:]))
+    trials = max(args.trials, min_trials)
+
     hardware_name = get_hardward_name(args)
     print(
-        f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}",
+        f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {trials}",
         file=sys.stderr,
         flush=True,
     )

+    if profile:
+        import paddle.profiler as profiler
+
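+        # Paddle's built-in profiler: step() marks iteration boundaries and
+        # summary() prints aggregated statistics after stop().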
+        p = profiler.Profiler()
+        p.start()
+
     if test_compiler_util.is_gpu_device(args.device):
         """
         Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
@@ -157,9 +174,7 @@ def measure_performance(model_call, args, compiler, profile=False):
         e2e_times = []
         gpu_times = []

-        if profile:
-            paddle.base.core.nvprof_start()
-        for i in range(args.trials):
+        for i in range(trials):
             # End-to-end timing (naive_timer)
             duration_box = test_compiler_util.DurationBox(-1)
             with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
@@ -171,6 +186,9 @@ def measure_performance(model_call, args, compiler, profile=False):
                 model_call()
                 end_event.record()

+            if profile:
+                p.step()
+
             gpu_time_ms = start_event.elapsed_time(end_event)
             e2e_times.append(duration_box.value)
             gpu_times.append(gpu_time_ms)
@@ -179,25 +197,30 @@ def measure_performance(model_call, args, compiler, profile=False):
                 file=sys.stderr,
                 flush=True,
             )
-        if profile:
-            paddle.base.core.nvprof_stop()
-
         stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
         stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times)
     else:  # CPU or other devices
         e2e_times = []
-        for i in range(args.trials):
+        for i in range(trials):
             duration_box = test_compiler_util.DurationBox(-1)
             with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                 model_call()
+
+            if profile:
+                p.step()
+
+            e2e_times.append(duration_box.value)
             print(
                 f"Trial {i + 1}: e2e={duration_box.value:.4f} ms",
                 file=sys.stderr,
                 flush=True,
             )
-            e2e_times.append(duration_box.value)
         stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)

+    if profile:
+        p.stop()
+        p.summary()
+
     return outs, stats


@@ -210,19 +233,31 @@ def check_outputs(args, expected_out, compiled_out):
     eager_dtypes = [None] * len(expected_out)
     for i, tensor in enumerate(expected_out):
         eager_dtypes[i] = (
-            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "none"
+            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "None"
         )

     compiled_dtypes = [None] * len(compiled_out)
     for i, tensor in enumerate(compiled_out):
         compiled_dtypes[i] = (
-            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "none"
+            str(tensor.dtype).replace("paddle.", "") if tensor is not None else "None"
         )

     type_match = test_compiler_util.check_output_datatype(
         args, eager_dtypes, compiled_dtypes
     )

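+    # Output shapes are checked alongside dtypes; the numeric comparison
+    # below only runs when both match.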
+    eager_shapes = [None] * len(expected_out)
+    for i, tensor in enumerate(expected_out):
+        eager_shapes[i] = tensor.shape if tensor is not None else None
+
+    compiled_shapes = [None] * len(compiled_out)
+    for i, tensor in enumerate(compiled_out):
+        compiled_shapes[i] = tensor.shape if tensor is not None else None
+
+    shape_match = test_compiler_util.check_output_shape(
+        args, eager_shapes, compiled_shapes
+    )
+
     def transfer_to_float(origin_outputs):
         outputs = []
         for item in origin_outputs:
@@ -235,7 +270,7 @@ def transfer_to_float(origin_outputs):
                 outputs.append(item)
         return outputs

-    if type_match:
+    if type_match and shape_match:
         test_compiler_util.check_equal(
             args,
             expected_out,
@@ -400,17 +435,18 @@ def test_multi_models(args):

     sample_idx = 0
     failed_samples = []
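+    # Derive the module name from this file so the subprocess command below
+    # stays correct if the script is copied or renamed.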
+    module_name = os.path.splitext(os.path.basename(__file__))[0]
     for model_path in path_utils.get_recursively_model_path(args.model_path):
         if test_samples is None or os.path.abspath(model_path) in test_samples:
             print(
-                f"[{sample_idx}] test_compiler, model_path: {model_path}",
+                f"[{sample_idx}] {module_name}, model_path: {model_path}",
                 file=sys.stderr,
                 flush=True,
             )
             cmd = " ".join(
                 [
                     sys.executable,
-                    "-m graph_net.paddle.test_compiler",
+                    f"-m graph_net.paddle.{module_name}",
                     f"--model-path {model_path}",
                     f"--compiler {args.compiler}",
                     f"--device {args.device}",