@@ -140,17 +140,31 @@ def measure_performance(model_call, args, compiler, profile=False):
140140 outs = model_call ()
141141
142142 # Warmup runs
143+ warmup_e2e_times = []
143144 for _ in range (args .warmup ):
144- model_call ()
145+ duration_box = test_compiler_util .DurationBox (- 1 )
146+ with test_compiler_util .naive_timer (duration_box , compiler .synchronize ):
147+ model_call ()
148+ warmup_e2e_times .append (duration_box .value )
145149 compiler .synchronize ()
146150
151+ # Ensure the measuring time is not less than 100ms.
152+ min_trials = int (100 / np .mean (warmup_e2e_times [1 :]))
153+ trials = max (args .trials , min_trials )
154+
147155 hardware_name = get_hardward_name (args )
148156 print (
149- f"[Profiling] Using device: { args .device } { hardware_name } , warm up { args .warmup } , trials { args . trials } " ,
157+ f"[Profiling] Using device: { args .device } { hardware_name } , warm up { args .warmup } , trials { trials } " ,
150158 file = sys .stderr ,
151159 flush = True ,
152160 )
153161
162+ if profile :
163+ import paddle .profiler as profiler
164+
165+ p = profiler .Profiler ()
166+ p .start ()
167+
154168 if test_compiler_util .is_gpu_device (args .device ):
155169 """
156170 Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings,
@@ -160,12 +174,7 @@ def measure_performance(model_call, args, compiler, profile=False):
160174 e2e_times = []
161175 gpu_times = []
162176
163- if profile :
164- import paddle .profiler as profiler
165-
166- p = profiler .Profiler ()
167- p .start ()
168- for i in range (args .trials ):
177+ for i in range (trials ):
169178 # End-to-end timing (naive_timer)
170179 duration_box = test_compiler_util .DurationBox (- 1 )
171180 with test_compiler_util .naive_timer (duration_box , compiler .synchronize ):
@@ -176,8 +185,9 @@ def measure_performance(model_call, args, compiler, profile=False):
176185 start_event .record ()
177186 model_call ()
178187 end_event .record ()
179- if profile :
180- p .step ()
188+
189+ if profile :
190+ p .step ()
181191
182192 gpu_time_ms = start_event .elapsed_time (end_event )
183193 e2e_times .append (duration_box .value )
@@ -187,26 +197,30 @@ def measure_performance(model_call, args, compiler, profile=False):
187197 file = sys .stderr ,
188198 flush = True ,
189199 )
190- if profile :
191- p .stop ()
192- p .summary ()
193-
194200 stats ["e2e" ] = test_compiler_util .get_timing_stats (e2e_times )
195201 stats ["gpu" ] = test_compiler_util .get_timing_stats (gpu_times )
196202 else : # CPU or other devices
197203 e2e_times = []
198- for i in range (args . trials ):
204+ for i in range (trials ):
199205 duration_box = test_compiler_util .DurationBox (- 1 )
200206 with test_compiler_util .naive_timer (duration_box , compiler .synchronize ):
201207 model_call ()
208+
209+ if profile :
210+ p .step ()
211+
212+ e2e_times .append (duration_box .value )
202213 print (
203214 f"Trial { i + 1 } : e2e={ duration_box .value :.4f} ms" ,
204215 file = sys .stderr ,
205216 flush = True ,
206217 )
207- e2e_times .append (duration_box .value )
208218 stats ["e2e" ] = test_compiler_util .get_timing_stats (e2e_times )
209219
220+ if profile :
221+ p .stop ()
222+ p .summary ()
223+
210224 return outs , stats
211225
212226
0 commit comments