@@ -19,40 +19,51 @@ def measure_performance(model_call, args, compiler):
1919
2020 if "cuda" in args .device :
2121 torch .cuda .empty_cache ()
22- e2e_times , gpu_times = [], []
23- for i in range (args .trials ):
24- duration_box = test_compiler_util .DurationBox (- 1 )
25- with test_compiler_util .naive_timer (duration_box , compiler .synchronize ):
26- start_event = torch .cuda .Event (enable_timing = True )
27- end_event = torch .cuda .Event (enable_timing = True )
28- start_event .record ()
29- model_call ()
30- end_event .record ()
31- compiler .synchronize ()
32-
33- gpu_time_ms = start_event .elapsed_time (end_event )
34- e2e_times .append (duration_box .value )
35- gpu_times .append (gpu_time_ms )
36- print (
37- f"Trial { i + 1 } : e2e={ duration_box .value :.5f} ms, gpu={ gpu_time_ms :.5f} ms" ,
38- file = sys .stderr ,
39- flush = True ,
40- )
41-
22+ e2e_times , gpu_times = run_cuda_benchmark_timer (
23+ model_call , args .trials , compiler
24+ )
4225 stats ["e2e" ] = test_compiler_util .get_timing_stats (e2e_times )
4326 stats ["gpu" ] = test_compiler_util .get_timing_stats (gpu_times )
4427 else :
45- e2e_times = []
46- for i in range (args .trials ):
47- duration_box = test_compiler_util .DurationBox (- 1 )
48- with test_compiler_util .naive_timer (duration_box , compiler .synchronize ):
49- model_call ()
50- e2e_times .append (duration_box .value )
51- print (
52- f"Trial { i + 1 } : e2e={ duration_box .value :.5f} ms" ,
53- file = sys .stderr ,
54- flush = True ,
55- )
28+ e2e_times = run_non_cuda_benchmark_timer (model_call , args .trials , compiler )
5629 stats ["e2e" ] = test_compiler_util .get_timing_stats (e2e_times )
5730
5831 return outs , stats
32+
33+
def run_cuda_benchmark_timer(model_call, trials, compiler):
    """Benchmark ``model_call`` on a CUDA device for ``trials`` iterations.

    Each iteration is measured twice: once by the surrounding
    ``test_compiler_util.naive_timer`` context (reported as "e2e") and once
    by a pair of CUDA events recorded immediately before and after the call
    (reported as "gpu"). A one-line summary per trial is written to stderr.

    Args:
        model_call: Zero-argument callable that runs the model once.
        trials: Number of timed iterations to run.
        compiler: Object exposing ``synchronize()``; it is handed to the
            timer and also invoked after the end event is recorded.

    Returns:
        A ``(e2e_times, gpu_times)`` tuple of lists, one entry per trial.
    """
    wall_clock_samples = []
    device_samples = []
    for trial_idx in range(trials):
        # NOTE(review): the box is presumably filled in by naive_timer when
        # the ``with`` block exits -- confirm against test_compiler_util.
        timing_box = test_compiler_util.DurationBox(-1)
        with test_compiler_util.naive_timer(timing_box, compiler.synchronize):
            begin_marker = torch.cuda.Event(enable_timing=True)
            finish_marker = torch.cuda.Event(enable_timing=True)
            begin_marker.record()
            model_call()
            finish_marker.record()
            # Wait for the device so both events have completed before the
            # elapsed time between them is queried below.
            compiler.synchronize()

        device_elapsed = begin_marker.elapsed_time(finish_marker)
        wall_clock_samples.append(timing_box.value)
        device_samples.append(device_elapsed)

        summary = (
            f"Trial {trial_idx + 1}: e2e={timing_box.value:.5f} ms, "
            f"gpu={device_elapsed:.5f} ms"
        )
        print(summary, file=sys.stderr, flush=True)
    return wall_clock_samples, device_samples
55+
56+
def run_non_cuda_benchmark_timer(model_call, trials, compiler):
    """Benchmark ``model_call`` for ``trials`` iterations without CUDA events.

    Every iteration runs inside ``test_compiler_util.naive_timer``; the value
    left in the duration box afterwards is collected and also echoed to
    stderr as the trial's "e2e" time.

    Args:
        model_call: Zero-argument callable that runs the model once.
        trials: Number of timed iterations to run.
        compiler: Object whose ``synchronize`` method is passed to the timer.

    Returns:
        List of per-trial end-to-end timings, in trial order.
    """
    collected = []
    for trial_idx in range(trials):
        # NOTE(review): the box is presumably filled in by naive_timer when
        # the ``with`` block exits -- confirm against test_compiler_util.
        timing_box = test_compiler_util.DurationBox(-1)
        with test_compiler_util.naive_timer(timing_box, compiler.synchronize):
            model_call()
        collected.append(timing_box.value)

        summary = f"Trial {trial_idx + 1}: e2e={timing_box.value:.5f} ms"
        print(summary, file=sys.stderr, flush=True)
    return collected
0 commit comments