@@ -36,8 +36,8 @@ def _summarize_statistics(times, quantiles, return_mode):
     return getattr(torch, return_mode)(times).item()


-def do_bench_ipex(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flush=True, return_mode="mean",
-                  device="xpu", sync_submitting=True, kernel_name=None):  # pylint: disable=unused-argument
+def do_bench_ipex(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean", device="xpu",
+                  sync_submitting=True, kernel_name=None):  # pylint: disable=unused-argument
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
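
For context outside the diff: the `_summarize_statistics` helper visible in this hunk reduces the per-iteration timings with `getattr(torch, return_mode)`, so a single line dispatches to `torch.min` / `torch.max` / `torch.mean` / `torch.median`. Below is a minimal sketch of that dispatch; the quantile branch is an assumption inferred from the docstring, not copied from the file:

```python
import torch

def summarize(times, quantiles=None, return_mode="mean"):
    # Mirrors the `return getattr(torch, return_mode)(times).item()` line
    # visible in the hunk above; the quantile handling is assumed.
    if quantiles is not None:
        q = torch.tensor(quantiles, dtype=torch.float)
        return torch.quantile(times, q).tolist()  # e.g. [p20, p50, p80]
    return getattr(torch, return_mode)(times).item()

print(summarize(torch.tensor([1.0, 2.0, 3.0]), return_mode="median"))  # 2.0
```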
@@ -52,8 +52,6 @@ def do_bench_ipex(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fas
     :type grad_to_none: torch.tensor, optional
     :param quantiles: Performance percentile to return in addition to the median.
     :type quantiles: list[float]
-    :param fast_flush: Use faster kernel to flush L2 between measurements
-    :type fast_flush: bool
     """
     # TODO: remove this function and switch to `do_bench_no_ipex` after
     # `XPUEvent.elapsed_time` stops introducing regressions into the results.
@@ -69,10 +67,7 @@ def do_bench_ipex(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fas
     # before each kernel call to make sure that the L2
     # doesn't contain any input data before the run
     cache_size = 256 * 1024 * 1024
-    if fast_flush:
-        cache = torch.empty(int(cache_size // 4), dtype=torch.int, device=device)
-    else:
-        cache = torch.empty(int(cache_size), dtype=torch.int8, device=device)
+    cache = torch.empty(int(cache_size // 4), dtype=torch.int, device=device)

     # Estimate the runtime of the function
     start_event = torch.xpu.Event(enable_timing=True)
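
The change above drops the slow `int8` flush path and keeps only the fast one: a 256 MiB buffer of 4-byte ints that is overwritten between measurements so no input data survives in L2. A sketch of the pattern follows; the `zero_()` flush write is assumed from the surrounding comment (it sits outside this hunk), and running it requires an XPU-enabled PyTorch build:

```python
import torch

cache_size = 256 * 1024 * 1024  # 256 MiB, larger than the L2 of current GPUs
# 4-byte ints -> cache_size // 4 elements; one buffer reused across iterations
cache = torch.empty(cache_size // 4, dtype=torch.int, device="xpu")

def flush_l2():
    cache.zero_()  # touch every cache line so prior inputs are evicted
```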
@@ -126,8 +121,8 @@ def extract_kernels(funcs):
     return _summarize_statistics(times, quantiles, return_mode)


-def do_bench_elapsed_time(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flush=True,
-                          return_mode="mean", device="xpu", kernel_name=None):  # pylint: disable=unused-argument
+def do_bench_elapsed_time(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean", device="xpu",
+                          kernel_name=None):  # pylint: disable=unused-argument
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
@@ -142,21 +137,19 @@ def do_bench_elapsed_time(fn, warmup=25, rep=100, grad_to_none=None, quantiles=N
     :type grad_to_none: torch.tensor, optional
     :param quantiles: Performance percentile to return in addition to the median.
     :type quantiles: list[float]
-    :param fast_flush: Use faster kernel to flush L2 between measurements
-    :type fast_flush: bool
     """
     assert return_mode in ["min", "max", "mean", "median"]
     import torch
     from triton.testing import do_bench as triton_do_bench

-    times = triton_do_bench(fn, warmup=warmup, rep=rep, grad_to_none=grad_to_none, fast_flush=fast_flush,
-                            return_mode="all", device_type=device)
+    times = triton_do_bench(fn, warmup=warmup, rep=rep, grad_to_none=grad_to_none, return_mode="all",
+                            device_type=device)
     times = torch.tensor(times, dtype=torch.float)
     return _summarize_statistics(times, quantiles, return_mode)


-def do_bench_upstream_pytorch_profiler(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flush=True,
-                                       return_mode="mean", device="xpu", sync_submitting=True, kernel_name=None):
+def do_bench_upstream_pytorch_profiler(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean",
+                                       device="xpu", sync_submitting=True, kernel_name=None):
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
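
A hypothetical call site after this change, showing that only `fast_flush=` disappears from the signature while everything else keeps its old meaning; the matmul workload and the three-quantile return shape are illustrative assumptions:

```python
import torch

x = torch.randn(4096, 4096, device="xpu")

# Median runtime in milliseconds
ms = do_bench_elapsed_time(lambda: x @ x, warmup=25, rep=100, return_mode="median")

# Requesting quantiles returns one value per percentile instead
p20, p50, p80 = do_bench_elapsed_time(lambda: x @ x, quantiles=[0.2, 0.5, 0.8])
```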
@@ -171,8 +164,6 @@ def do_bench_upstream_pytorch_profiler(fn, warmup=25, rep=100, grad_to_none=None
     :type grad_to_none: torch.tensor, optional
     :param quantiles: Performance percentile to return in addition to the median.
     :type quantiles: list[float]
-    :param fast_flush: Use faster kernel to flush L2 between measurements
-    :type fast_flush: bool
     """

     assert return_mode in ["min", "max", "mean", "median"]
@@ -186,10 +177,7 @@ def do_bench_upstream_pytorch_profiler(fn, warmup=25, rep=100, grad_to_none=None
     # before each kernel call to make sure that the L2
     # doesn't contain any input data before the run
     cache_size = 256 * 1024 * 1024
-    if fast_flush:
-        cache = torch.empty(int(cache_size // 4), dtype=torch.int, device=device)
-    else:
-        cache = torch.empty(int(cache_size), dtype=torch.int8, device=device)
+    cache = torch.empty(int(cache_size // 4), dtype=torch.int, device=device)

     # Estimate the runtime of the function
     start_event = torch.xpu.Event(enable_timing=True)
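
For readers unfamiliar with the event objects created at the end of both flush hunks, this is the usual XPU event-timing pattern. It is a sketch only: the actual measuring loop lies outside the diff, and the TODO above notes that `XPUEvent.elapsed_time` itself has introduced regressions into results:

```python
import torch

x = torch.randn(1024, 1024, device="xpu")
fn = lambda: x @ x  # stand-in for the benchmarked function

start_event = torch.xpu.Event(enable_timing=True)
end_event = torch.xpu.Event(enable_timing=True)

start_event.record()
fn()
end_event.record()
torch.xpu.synchronize()                    # wait for the queued work to finish
ms = start_event.elapsed_time(end_event)   # milliseconds between the two events
```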