Skip to content

Commit c83c0ed

Browse files
authored
Remove workaround for upstream profiler (#2484)
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 713b5b8 commit c83c0ed

10 files changed

+31
-81
lines changed

benchmarks/triton_kernels_benchmark/benchmark_testing.py

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ def _summarize_statistics(times, quantiles, return_mode):
3737

3838

3939
def do_bench_ipex(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None, return_mode="mean", device="xpu",
40-
sync_submitting=True, kernel_name=None): # pylint: disable=unused-argument
40+
sync_submitting=True):
4141
"""
4242
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
4343
the 20-th and 80-th performance percentile.
@@ -108,7 +108,7 @@ def extract_kernels(funcs):
108108

109109

110110
def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None, return_mode="mean",
111-
device="xpu", kernel_name=None): # pylint: disable=unused-argument
111+
device="xpu"):
112112
"""
113113
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
114114
the 20-th and 80-th performance percentile.
@@ -159,7 +159,7 @@ def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quan
159159

160160

161161
def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None,
162-
return_mode="mean", device="xpu", sync_submitting=True, kernel_name=None):
162+
return_mode="mean", device="xpu", sync_submitting=True):
163163
"""
164164
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
165165
the 20-th and 80-th performance percentile.
@@ -178,7 +178,7 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no
178178

179179
assert return_mode in ["min", "max", "mean", "median"]
180180
import torch
181-
from torch.profiler import profile, ProfilerActivity
181+
from torch.profiler import profile, ProfilerActivity, record_function
182182

183183
fn()
184184
synchronize()
@@ -206,24 +206,24 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no
206206
if sync_submitting:
207207
synchronize()
208208
# record time of `fn`
209-
fn()
209+
with record_function("__profile_kernel_of_func"):
210+
fn()
210211
# Record clocks
211212
synchronize()
212213

213-
function_events = prof.events()
214+
profiling_func_filter = filter(lambda x: x.name.startswith("__profile_kernel_of_func"), prof.events())
215+
functions = list(profiling_func_filter)
214216

215-
all_functions = []
216-
if isinstance(kernel_name, str):
217-
kernel_name = [kernel_name]
218-
for ker_name in kernel_name:
219-
functions = list(filter(lambda x: x.name.startswith(ker_name), function_events)) # pylint: disable=cell-var-from-loop
220-
assert len(functions) == n_repeat, f"the profiling number for kernel: '{ker_name}' not match, {len(functions)}"
221-
all_functions.append(functions)
222-
# profiling_func_filter = filter(lambda x: x.name.startswith("__profile_kernel_of_func"), function_events)
217+
def extract_kernels(funcs):
218+
kernels = []
219+
kernels += list(itertools.chain.from_iterable(map(lambda func: extract_kernels(func.cpu_children), funcs)))
220+
kernels += list(itertools.chain.from_iterable([func.kernels for func in funcs]))
221+
return kernels
223222

223+
kernels = [extract_kernels(func.cpu_children) for func in functions]
224+
assert len(kernels) == n_repeat, "the profiling number not match"
224225
# Make the time to the milliseconds.
225-
times = torch.tensor([sum(map(lambda elem: elem.self_device_time_total, f)) * 1e-3 for f in zip(*all_functions)],
226-
dtype=torch.float)
226+
times = torch.tensor([sum([k.duration for k in ks]) * 1e-3 for ks in kernels], dtype=torch.float)
227227
return _summarize_statistics(times, quantiles, return_mode)
228228

229229

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider):
265265
), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
266266
atol = 1e-1 if N_CTX == 16384 else 1e-2
267267
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch')
268-
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles,
269-
kernel_name='_attn_fwd')
268+
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)
270269

271270
elif provider == 'xetla':
272271
module_name = f'flash_attn_causal_{CAUSAL}'.lower()
@@ -281,8 +280,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider):
281280
l = torch.empty((size_ml, ), device='xpu', dtype=torch.float)
282281

283282
xetla_fn = lambda: func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX, sm_scale)
284-
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles,
285-
kernel_name='gpu::xetla::fmha::FmhaForwardKernel<')
283+
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)
286284

287285
else:
288286
raise NotImplementedError(f'Unsupported provider {provider}')

benchmarks/triton_kernels_benchmark/fused_softmax.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,7 @@ def benchmark(M, N, provider):
131131
triton_fn = lambda: softmax(x, out)
132132
torch_fn = lambda: torch.softmax(x, axis=-1)
133133
benchmark_suit.assert_close(triton_fn(), torch_fn(), err_msg="triton to torch")
134-
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, n_warmup=10, n_repeat=10,
135-
kernel_name="softmax_kernel")
134+
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, n_warmup=10, n_repeat=10)
136135

137136
elif provider == "torch-jit":
138137
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(lambda: naive_softmax(x), quantiles=quantiles,
@@ -145,17 +144,7 @@ def benchmark(M, N, provider):
145144
xetla_fn = lambda: func(x, out, 0)
146145
torch_fn = lambda: torch.softmax(x, axis=-1)
147146
# benchmark_suit.assert_close(xetla_fn(), torch_fn(), err_msg="xetla to torch")
148-
kernels_name = {
149-
"softmax_shape_4096_256": "mat1_4096x256_bf16_cfg0",
150-
"softmax_shape_4096_1024": "mat1_4096x1024_bf16_cfg0",
151-
"softmax_shape_4096_2048": "mat1_4096x2048_bf16_cfg0",
152-
"softmax_shape_4096_4096": "mat1_4096x4096_bf16_cfg0",
153-
"softmax_shape_4096_8192": "mat1_4096x8k_bf16_cfg0",
154-
"softmax_shape_4096_16384": "mat1_4096x16k_bf16_cfg0",
155-
"softmax_shape_4096_32768": "mat1_4096x32k_bf16_cfg0",
156-
}
157-
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, quantiles=quantiles, n_warmup=10, n_repeat=10,
158-
kernel_name=kernels_name[name])
147+
_, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, quantiles=quantiles, n_warmup=10, n_repeat=10)
159148

160149
else:
161150
raise NotImplementedError(f"Unsupported provider {provider}")

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def benchmark(B, M, N, K, provider):
288288
# Legacy profiler shows ~6000TFLOPS GeoMean for onednn measurements, so use more reliable method
289289
do_bench = do_bench_elapsed_time
290290
_, min_ms, max_ms, mean_ms, cv = do_bench(lambda: torch.matmul(torch_a, torch_b), n_warmup=10, n_repeat=10,
291-
quantiles=quantiles, kernel_name='gemm_kernel')
291+
quantiles=quantiles)
292292
elif provider == 'triton':
293293
assert len(a.shape) == len(b.shape), 'Incompatible sizes'
294294
if len(a.shape) == 3:
@@ -301,8 +301,7 @@ def benchmark(B, M, N, K, provider):
301301
rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
302302
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
303303
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
304-
quantiles=quantiles,
305-
kernel_name='matmul_kernel_with_block_pointers')
304+
quantiles=quantiles)
306305
elif provider == 'xetla':
307306
if B == 1:
308307
c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
@@ -329,37 +328,9 @@ def xetla_func_with_acc_allocation():
329328
xetla_fn = xetla_func_with_acc_allocation
330329
torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
331330

332-
kernels_name = {
333-
'gemm_shape_1_1024_1024_1024': 'Test_1x1024x1024x1024_row_row',
334-
'gemm_shape_1_2048_2048_2048': 'Test_1x2048x2048x2048_row_row',
335-
'gemm_shape_1_4096_4096_4096': 'Test_1x4096x4096x4096_row_row',
336-
'gemm_shape_1_8192_8192_8192': 'Test_1x8192x8192x8192_row_row',
337-
'gemm_shape_1_1_5120_13824': 'Test_1x1x5120x13824_row_row',
338-
'gemm_shape_1_4_4096_12288': 'Test_1x4x4096x12288_row_row',
339-
'gemm_shape_1_512_8192_8192': 'Test_1x512x8192x8192_row_row',
340-
'gemm_shape_1_512_8192_32768': 'Test_1x512x8192x32768_row_row',
341-
'gemm_shape_1_512_32768_8192': 'Test_1x512x32768x8192_row_row',
342-
'gemm_shape_1_1024_16384_8192': 'Test_1x1024x16384x8192_row_row',
343-
'gemm_shape_1_1024_28672_8192': 'Test_1x1024x28672x8192_row_row',
344-
'gemm_shape_1_3072_4096_3072': 'Test_1x3072x4096x3072_row_row',
345-
'gemm_shape_1_4096_16384_8192': 'Test_1x4096x16384x8192_row_row',
346-
'gemm_shape_1_8192_16384_1024': 'Test_1x8192x16384x1024_row_row',
347-
'gemm_shape_1_8192_16384_4096': 'Test_1x8192x16384x4096_row_row',
348-
'gemm_shape_1_16384_1024_8192': 'Test_1x16384x1024x8192_row_row',
349-
'gemm_shape_1_16384_4096_8192': 'Test_1x16384x4096x8192_row_row',
350-
'gemm_shape_1_16384_8192_1024': 'Test_1x16384x8192x1024_row_row',
351-
'gemm_shape_1_16384_8192_4096': 'Test_1x16384x8192x4096_row_row',
352-
'gemm_shape_4_32768_128_4096': 'Test_4x32768x128x4096_row_row',
353-
'gemm_shape_4_32768_4096_128': 'Test_4x32768x4096x128_row_row',
354-
'gemm_shape_32_4096_4096_128': 'Test_32x4096x4096x128_row_row',
355-
'gemm_shape_4096_8_128_16384': 'Test_4096x8x128x16384_row_row',
356-
'gemm_shape_4096_8_16384_128': 'Test_4096x8x16384x128_row_row',
357-
'gemm_streamk_shape_3072_4096_3072': 'stream_k_gemm_run',
358-
}
359-
360331
# benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
361332
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10,
362-
quantiles=quantiles, kernel_name=kernels_name[name])
333+
quantiles=quantiles)
363334
else:
364335
raise NotImplementedError(f'Unsupported provider {provider}')
365336

benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -266,17 +266,15 @@ def benchmark(B, M, N, K, provider):
266266
assert len(a.shape) == len(b.shape), 'Incompatible sizes'
267267
if len(a.shape) == 3:
268268
c = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
269-
kernel_name = 'matmul_kernel_with_block_pointers_batched'
270269
else:
271270
assert len(a.shape) == 2, 'Expecting shape of length 2'
272271
c = torch.empty((M, N), device='xpu', dtype=torch.float32)
273-
kernel_name = 'matmul_kernel_with_block_pointers'
274272
triton_fn = lambda: matmul(a, b, d, c)
275273
torch_fn = lambda: torch.matmul(a, b).to(torch.float32) + d
276274
rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
277275
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
278276
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
279-
quantiles=quantiles, kernel_name=kernel_name)
277+
quantiles=quantiles)
280278
else:
281279
raise NotImplementedError(f'Unsupported provider {provider}')
282280

benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,17 +268,15 @@ def benchmark(B, M, N, K, provider):
268268
assert len(a.shape) == len(b.shape), 'Incompatible sizes'
269269
if len(a.shape) == 3:
270270
c = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
271-
kernel_name = 'matmul_kernel_with_block_pointers_batched'
272271
else:
273272
assert len(a.shape) == 2, 'Expecting shape of length 2'
274273
c = torch.empty((M, N), device='xpu', dtype=torch.float32)
275-
kernel_name = 'matmul_kernel_with_block_pointers'
276274
triton_fn = lambda: matmul(a, b, c)
277275
torch_fn = lambda: torch.nn.functional.gelu(torch.matmul(a, b).to(torch.float32))
278276
rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
279277
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
280278
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
281-
quantiles=quantiles, kernel_name=kernel_name)
279+
quantiles=quantiles)
282280
else:
283281
raise NotImplementedError(f'Unsupported provider {provider}')
284282

benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,17 +256,15 @@ def benchmark(B, M, N, K, provider):
256256
assert len(a.shape) == len(b.shape), 'Incompatible sizes'
257257
if len(a.shape) == 3:
258258
c = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
259-
kernel_name = 'matmul_kernel_with_block_pointers_batched'
260259
else:
261260
assert len(a.shape) == 2, 'Expecting shape of length 2'
262261
c = torch.empty((M, N), device='xpu', dtype=torch.float32)
263-
kernel_name = 'matmul_kernel_with_block_pointers'
264262
triton_fn = lambda: matmul(a, b, c)
265263
torch_fn = lambda: torch.matmul(torch.exp(a), b).to(torch.float32)
266264
rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
267265
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
268266
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
269-
quantiles=quantiles, kernel_name=kernel_name)
267+
quantiles=quantiles)
270268
else:
271269
raise NotImplementedError(f'Unsupported provider {provider}')
272270

benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def benchmark(M, N, K, provider):
159159
rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
160160
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
161161
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
162-
quantiles=quantiles, kernel_name='_kernel')
162+
quantiles=quantiles)
163163
elif provider == 'xetla':
164164
c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
165165
acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
@@ -172,7 +172,7 @@ def benchmark(M, N, K, provider):
172172

173173
# benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
174174
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10,
175-
quantiles=quantiles, kernel_name='split_k_gemm_run')
175+
quantiles=quantiles)
176176
else:
177177
raise NotImplementedError(f'Unsupported provider {provider}')
178178

benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,7 @@ def benchmark(M, N, K, provider):
280280
torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
281281
benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=1e-2, err_msg='triton to torch')
282282
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
283-
quantiles=quantiles,
284-
kernel_name=['first_wave', 'full_tiles'])
283+
quantiles=quantiles)
285284
elif provider == 'xetla':
286285
c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
287286
acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
@@ -294,7 +293,7 @@ def benchmark(M, N, K, provider):
294293

295294
# benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
296295
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10,
297-
quantiles=quantiles, kernel_name='stream_k_gemm_run')
296+
quantiles=quantiles)
298297
else:
299298
raise NotImplementedError(f'Unsupported provider {provider}')
300299

benchmarks/triton_kernels_benchmark/prefix_sums.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@ def benchmark(M, N, AXIS, provider):
4444

4545
if provider == 'triton':
4646
triton_fn = lambda: scan_kernel[(1, )](x, BLOCK_SIZE_M=M, BLOCK_SIZE_N=N, AXIS=AXIS)
47-
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles,
48-
kernel_name='scan_kernel')
47+
_, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles)
4948
else:
5049
raise NotImplementedError(f'Unsupported provider {provider}')
5150

0 commit comments

Comments
 (0)