
Commit 9de9f14

enhance upsample and scatter related cases
1 parent e5197a4 commit 9de9f14

30 files changed, +1441 -937 lines changed

test/microbench/adaptive_avg_pool2d.py

Lines changed: 7 additions & 7 deletions
@@ -47,19 +47,19 @@ def Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -95,14 +95,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/avg_pool2d.py

Lines changed: 7 additions & 7 deletions
@@ -54,19 +54,19 @@ def AVGPool2d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             AVGPool2d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         AVGPool2d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -105,14 +105,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/avg_pool3d.py

Lines changed: 7 additions & 7 deletions
@@ -54,19 +54,19 @@ def AVGPool3d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             AVGPool3d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         AVGPool3d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -105,14 +105,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/batch_norm_1d.py

Lines changed: 19 additions & 21 deletions
@@ -7,34 +7,28 @@
 backward = True


-def BTN1d(shape, dtype, backward, device):
-    input = torch.randn(shape[0], device=device, dtype=dtype)
-    if backward:
-        input.requires_grad_(True)
-
-    m = torch.nn.BatchNorm1d(shape[1], device=device)
+def BTN1d(m, input, backward, device):
     output = m(input)
-
     if backward:
         gy = torch.empty_like(output)
         output.backward(gy)

-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(m, input, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            BTN1d(shape, dtype, backward, device)
+        for i in range(num_iter):
+            BTN1d(m, input, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(m, input, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        BTN1d(shape, dtype, backward, device)
+    for i in range(num_iter):
+        BTN1d(m, input, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -44,8 +38,12 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape[0], device=args.device, dtype=dtype)
+            if backward:
+                input.requires_grad_(True)
+            m = torch.nn.BatchNorm1d(shape[1], device=args.device)
             # warm up
-            BTN1d(shape, dtype, backward, args.device)
+            BTN1d(m, input, backward, args.device)

             # go
             print(
@@ -60,21 +58,21 @@ def benchmark(args):
             )

             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(m, input, backward, args.device, args.num_iter)

             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(m, input, backward, args.device, args.num_iter)

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
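The batch_norm_1d.py change above hoists tensor and module construction out of the timed helper, so BTN1d(m, input, ...) measures only the forward and backward pass. A minimal, self-contained sketch of that pattern follows; the shape pair, dtype, and device choice are illustrative assumptions, not values taken from this commit.

import torch

# Illustrative shape entry, mirroring the (input_shape, num_features) pairs used by shape_list.
shape = ((256, 1024), 1024)
dtype = torch.float32
device = "cpu"  # swap for "xpu" or "cuda" to match the benchmarks above

# Setup happens once, outside the measured region (as benchmark() now does).
input = torch.randn(shape[0], device=device, dtype=dtype)
input.requires_grad_(True)
m = torch.nn.BatchNorm1d(shape[1], device=device)

# This is the work BTN1d(m, input, backward, device) times: forward plus backward.
output = m(input)
output.backward(torch.empty_like(output))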

test/microbench/batch_norm_2d.py

Lines changed: 7 additions & 7 deletions
@@ -39,19 +39,19 @@ def BTN2d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             BTN2d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         BTN2d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -88,14 +88,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/batch_norm_3d.py

Lines changed: 7 additions & 7 deletions
@@ -39,19 +39,19 @@ def BTN3d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             BTN3d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         BTN3d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -87,14 +87,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/col2im.py

Lines changed: 19 additions & 20 deletions
@@ -13,34 +13,29 @@
 dilation = (6, 6)
 backward = True

-def Col2im(shape, dtype, backward, device):
-    input = torch.randn(shape[0], dtype=dtype, device=device, requires_grad=True)
-    if backward:
-        input.requires_grad_(True)
-    output_size = shape[1]
-
+def Col2im(input, output_size, kernel_size, dilation, backward, device):
     output = torch.nn.functional.fold(
         input, output_size, kernel_size, dilation, 1, 1
     )
     if backward:
         torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))

-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(input, output_size, kernel_size, dilation, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Col2im(shape, dtype, backward, device)
+        for i in range(num_iter):
+            Col2im(input, output_size, kernel_size, dilation, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(input, output_size, kernel_size, dilation, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Col2im(shape, dtype, backward, device)
+    for i in range(num_iter):
+        Col2im(input, output_size, kernel_size, dilation, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -50,8 +45,12 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape[0], dtype=dtype, device=args.device, requires_grad=True)
+            if backward:
+                input.requires_grad_(True)
+            output_size = shape[1]
             # warm up
-            Col2im(shape, dtype, backward, args.device)
+            Col2im(input, output_size, kernel_size, dilation, backward, args.device)

             # go
             print(
@@ -65,21 +64,21 @@ def benchmark(args):
                 backward,
             )
             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(input, output_size, kernel_size, dilation, backward, args.device, args.num_iter)

             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(input, output_size, kernel_size, dilation, backward, args.device, args.num_iter)

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
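The col2im.py change follows the same pattern: the column tensor and fold parameters are prepared once in benchmark(), so the timed Col2im call only runs fold() and its backward. A rough sketch of building a consistent input for that call; kernel_size, output_size, batch size, channel count, dtype, and device here are assumptions for illustration, while dilation = (6, 6) comes from the context lines above (kernel_size itself is defined elsewhere in col2im.py, outside this hunk).

import torch
import torch.nn.functional as F

kernel_size = (3, 3)    # assumed value for illustration
dilation = (6, 6)       # from the file above
output_size = (32, 32)  # assumed output height/width
device = "cpu"          # swap for "xpu" or "cuda" to match the benchmarks above
dtype = torch.float32

# Unfolding a dummy image with the same kernel/dilation/padding/stride guarantees
# the column count matches what fold() expects for this output_size.
img = torch.randn(2, 4, *output_size, device=device, dtype=dtype)
cols = F.unfold(img, kernel_size, dilation, 1, 1).requires_grad_(True)

# Body of the refactored Col2im helper: fold, then backward via autograd.grad.
output = F.fold(cols, output_size, kernel_size, dilation, 1, 1)
torch.autograd.grad(output, cols, grad_outputs=torch.ones_like(output))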
