
Commit 9de9f14

enhance upsample and scatter related cases
1 parent e5197a4 commit 9de9f14

30 files changed, +1441 -937 lines changed

test/microbench/adaptive_avg_pool2d.py

Lines changed: 7 additions & 7 deletions
@@ -47,19 +47,19 @@ def Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         Adaptive_AVGPool2d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -95,14 +95,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/avg_pool2d.py

Lines changed: 7 additions & 7 deletions
@@ -54,19 +54,19 @@ def AVGPool2d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             AVGPool2d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         AVGPool2d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -105,14 +105,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/avg_pool3d.py

Lines changed: 7 additions & 7 deletions
@@ -54,19 +54,19 @@ def AVGPool3d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             AVGPool3d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         AVGPool3d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -105,14 +105,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/batch_norm_1d.py

Lines changed: 19 additions & 21 deletions
@@ -7,34 +7,28 @@
 backward = True


-def BTN1d(shape, dtype, backward, device):
-    input = torch.randn(shape[0], device=device, dtype=dtype)
-    if backward:
-        input.requires_grad_(True)
-
-    m = torch.nn.BatchNorm1d(shape[1], device=device)
+def BTN1d(m, input, backward, device):
     output = m(input)
-
     if backward:
         gy = torch.empty_like(output)
         output.backward(gy)

-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(m, input, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            BTN1d(shape, dtype, backward, device)
+        for i in range(num_iter):
+            BTN1d(m, input, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(m, input, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        BTN1d(shape, dtype, backward, device)
+    for i in range(num_iter):
+        BTN1d(m, input, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -44,8 +38,12 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape[0], device=args.device, dtype=dtype)
+            if backward:
+                input.requires_grad_(True)
+            m = torch.nn.BatchNorm1d(shape[1], device=args.device)
             # warm up
-            BTN1d(shape, dtype, backward, args.device)
+            BTN1d(m, input, backward, args.device)

             # go
             print(
@@ -60,21 +58,21 @@ def benchmark(args):
             )

             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(m, input, backward, args.device, args.num_iter)

             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(m, input, backward, args.device, args.num_iter)

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
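The batch_norm_1d.py change above hoists tensor and module construction out of the timed helper, so BTN1d(m, input, ...) measures only the forward and backward pass. A minimal, self-contained sketch of that pattern follows; the shape pair, dtype, and device choice are illustrative assumptions, not values taken from this commit.

import torch

# Illustrative shape entry, mirroring the (input_shape, num_features) pairs used by shape_list.
shape = ((256, 1024), 1024)
dtype = torch.float32
device = "cpu"  # swap for "xpu" or "cuda" to match the benchmarks above

# Setup happens once, outside the measured region (as benchmark() now does).
input = torch.randn(shape[0], device=device, dtype=dtype)
input.requires_grad_(True)
m = torch.nn.BatchNorm1d(shape[1], device=device)

# This is the work BTN1d(m, input, backward, device) times: forward plus backward.
output = m(input)
output.backward(torch.empty_like(output))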

test/microbench/batch_norm_2d.py

Lines changed: 7 additions & 7 deletions
@@ -39,19 +39,19 @@ def BTN2d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             BTN2d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         BTN2d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -88,14 +88,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/batch_norm_3d.py

Lines changed: 7 additions & 7 deletions
@@ -39,19 +39,19 @@ def BTN3d(shape, dtype, channels_last, backward, device):

 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             BTN3d(shape, dtype, channels_last, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         BTN3d(shape, dtype, channels_last, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
@@ -87,14 +87,14 @@ def benchmark(args):

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()

test/microbench/col2im.py

Lines changed: 19 additions & 20 deletions
@@ -13,34 +13,29 @@
 dilation = (6, 6)
 backward = True

-def Col2im(shape, dtype, backward, device):
-    input = torch.randn(shape[0], dtype=dtype, device=device, requires_grad=True)
-    if backward:
-        input.requires_grad_(True)
-    output_size = shape[1]
-
+def Col2im(input, output_size, kernel_size, dilation, backward, device):
     output = torch.nn.functional.fold(
         input, output_size, kernel_size, dilation, 1, 1
     )
     if backward:
         torch.autograd.grad(output, input, grad_outputs=torch.ones_like(output))

-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(input, output_size, kernel_size, dilation, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
+        activities=[ProfilerActivity.CPU,
                     ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Col2im(shape, dtype, backward, device)
+        for i in range(num_iter):
+            Col2im(input, output_size, kernel_size, dilation, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))

-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(input, output_size, kernel_size, dilation, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Col2im(shape, dtype, backward, device)
+    for i in range(num_iter):
+        Col2im(input, output_size, kernel_size, dilation, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -50,8 +45,12 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape[0], dtype=dtype, device=args.device, requires_grad=True)
+            if backward:
+                input.requires_grad_(True)
+            output_size = shape[1]
             # warm up
-            Col2im(shape, dtype, backward, args.device)
+            Col2im(input, output_size, kernel_size, dilation, backward, args.device)

             # go
             print(
@@ -65,21 +64,21 @@ def benchmark(args):
                 backward,
             )
             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(input, output_size, kernel_size, dilation, backward, args.device, args.num_iter)

             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(input, output_size, kernel_size, dilation, backward, args.device, args.num_iter)

 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
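The col2im.py change follows the same pattern: the column tensor and fold parameters are prepared once in benchmark(), so the timed Col2im call only runs fold() and its backward. A rough sketch of building a consistent input for that call; kernel_size, output_size, batch size, channel count, dtype, and device here are assumptions for illustration, while dilation = (6, 6) comes from the context lines above (kernel_size itself is defined elsewhere in col2im.py, outside this hunk).

import torch
import torch.nn.functional as F

kernel_size = (3, 3)    # assumed value for illustration
dilation = (6, 6)       # from the file above
output_size = (32, 32)  # assumed output height/width
device = "cpu"          # swap for "xpu" or "cuda" to match the benchmarks above
dtype = torch.float32

# Unfolding a dummy image with the same kernel/dilation/padding/stride guarantees
# the column count matches what fold() expects for this output_size.
img = torch.randn(2, 4, *output_size, device=device, dtype=dtype)
cols = F.unfold(img, kernel_size, dilation, 1, 1).requires_grad_(True)

# Body of the refactored Col2im helper: fold, then backward via autograd.grad.
output = F.fold(cols, output_size, kernel_size, dilation, 1, 1)
torch.autograd.grad(output, cols, grad_outputs=torch.ones_like(output))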
