Commit 490abb6

align the lint check
1 parent: 4e7733a


73 files changed (+2763, −1353 lines)

test/microbench/avg_pool2d.py

Lines changed: 44 additions & 19 deletions
@@ -1,5 +1,6 @@
-import time
 import argparse
+import time
+
 import torch
 from torch.profiler import profile, ProfilerActivity

@@ -52,28 +53,33 @@ def AVGPool2d(shape, dtype, channels_last, backward, device):
     if backward:
         output[0].backward(grad)

+
 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
-                    ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
         record_shapes=True,
     ) as prof:
         for i in range(num_iter):
             AVGPool2d(shape, dtype, channels_last, backward, device)
-    print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
     for i in range(num_iter):
         AVGPool2d(shape, dtype, channels_last, backward, device)
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")

+
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
@@ -98,24 +104,43 @@ def benchmark(args):
                 )

                 if not args.e2e_only:
-                    run_profile(shape, dtype, channels_last, backward, args.device, args.num_iter)
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )

                 if not args.profile_only:
-                    run_e2e(shape, dtype, channels_last, backward, args.device, args.num_iter)
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+

 def parse_args():
-    parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
-                        help='Device to run on (e.g., "cpu", "cuda", "xpu")')
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
-                       help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
-                       help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
-                        help='Number of iterations')
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
     return parser.parse_args()

+
 if __name__ == "__main__":
     args = parse_args()
     benchmark(args)
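
Note: every file in this commit shares the same run_profile structure; only the op under test changes. The following is a minimal, self-contained sketch of that pattern, reduced to CPU-only profiling of a stand-in avg_pool2d call so it runs without an XPU or CUDA device; the tensor shape and iteration count are placeholders, not the benchmark's own.

import torch
from torch.profiler import profile, ProfilerActivity


def run_profile(num_iter=5):
    # CPU-only profiling of a stand-in op; the real scripts also add an
    # XPU or CUDA activity depending on --device.
    x = torch.randn(8, 16, 128, 128)
    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        for _ in range(num_iter):
            torch.nn.functional.avg_pool2d(x, kernel_size=3)
    print(prof.key_averages().table(sort_by="cpu_time_total"))


if __name__ == "__main__":
    run_profile()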

test/microbench/avg_pool3d.py

Lines changed: 44 additions & 19 deletions
@@ -1,5 +1,6 @@
-import time
 import argparse
+import time
+
 import torch
 from torch.profiler import profile, ProfilerActivity

@@ -52,28 +53,33 @@ def AVGPool3d(shape, dtype, channels_last, backward, device):
     if backward:
         output[0].backward(grad)

+
 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
-                    ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
         record_shapes=True,
     ) as prof:
         for i in range(num_iter):
             AVGPool3d(shape, dtype, channels_last, backward, device)
-    print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
     for i in range(num_iter):
         AVGPool3d(shape, dtype, channels_last, backward, device)
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")

+
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
@@ -98,24 +104,43 @@ def benchmark(args):
                 )

                 if not args.e2e_only:
-                    run_profile(shape, dtype, channels_last, backward, args.device, args.num_iter)
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )

                 if not args.profile_only:
-                    run_e2e(shape, dtype, channels_last, backward, args.device, args.num_iter)
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+

 def parse_args():
-    parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
-                        help='Device to run on (e.g., "cpu", "cuda", "xpu")')
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
-                       help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
-                       help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
-                        help='Number of iterations')
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
     return parser.parse_args()

+
 if __name__ == "__main__":
     args = parse_args()
     benchmark(args)
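
The E2E half of each script follows the same shape: synchronize, time a loop, synchronize again, report the per-iteration average. A compact sketch under the assumption of a stand-in avg_pool3d call and a CPU default device; the synchronize branches only run when an accelerator device is selected.

import time

import torch


def run_e2e(device="cpu", num_iter=20):
    # Stand-in op: avg_pool3d on a small 5D tensor; the real scripts call AVGPool3d.
    x = torch.randn(4, 8, 16, 32, 32, device=device)
    # Drain any pending accelerator work before starting the clock.
    if device in ["xpu", "cuda"]:
        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
    t1 = time.time()
    for _ in range(num_iter):
        torch.nn.functional.avg_pool3d(x, kernel_size=3)
    # Wait for the last kernel to finish so the elapsed time is meaningful.
    if device in ["xpu", "cuda"]:
        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
    t2 = time.time()
    print("E2E total time:", f"{float((t2 - t1) / num_iter):.20f}")


if __name__ == "__main__":
    run_e2e()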

test/microbench/batch_norm_1d.py

Lines changed: 28 additions & 17 deletions
@@ -1,5 +1,6 @@
-import time
 import argparse
+import time
+
 import torch
 from torch.profiler import profile, ProfilerActivity

@@ -13,28 +14,33 @@ def BTN1d(m, input, backward, device):
         gy = torch.empty_like(output)
         output.backward(gy)

+
 def run_profile(m, input, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
-                    ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
         record_shapes=True,
     ) as prof:
         for i in range(num_iter):
             BTN1d(m, input, backward, device)
-    print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+

 def run_e2e(m, input, backward, device, num_iter):
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
     for i in range(num_iter):
         BTN1d(m, input, backward, device)
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")

+
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
@@ -63,19 +69,24 @@ def benchmark(args):
             if not args.profile_only:
                 run_e2e(m, input, backward, args.device, args.num_iter)

+
 def parse_args():
-    parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
-                        help='Device to run on (e.g., "cpu", "cuda", "xpu")')
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
-                       help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
-                       help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
-                        help='Number of iterations')
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
     return parser.parse_args()

+
 if __name__ == "__main__":
     args = parse_args()
     benchmark(args)
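
For readers skimming the per-line diffs: every one of these scripts converges on the same parse_args block after the reformat. Assembled in one piece from the + lines above (only the import and the __main__ guard are added here so the snippet runs on its own):

import argparse


def parse_args():
    # Same CLI surface as the benchmark scripts in this commit.
    parser = argparse.ArgumentParser(description="OP Benchmark")
    parser.add_argument(
        "--device",
        type=str,
        default="xpu",
        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--profile-only", action="store_true", help="Only Run profile timing"
    )
    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
    return parser.parse_args()


if __name__ == "__main__":
    print(parse_args())  # e.g. Namespace(device='xpu', profile_only=False, ...)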

test/microbench/batch_norm_2d.py

Lines changed: 44 additions & 19 deletions
@@ -1,5 +1,6 @@
-import time
 import argparse
+import time
+
 import torch
 from torch.profiler import profile, ProfilerActivity

@@ -37,28 +38,33 @@ def BTN2d(shape, dtype, channels_last, backward, device):
     if backward:
         output[0].backward(grad)

+
 def run_profile(shape, dtype, channels_last, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU,
-                    ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
+        activities=[
+            ProfilerActivity.CPU,
+            ProfilerActivity.XPU if device == "xpu" else ProfilerActivity.CUDA,
+        ],
         record_shapes=True,
     ) as prof:
         for i in range(num_iter):
             BTN2d(shape, dtype, channels_last, backward, device)
-    print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
+    print(prof.key_averages().table(sort_by=f"{device}_time_total"))
+

 def run_e2e(shape, dtype, channels_last, backward, device, num_iter):
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t1 = time.time()
     for i in range(num_iter):
         BTN2d(shape, dtype, channels_last, backward, device)
-    if device in ['xpu', 'cuda']:
-        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    if device in ["xpu", "cuda"]:
+        torch.xpu.synchronize() if device == "xpu" else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")

+
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
@@ -81,24 +87,43 @@ def benchmark(args):
                 )

                 if not args.e2e_only:
-                    run_profile(shape, dtype, channels_last, backward, args.device, args.num_iter)
+                    run_profile(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )

                 if not args.profile_only:
-                    run_e2e(shape, dtype, channels_last, backward, args.device, args.num_iter)
+                    run_e2e(
+                        shape,
+                        dtype,
+                        channels_last,
+                        backward,
+                        args.device,
+                        args.num_iter,
+                    )
+

 def parse_args():
-    parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu',
-                        help='Device to run on (e.g., "cpu", "cuda", "xpu")')
+    parser = argparse.ArgumentParser(description="OP Benchmark")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="xpu",
+        help='Device to run on (e.g., "cpu", "cuda", "xpu")',
+    )
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true',
-                       help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true',
-                       help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20,
-                        help='Number of iterations')
+    group.add_argument(
+        "--profile-only", action="store_true", help="Only Run profile timing"
+    )
+    group.add_argument("--e2e-only", action="store_true", help="Only Run E2E timing")
+    parser.add_argument("--num-iter", type=int, default=20, help="Number of iterations")
     return parser.parse_args()

+
 if __name__ == "__main__":
     args = parse_args()
     benchmark(args)
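
The BTN2d body itself is outside the hunks shown above, so the following is only a hypothetical reconstruction of the kind of call batch_norm_2d.py times: a BatchNorm2d forward in an optional channels_last layout followed by a backward pass. The function name, placeholder shape, and gradient handling here are assumptions for illustration, not the script's own code.

import torch


def btn2d_step(dtype=torch.float32, channels_last=False, backward=True, device="cpu"):
    # Hypothetical stand-in for one benchmarked iteration (not the script's BTN2d).
    shape = (16, 32, 64, 64)  # placeholder shape; the real script loops over shape_list
    x = torch.randn(shape, dtype=dtype, device=device, requires_grad=True)
    if channels_last:
        # Switch to NHWC memory layout, as the channels_last loop variable suggests.
        x = x.to(memory_format=torch.channels_last)
    bn = torch.nn.BatchNorm2d(shape[1], device=device, dtype=dtype)
    output = bn(x)
    if backward:
        grad = torch.randn_like(output)
        output.backward(grad)


if __name__ == "__main__":
    btn2d_step()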
