intel
diff --git a/test/microbench/distance.pdist.py b/test/microbench/distance.pdist.py
Lines changed: 18 additions & 19 deletions
diff --git a/test/microbench/distribution.bernoulli.py b/test/microbench/distribution.bernoulli.py
Lines changed: 18 additions & 18 deletions
diff --git a/test/microbench/distribution.cauchy.py b/test/microbench/distribution.cauchy.py
Lines changed: 16 additions & 16 deletions
diff --git a/test/microbench/distribution.exponential.py b/test/microbench/distribution.exponential.py
Lines changed: 16 additions & 16 deletions
diff --git a/test/microbench/distribution.geometric.py b/test/microbench/distribution.geometric.py
Lines changed: 16 additions & 16 deletions
@@ -7,32 +7,28 @@
 backward_shape_list = [(256, 256), (256, 8192), (16, 8192 * 4)]
 
 
-def Pdist(shape, dtype, backward, device):
-    input = torch.rand(shape, device=device, dtype=dtype)
-    if backward:
-        input.requires_grad_(True)
-
+def Pdist(input, backward, device):
     b = torch.nn.functional.pdist(input, 2)
     if backward:
         gy = torch.empty_like(b)
         b.backward(gy)
 
-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(input, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, 
+        activities=[ProfilerActivity.CPU,
                   ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Pdist(shape, dtype, backward, device)
+        for i in range(num_iter):
+            Pdist(input, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(input, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Pdist(shape, dtype, backward, device)
+    for i in range(num_iter):
+        Pdist(input, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -44,27 +40,30 @@ def benchmark(args):
         shape_list = backward_shape_list if backward else forward_shape_list
         for shape in shape_list:
             for dtype in [torch.float32]:
+                input = torch.rand(shape, device=args.device, dtype=dtype)
+                if backward:
+                    input.requires_grad_(True)
                 # warm up
-                Pdist(shape, dtype, backward, args.device)
+                Pdist(input, backward, args.device)
 
                 # go
                 print("shape:", shape, "; datatype:", dtype, "; backward:", backward)
                 if not args.e2e_only:
-                    run_profile(shape, dtype, backward, args.device, args.num_iter)
+                    run_profile(input, backward, args.device, args.num_iter)
 
                 if not args.profile_only:
-                    run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                    run_e2e(input, backward, args.device, args.num_iter)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu', 
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true', 
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true', 
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20, 
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
 
 
@@ -7,28 +7,25 @@
 backward = False
 
 
-def Bernoulli(shape, dtype, backward, p, device):
-    input = torch.zeros(
-        shape, dtype=torch.bfloat16, device=device
-    )
+def Bernoulli(input, p, device):
     input.bernoulli_(p)
 
-def run_profile(shape, dtype, backward, p, device, num_iter):
+def run_profile(input, p, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, 
+        activities=[ProfilerActivity.CPU,
                   ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Bernoulli(shape, dtype, backward, p, device)
+        for i in range(num_iter):
+            Bernoulli(input, p, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-def run_e2e(shape, dtype, backward, p, device, num_iter):
+def run_e2e(input, p, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Bernoulli(shape, dtype, backward, p, device)
+    for i in range(num_iter):
+        Bernoulli(input, p, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -39,8 +36,11 @@ def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
             for p in [0.5, torch.tensor(0.5)]:
+                input = torch.zeros(
+                    shape, dtype=dtype, device=args.device
+                )
                 # warm up
-                Bernoulli(shape, dtype, backward, p, args.device)
+                Bernoulli(input, p, args.device)
 
                 # go
                 print(
@@ -54,21 +54,21 @@ def benchmark(args):
                     backward,
                 )
                 if not args.e2e_only:
-                    run_profile(shape, dtype, backward, p, args.device, args.num_iter)
+                    run_profile(input, p, args.device, args.num_iter)
 
                 if not args.profile_only:
-                    run_e2e(shape, dtype, backward, p, args.device, args.num_iter)
+                    run_e2e(input, p, args.device, args.num_iter)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu', 
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true', 
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true', 
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20, 
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
 
 
@@ -7,26 +7,25 @@
 backward = False
 
 
-def Cauchy(shape, dtype, backward, device):
-    input = torch.randn(shape, dtype=dtype, device=device)
+def Cauchy(input, device):
     input.cauchy_()
 
-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(input, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, 
+        activities=[ProfilerActivity.CPU,
                   ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Cauchy(shape, dtype, backward, device)
+        for i in range(num_iter):
+            Cauchy(input, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(input, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Cauchy(shape, dtype, backward, device)
+    for i in range(num_iter):
+        Cauchy(input, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -36,27 +35,28 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape, dtype=dtype, device=args.device)
             # warm up
-            Cauchy(shape, dtype, backward, args.device)
+            Cauchy(input, args.device)
 
             # go
             print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(input, args.device, args.num_iter)
 
             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(input, args.device, args.num_iter)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu', 
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true', 
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true', 
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20, 
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
 
 
@@ -7,26 +7,25 @@
 backward = False
 
 
-def Exponential(shape, dtype, backward, device):
-    input = torch.randn(shape, dtype=dtype, device=device)
+def Exponential(input, device):
     input.exponential_(0.5)
 
-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(input, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, 
+        activities=[ProfilerActivity.CPU,
                   ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Exponential(shape, dtype, backward, device)
+        for i in range(num_iter):
+            Exponential(input, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(input, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Exponential(shape, dtype, backward, device)
+    for i in range(num_iter):
+        Exponential(input, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -36,27 +35,28 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape, dtype=dtype, device=args.device)
             # warm up
-            Exponential(shape, dtype, backward, args.device)
+            Exponential(input, args.device)
 
             # go
             print("shape:", (shape), "; datatype:", dtype, "; backward:", backward)
             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(input, args.device, args.num_iter)
 
             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(input, args.device, args.num_iter)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu', 
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true', 
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true', 
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20, 
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()
 
 
@@ -7,26 +7,25 @@
 backward = False
 
 
-def Geometric(shape, dtype, backward, device):
-    input = torch.randn(shape, dtype=torch.bfloat16, device=device)
+def Geometric(input, device):
     input.geometric_(0.5)
 
-def run_profile(shape, dtype, backward, device, num_iter):
+def run_profile(input, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, 
+        activities=[ProfilerActivity.CPU,
                   ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
-            Geometric(shape, dtype, backward, device)
+        for i in range(num_iter):
+            Geometric(input, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-def run_e2e(shape, dtype, backward, device, num_iter):
+def run_e2e(input, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
-        Geometric(shape, dtype, backward, device)
+    for i in range(num_iter):
+        Geometric(input, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
@@ -36,8 +35,9 @@ def run_e2e(shape, dtype, backward, device, num_iter):
 def benchmark(args):
     for shape in shape_list:
         for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            input = torch.randn(shape, dtype=dtype, device=args.device)  # use the loop dtype so each reported run measures that dtype
             # warm up
-            Geometric(shape, dtype, backward, args.device)
+            Geometric(input, args.device)
 
             # go
             print(
@@ -51,21 +51,21 @@ def benchmark(args):
                 backward,
             )
             if not args.e2e_only:
-                run_profile(shape, dtype, backward, args.device, args.num_iter)
+                run_profile(input, args.device, args.num_iter)
 
             if not args.profile_only:
-                run_e2e(shape, dtype, backward, args.device, args.num_iter)
+                run_e2e(input, args.device, args.num_iter)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='OP Benchmark')
-    parser.add_argument('--device', type=str, default='xpu', 
+    parser.add_argument('--device', type=str, default='xpu',
                         help='Device to run on (e.g., "cpu", "cuda", "xpu")')
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--profile-only', action='store_true', 
+    group.add_argument('--profile-only', action='store_true',
                        help='Only Run profile timing')
-    group.add_argument('--e2e-only', action='store_true', 
+    group.add_argument('--e2e-only', action='store_true',
                        help='Only Run E2E timing')
-    parser.add_argument('--num-iter', type=int, default=20, 
+    parser.add_argument('--num-iter', type=int, default=20,
                         help='Number of iterations')
     return parser.parse_args()