intel
diff --git a/test/microbench/im2col.py b/test/microbench/im2col.py
Lines changed: 2 additions & 2 deletions
diff --git a/test/microbench/loss.binary_cross_entropy.py b/test/microbench/loss.binary_cross_entropy.py
Lines changed: 71 additions & 51 deletions
diff --git a/test/microbench/loss.ctc_loss.py b/test/microbench/loss.ctc_loss.py
Lines changed: 72 additions & 60 deletions
@@ -23,15 +23,15 @@ def run_profile(shape, dtype, backward, device, num_iter):
                   ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
         record_shapes=True,
     ) as prof:
-        for _ in range(num_iter):
+        for i in range(num_iter):
             Im2col(shape, dtype, backward, device)
     print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
 def run_e2e(shape, dtype, backward, device, num_iter):
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for _ in range(num_iter):
+    for i in range(num_iter):
         Im2col(shape, dtype, backward, device)
     if device in ['xpu', 'cuda']:
         torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
 
@@ -1,16 +1,11 @@
 import time
-
+import argparse
 import torch
 import torch.nn as nn
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-backward = True
-num_iter = 20
 shape_list = [(8733, 8733), (8733, 513), (513, 8733), (8192, 8192)]
-
-cache_r = torch.randn(1024 * 1024 * 1024, device=device)
-cache_w = torch.randn(1024 * 1024 * 1024, device=device)
+backward = True
 
 
 def _do_test(loss, input, target, dtype, device):
@@ -20,51 +15,76 @@ def _do_test(loss, input, target, dtype, device):
 
     return output, grad_inputs
 
-
-for shape in shape_list:
-    for dtype in [torch.bfloat16, torch.float16, torch.float32]:
-        M, N = shape[0], shape[1]
-        input = torch.randn((M, N), requires_grad=True)
-        target = torch.empty((M, N)).random_(2)
-        for reduce in ["none", "mean", "sum"]:
-            loss = nn.BCELoss(reduce=reduce)
-            m = nn.Sigmoid()
-            input = m(input).to(dtype=dtype, device=device)
-            target = target.to(dtype=dtype, device=device)
-            # warm up
+def run_profile(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter):
+    with profile(
+        activities=[ProfilerActivity.CPU, 
+                  ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
+        record_shapes=True,
+    ) as prof:
+        for _ in range(num_iter):
+            cache_r = cache_w + 1
             _do_test(loss, input, target, dtype, device)
+    print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-            # go
-            print(
-                "shape:",
-                (M, N),
-                "; datatype:",
-                dtype,
-                "; reduce:",
-                reduce,
-                "; backward:",
-                backward,
-            )
-            with profile(
-                activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
-                record_shapes=True,
-            ) as prof:
-                for i in range(num_iter):
-                    cache_r = cache_w + 1
-                    output_xpu, grad_input_xpu = _do_test(
-                        loss, input, target, dtype, device
-                    )
-            print(prof.key_averages().table(sort_by="xpu_time_total"))
+def run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, device, num_iter):
+    if device in ['xpu', 'cuda']:
+        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    t1 = time.time()
+    for _ in range(num_iter):
+        cache_r = cache_w + 1
+        _do_test(loss, input, target, dtype, device)
+    if device in ['xpu', 'cuda']:
+        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
+    t2 = time.time()
+    e2e_time = (t2 - t1) / num_iter
+    print("E2E total time:", f"{float(e2e_time):.20f}")
 
-            # E2E time
-            torch.xpu.synchronize()
-            t1 = time.time()
-            for i in range(num_iter):
-                cache_r = cache_w + 1
-                output_xpu, grad_input_xpu = _do_test(
-                    loss, input, target, dtype, device
+def benchmark(args):  # sweep shape x dtype x reduction, running the timings selected by args
+    cache_r = torch.randn(1024 * 1024 * 1024, device=args.device)  # loop-invariant scratch buffers:
+    cache_w = torch.randn(1024 * 1024 * 1024, device=args.device)  # allocate once, not per shape/dtype
+    for shape in shape_list:
+        for dtype in [torch.bfloat16, torch.float16, torch.float32]:
+            M, N = shape[0], shape[1]
+            input = torch.randn((M, N), requires_grad=True)
+            target = torch.empty((M, N)).random_(2)
+            for reduce in ["none", "mean", "sum"]:
+                loss = nn.BCELoss(reduction=reduce)  # `reduce=` is the deprecated bool flag, not the reduction mode
+                m = nn.Sigmoid()
+                input = m(input).to(dtype=dtype, device=args.device)  # NOTE(review): rebinding re-applies sigmoid on each reduce iteration
+                target = target.to(dtype=dtype, device=args.device)
+                # warm up
+                _do_test(loss, input, target, dtype, args.device)
+
+                # go
+                print(
+                    "shape:",
+                    (M, N),
+                    "; datatype:",
+                    dtype,
+                    "; reduce:",
+                    reduce,
+                    "; backward:",
+                    backward,
                 )
-            torch.xpu.synchronize()
-            t2 = time.time()
-            e2e_time = (t2 - t1) / num_iter
-            print("E2E total time:", f"{float(e2e_time):.20f}")
+                if not args.e2e_only:
+                    run_profile(loss, input, target, dtype, backward, cache_r, cache_w, args.device, args.num_iter)
+
+                if not args.profile_only:
+                    run_e2e(loss, input, target, dtype, backward, cache_r, cache_w, args.device, args.num_iter)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='OP Benchmark')
+    parser.add_argument('--device', type=str, default='xpu', 
+                        help='Device to run on (e.g., "cpu", "cuda", "xpu")')
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--profile-only', action='store_true', 
+                       help='Only Run profile timing')
+    group.add_argument('--e2e-only', action='store_true', 
+                       help='Only Run E2E timing')
+    parser.add_argument('--num-iter', type=int, default=20, 
+                        help='Number of iterations')
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)
@@ -1,80 +1,92 @@
 import time
-
+import argparse
 import torch
 from torch.profiler import profile, ProfilerActivity
 
-device = "xpu"
-backward = True
-num_iter = 20
 # T,N,C,S
 shape_list = [(32, 32, 32, 16), (128, 128, 128, 128), (8, 8, 4, 8)]
+backward = True
 
 
-def _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, dtype):
-    log_probs_dpcpp = log_probs.to("xpu")
-    log_probs_dpcpp.requires_grad_(True)
-    targets_dpcpp = targets.to("xpu")
-    input_lengths_dpcpp = input_lengths.to("xpu")
-    target_lengths_dpcpp = target_lengths.to("xpu")
-
-    # warm up
+def _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward):
     loss_dpcpp = torch.nn.functional.ctc_loss(
-        log_probs_dpcpp, targets_dpcpp, input_lengths_dpcpp, target_lengths_dpcpp
+        log_probs, targets, input_lengths, target_lengths
     )
-    loss_dpcpp.backward()
+    if backward:
+        loss_dpcpp.backward()
 
-    # go
-    print(
-        "shape:",
-        (shape[0], shape[1], shape[2], shape[3]),
-        "; datatype:",
-        dtype,
-        "; backward:",
-        backward,
-    )
+def run_profile(log_probs, targets, input_lengths, target_lengths, backward, device, num_iter):
     with profile(
-        activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True
+        activities=[ProfilerActivity.CPU, 
+                  ProfilerActivity.XPU if device == 'xpu' else ProfilerActivity.CUDA],
+        record_shapes=True,
     ) as prof:
-        for i in range(num_iter):
-            loss_dpcpp = torch.nn.functional.ctc_loss(
-                log_probs_dpcpp,
-                targets_dpcpp,
-                input_lengths_dpcpp,
-                target_lengths_dpcpp,
-            )
-            loss_dpcpp.backward()
-    print(prof.key_averages().table(sort_by="xpu_time_total"))
+        for _ in range(num_iter):
+            _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward)
+    print(prof.key_averages().table(sort_by="{}_time_total".format(device)))
 
-    # E2E time
-    torch.xpu.synchronize()
+def run_e2e(log_probs, targets, input_lengths, target_lengths, backward, device, num_iter):
+    if device in ['xpu', 'cuda']:
+        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t1 = time.time()
-    for i in range(num_iter):
-        loss_dpcpp = torch.nn.functional.ctc_loss(
-            log_probs_dpcpp,
-            targets_dpcpp,
-            input_lengths_dpcpp,
-            target_lengths_dpcpp,
-        )
-        loss_dpcpp.backward()
-    torch.xpu.synchronize()
+    for _ in range(num_iter):
+        _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward)
+    if device in ['xpu', 'cuda']:
+        torch.xpu.synchronize() if device == 'xpu' else torch.cuda.synchronize()
     t2 = time.time()
     e2e_time = (t2 - t1) / num_iter
     print("E2E total time:", f"{float(e2e_time):.20f}")
 
+def benchmark(args):  # run the CTC-loss timings for every (T, N, C, S) entry in shape_list
+    for shape in shape_list:
+        for dtype in [torch.float32]:
+            T, N, C, S = shape[0], shape[1], shape[2], shape[3]
+            g_cpu = torch.Generator()  # NOTE(review): no longer passed to any randint call below
+            g_cpu.manual_seed(15)
+            torch.manual_seed(15)
+            log_probs = (
+                torch.randn(T, N, C, dtype=dtype, device=args.device).log_softmax(2).detach().requires_grad_()
+            )
+            targets = torch.randint(1, C, (N, S), dtype=torch.long, device=args.device)  # class indices in [1, C); 0 is the default blank
+            input_lengths = torch.full((N,), T, dtype=torch.long, device=args.device)
+            target_lengths = torch.randint(1, S, (N,), dtype=torch.long, device=args.device)
+
+            if backward:
+                log_probs.requires_grad_(True)
+
+            # warm up
+            _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, backward)
+            # go
+            print(
+                "shape:",
+                (shape[0], shape[1], shape[2], shape[3]),
+                "; datatype:",
+                dtype,
+                "; backward:",
+                backward,
+            )
+            if not args.e2e_only:
+                run_profile(log_probs, targets, input_lengths, target_lengths, backward, args.device, args.num_iter)
+
+            if not args.profile_only:
+                run_e2e(log_probs, targets, input_lengths, target_lengths, backward, args.device, args.num_iter)
+            g_cpu = torch.Generator()
+            g_cpu.manual_seed(15)
+            torch.manual_seed(15)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='OP Benchmark')
+    parser.add_argument('--device', type=str, default='xpu', 
+                        help='Device to run on (e.g., "cpu", "cuda", "xpu")')
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--profile-only', action='store_true', 
+                       help='Only Run profile timing')
+    group.add_argument('--e2e-only', action='store_true', 
+                       help='Only Run E2E timing')
+    parser.add_argument('--num-iter', type=int, default=20, 
+                        help='Number of iterations')
+    return parser.parse_args()
 
-for shape in shape_list:
-    for dtype in [torch.float32]:
-        T, N, C, S = shape[0], shape[1], shape[2], shape[3]
-        g_cpu = torch.Generator()
-        g_cpu.manual_seed(15)
-        torch.manual_seed(15)
-        log_probs = (
-            torch.randn(T, N, C, dtype=dtype).log_softmax(2).detach().requires_grad_()
-        )
-        targets = torch.randint(1, N, (N, S), dtype=torch.long, generator=g_cpu)
-        input_lengths = torch.full((N,), T, dtype=torch.long)
-        target_lengths = torch.randint(1, S, (N,), dtype=torch.long, generator=g_cpu)
-        _test_loss_ctc(log_probs, targets, input_lengths, target_lengths, dtype)
-        g_cpu = torch.Generator()
-        g_cpu.manual_seed(15)
-        torch.manual_seed(15)
+if __name__ == "__main__":
+    args = parse_args()
+    benchmark(args)