@@ -1,3 +1,4 @@
+import sys
 import time
 import torch
 import inspect
@@ -38,6 +39,12 @@ def medium_transpose():
     return (rand(32, 12, 64, 64).transpose(-1, -2),
             rand(32, 12, 64, 64).transpose(-1, -2))

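+# Contiguous variants: a 4-D image-batch shape and a smaller 3-D shape.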
+def medium2():
+    return (rand(32, 3, 224, 224), rand(32, 3, 224, 224))
+
+def medium3d():
+    return (rand(16, 32, 64), rand(16, 32, 64))
+
 def medium_channels_last():
     return (rand(32, 3, 224, 224).to(memory_format=torch.channels_last),
             rand(32, 3, 224, 224).to(memory_format=torch.channels_last))
@@ -56,6 +63,10 @@ def large_transpose():
     return (rand(8192, 8192).transpose(0, 1),
             rand(8192, 8192).transpose(0, 1))

+def large_channels_last():
+    return (rand(32, 32, 256, 256).to(memory_format=torch.channels_last),
+            rand(32, 32, 256, 256).to(memory_format=torch.channels_last))
+
 def pathological_broadcast():
     return (rand(1, 32, 32, 2), rand(1024, 1, 1, 2))

@@ -89,14 +100,14 @@ def log(a):
 def exp(a):
     return a.exp()

-def pow(a):
+def square(a):
     return a ** 2

 def fma(a, b):
     return a * b + b

 def hardswish(a):
-    return a * (a + 3).clamp(0, 6) / 6
+    return a * (a + 3.0).clamp(0.0, 6.0) / 6.0

 def native_hardswish(a):
     return torch._C._nn.hardswish(a)
@@ -107,19 +118,55 @@ def softplus(a):
 def mish(a):
     return a * ((a * 1.0).exp().log1p() / 1.0).tanh()

+# ------------------------------------------------------------------------------
+# Helpers
+# ------------------------------------------------------------------------------
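+# Time iters calls to fn(*args) with a wall-clock timer; sufficient on CPU,
+# where ops execute synchronously.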
+def time_cpu(fn, args, iters):
+    s = time.perf_counter()
+    for _ in range(iters):
+        fn(*args)
+    e = time.perf_counter()
+    return e - s
+
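+# CUDA kernels launch asynchronously, so bracket the loop with CUDA events and
+# synchronize before reading the elapsed time (reported in ms, hence / 1e3).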
+def time_cuda(fn, args, iters):
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        fn(*args)
+    end.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(end) / 1e3
+
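+# Warm up for 3 iterations, time a single call to pick an iteration count that
+# makes the measured run last about one second, then report seconds per call.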
+def benchmark_with_timer(fn, args, timer):
+    timer(fn, args, 3)
+    calibration = timer(fn, args, 1)
+    iters = int(1.0 / calibration)
+    return timer(fn, args, iters) / iters
+
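+# Choose the timer based on the device of the first argument.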
+def benchmark(fn, args):
+    timer = time_cpu if args[0].device.type == "cpu" else time_cuda
+    return benchmark_with_timer(fn, args, timer)
+
+def micros(s):
+    return f"{s * 1e6:.1f}"
+
 shapes = [
     scalar,
     small,
     small_2d,
     small_broadcast,
     medium,
+    medium2,
+    medium3d,
     medium_sliced,
     medium_transpose,
     medium_channels_last,
     medium_broadcast,
     medium_broadcast_channels_last,
     large,
     large_transpose,
+    large_channels_last,
     pathological_broadcast,
 ]

@@ -133,20 +180,16 @@ def mish(a):
     tanh,
     log,
     exp,
-    pow,
+    square,
     fma,
     hardswish,
     native_hardswish,
 ]
-#shapes = [large_transpose]
-#operators = [add]
-#shapes = [scalar]
-#operators = [add]
+
 nope = set()
 for shape, operator in itertools.product(shapes, operators):
     nargs = len(inspect.signature(operator).parameters)
     args = shape()[:nargs]
-    #print(f"{operator.__name__} {shape.__name__}")

     try:
         if shape == medium_transpose:
@@ -160,41 +203,13 @@ def mish(a):
     ts_op = torch.jit.script(operator)
     torch.testing.assert_allclose(operator(*args), ts_op(*args))

-def time_cpu(fn, args, iters):
-    s = time.perf_counter()
-    for _ in range(iters):
-        fn(*args)
-    e = time.perf_counter()
-    return e - s
-
-def time_cuda(fn, args, iters):
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-    start.record()
-    for _ in range(iters):
-        fn(*args)
-    end.record()
-    torch.cuda.synchronize()
-    return start.elapsed_time(end) / 1e3
-
-def benchmark_with_timer(fn, args, timer):
-    timer(fn, args, 3)
-    calibration = timer(fn, args, 1)
-    iters = int(1.0 / calibration)
-    return timer(fn, args, iters) / iters
-
-def benchmark(fn, args):
-    timer = time_cpu if args[0].device.type == "cpu" else time_cuda
-    return benchmark_with_timer(fn, args, timer)
-
-def micros(s):
-    return f"{s * 1e6:.1f}"

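+# CSV header: one row per (fuser, device, operator, shape), time in microseconds.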
+print("fuser,device,operator,shape,time")
 results = []
 for shape, operator in itertools.product(shapes, operators):
     nargs = len(inspect.signature(operator).parameters)
     args = shape()[:nargs]
-
+
     result = benchmark(operator, args)
     print(",".join(["eager", args[0].device.type, operator.__name__, shape.__name__, micros(result)]))
     try:
@@ -206,18 +221,9 @@ def micros(s):
         result = benchmark(pw_op, args)
         print(",".join(["pointwise", args[0].device.type, operator.__name__, shape.__name__, micros(result)]))
     except Exception:
-        #print(f"pointwise_operator failed on {operator.__name__}, {shape.__name__}")
-        #nope.add((operator, shape))
         print(",".join(["pointwise", args[0].device.type, operator.__name__, shape.__name__, micros(float("nan"))]))

     ts_op = torch.jit.script(operator)
     result = benchmark(ts_op, args)
     print(",".join(["fuser", args[0].device.type, operator.__name__, shape.__name__, micros(result)]))
-
-    # cpu
-    # parallel cpu
-    # cuda
-
-    # casts
-
-    # inplace?
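+    # Flush after each configuration so rows stream out promptly when piped.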
+    sys.stdout.flush()