Initial commit

bertmaher · bertmaher · commit a91908adb488 · 2025-06-27T10:53:50.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/BackendBench/backends.py b/BackendBench/backends.py
diff --git a/BackendBench/eval.py b/BackendBench/eval.py
@@ -0,0 +1,59 @@
+import logging
+
+import torch
+from triton.testing import do_bench
+
+logger = logging.getLogger(__name__)
+
+
+def allclose(a, b):
+    """Recursively compare a reference value ``a`` with a result ``b``.
+
+    Tensors are checked with ``torch.testing.assert_close`` (NaNs compare
+    equal), lists and tuples are compared element by element, and any other
+    value falls back to ``a == b``.
+
+    Returns:
+        ``True`` when the values match. For non-tensor values a mismatch
+        produces a falsy result; sequences of different length never match.
+
+    Raises:
+        AssertionError: raised by ``torch.testing.assert_close`` when ``a`` is
+            a tensor and ``b`` is not close to it. Any other exception raised
+            by ``assert_close`` (or by ``len(b)``) propagates as well.
+    """
+    if isinstance(a, torch.Tensor):
+        # assert_close reports a mismatch by raising rather than returning
+        # False, so a tensor mismatch surfaces as an exception.
+        torch.testing.assert_close(a, b, equal_nan=True)
+        return True
+    if isinstance(a, (list, tuple)):
+        # zip() stops at the shorter input, so without this check a result
+        # with missing or extra elements would be reported as matching.
+        if len(a) != len(b):
+            return False
+        return all(allclose(x, y) for x, y in zip(a, b))
+    return a == b
+
+
+# Debug-log template for a failed evaluation; filled in with
+# str.format(op=..., args=..., kwargs=..., exc=...).
+EXC_MSG = """
+Exception raised for {op}:
+    args: {args}
+    kwargs: {kwargs}
+    exc: {exc}
+"""
+
+
+def eval_correctness_test(op, impl, test):
+    """Check one test case: does ``impl`` reproduce the output of ``op``?
+
+    The test's arguments are read once and passed to both callables. The
+    reference call to ``op`` is not guarded; any exception raised while
+    running ``impl`` or comparing the outputs is logged at debug level and
+    counted as a failure.
+
+    Returns:
+        The result of ``allclose(reference, result)``, or ``False`` when an
+        exception was raised.
+    """
+    args, kwargs = test.args, test.kwargs
+    reference = op(*args, **kwargs)
+    try:
+        outcome = allclose(reference, impl(*args, **kwargs))
+    except Exception as exc:
+        logger.debug(EXC_MSG.format(op=op, args=args, kwargs=kwargs, exc=exc))
+        outcome = False
+    return outcome
+
+
+def eval_correctness(op, impl, tests):
+    """Return the fraction of ``tests`` for which ``impl`` matches ``op``.
+
+    Args:
+        op: reference callable.
+        impl: implementation under test.
+        tests: iterable of test cases (consumed once, so a generator is fine).
+
+    Returns:
+        A float in ``[0, 1]``. An empty ``tests`` iterable yields ``0.0``
+        instead of raising ``ZeroDivisionError``.
+    """
+    correct, total = 0, 0
+    for test in tests:
+        if eval_correctness_test(op, impl, test):
+            correct += 1
+        total += 1
+    if total == 0:
+        # Nothing was exercised, so nothing can be claimed as passing.
+        return 0.0
+    return correct / total
+
+
+def eval_performance(op, impl, tests):
+    """Return the geometric-mean speedup of ``impl`` over ``op``.
+
+    Each test is timed with ``do_bench`` for both callables, using the same
+    inputs. The per-test speedup is ``base_time / impl_time``, so values
+    above 1 mean ``impl`` is faster than ``op``.
+
+    Args:
+        op: reference callable (the baseline).
+        impl: implementation under test.
+        tests: iterable of test cases exposing ``args`` and ``kwargs``.
+
+    Returns:
+        A 0-dim tensor holding the geometric mean of the speedups. With no
+        tests the mean of an empty tensor is taken, so the result is NaN.
+    """
+    base_times = []
+    test_times = []
+    for test in tests:
+        # Read the inputs once, outside the timed region: ``args``/``kwargs``
+        # may be properties that build fresh tensors on every access, which
+        # would otherwise be measured as part of the op.
+        args, kwargs = test.args, test.kwargs
+        base_times.append(do_bench(lambda: op(*args, **kwargs)))
+        test_times.append(do_bench(lambda: impl(*args, **kwargs)))
+    # Speedup is baseline time over implementation time (not the inverse).
+    speedups = torch.tensor(base_times) / torch.tensor(test_times)
+    # geometric mean of speedups
+    return speedups.log().mean().exp()
+
+
+def eval_one_op(op, impl, correctness_tests, performance_tests):
+    """Score ``impl`` against ``op``.
+
+    Returns:
+        A ``(correctness, performance)`` tuple: the result of
+        ``eval_correctness`` on ``correctness_tests`` followed by the result
+        of ``eval_performance`` on ``performance_tests``.
+    """
+    correctness = eval_correctness(op, impl, correctness_tests)
+    performance = eval_performance(op, impl, performance_tests)
+    return correctness, performance
diff --git a/BackendBench/opinfo_suite.py b/BackendBench/opinfo_suite.py
@@ -0,0 +1,88 @@
+import logging
+from collections import defaultdict
+
+import torch
+from torch.testing._internal.common_methods_invocations import op_db
+from torch.utils._python_dispatch import TorchDispatchMode
+
+from .eval import allclose
+from .suite import OpTest, Test, TestSuite
+
+logger = logging.getLogger(__name__)
+
+
+class OpInfoTest:
+    """Arguments for one call of an op, exposed as ``args`` and ``kwargs``."""
+
+    def __init__(self, *args, **kwargs):
+        # Stored as received: ``args`` is a tuple, ``kwargs`` a dict.
+        self.args, self.kwargs = args, kwargs
+
+
+class OpInfoOpTest(OpTest):
+    """Correctness tests for one op, taken from selected sample inputs.
+
+    ``correctness_tests`` is exposed as a property that filters the stored
+    samples by position; ``performance_tests`` is always an empty list.
+    """
+
+    def __init__(self, op, correctness_tests, indices):
+        # OpTest.__init__ is not invoked here: it assigns
+        # ``correctness_tests``, which is a read-only property on this class.
+        self.performance_tests = []
+        self.indices = set(indices)
+        self._correctness_tests = correctness_tests
+        self.op = op
+
+    @property
+    def correctness_tests(self):
+        """Yield an ``OpInfoTest`` for every sample whose position is selected.
+
+        Each sample's ``input`` becomes the first positional argument,
+        followed by its ``args`` and ``kwargs``.
+        """
+        for position, sample in enumerate(self._correctness_tests):
+            if position not in self.indices:
+                continue
+            yield OpInfoTest(sample.input, *sample.args, **sample.kwargs)
+
+
+class OpTracerMode(TorchDispatchMode):
+    """Dispatch mode that records every call passing through it.
+
+    For each dispatched call the callable, its positional arguments and its
+    keyword arguments are appended to ``ops``, ``args`` and ``kwargs``
+    respectively (same index in all three lists), and the call is then
+    executed unchanged.
+    """
+
+    def __init__(self):
+        # Let the base class set up whatever state it keeps for the mode.
+        super().__init__()
+        self.ops = []
+        self.args = []
+        self.kwargs = []
+
+    def __torch_dispatch__(self, fn, types, args=(), kwargs=None):
+        """Record the call and forward it to ``fn``; returns ``fn``'s result."""
+        # A ``{}`` default would be a single dict shared by every call that
+        # omits kwargs, and that same object would be stored in the record.
+        if kwargs is None:
+            kwargs = {}
+        self.ops.append(fn)
+        self.args.append(args)
+        self.kwargs.append(kwargs)
+        return fn(*args, **kwargs)
+
+
+def build_op_tests(device, dtype, filter=None):
+    """Build ``OpInfoOpTest`` objects from the entries of ``op_db``.
+
+    Every sample input of each eligible entry is run under ``OpTracerMode``.
+    A sample is kept only if exactly one op was dispatched and calling that
+    op directly with the same arguments reproduces the entry's result.
+    Kept samples are grouped by the dispatched op.
+
+    Args:
+        device: passed to ``supported_dtypes`` and ``sample_inputs``.
+        dtype: dtype an entry must support; also passed to ``sample_inputs``.
+        filter: optional container of entry names; when truthy, entries whose
+            name is not in it are skipped.
+
+    Returns:
+        A list with one ``OpInfoOpTest`` per (entry, dispatched op) pair that
+        has at least one kept sample.
+    """
+    suite_tests = []
+    for op in op_db:
+        # Evaluated in order with short-circuiting: name filter, dotted names
+        # other than nn.functional, dtype support, explicit exclusions.
+        skip = (
+            (filter and op.name not in filter)
+            or ("." in op.name and "nn.functional" not in op.name)
+            or dtype not in op.supported_dtypes(device)
+            or op.name in ["nonzero_static"]
+        )
+        if skip:
+            continue
+
+        indices_by_overload = defaultdict(list)
+        for idx, sample in enumerate(op.sample_inputs(device, dtype)):
+            with OpTracerMode() as tracer:
+                ref = op.op(sample.input, *sample.args, **sample.kwargs)
+
+            if len(tracer.ops) != 1:
+                logger.debug(f"opinfo {op.name} has {len(tracer.ops)} ops")
+                continue
+
+            try:
+                # Re-run the single recorded op directly and keep the sample
+                # only when it matches the reference result.
+                res = tracer.ops[0](sample.input, *sample.args, **sample.kwargs)
+                if allclose(ref, res):
+                    indices_by_overload[tracer.ops[0]].append(idx)
+            except Exception:
+                logger.debug(
+                    f"opinfo {op.name} couldn't run underlying op {tracer.ops[0]}"
+                )
+
+        # Sample inputs are requested again for each test object.
+        suite_tests.extend(
+            OpInfoOpTest(overload, op.sample_inputs(device, dtype), indices)
+            for overload, indices in indices_by_overload.items()
+            if len(indices) > 0
+        )
+
+    return suite_tests
+
+
+class OpInfoTestSuite(TestSuite):
+    """Test suite whose op tests come from ``build_op_tests``."""
+
+    def __init__(self, name, device, dtype, filter=None):
+        optests = build_op_tests(device, dtype, filter)
+        super().__init__(name, optests)
diff --git a/BackendBench/suite.py b/BackendBench/suite.py
@@ -0,0 +1,52 @@
+import torch
+
+
+def randn(*args, **kwargs):
+    """Return a zero-argument factory calling ``torch.randn(*args, **kwargs)``.
+
+    No tensor is created until the returned callable is invoked, and each
+    invocation creates a new tensor.
+    """
+
+    def make():
+        return torch.randn(*args, **kwargs)
+
+    return make
+
+
+class Test:
+    """A test case whose arguments are produced on demand.
+
+    Every positional and keyword value passed to the constructor must be a
+    zero-argument callable; it is invoked each time ``args`` or ``kwargs`` is
+    read.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self._args, self._kwargs = args, kwargs
+
+    @property
+    def args(self):
+        """Return a new list with the result of calling each positional factory."""
+        built = []
+        for make in self._args:
+            built.append(make())
+        return built
+
+    @property
+    def kwargs(self):
+        """Return a new dict mapping each keyword to its factory's result."""
+        return dict((key, make()) for key, make in self._kwargs.items())
+
+
+class OpTest:
+    """An op bundled with its correctness tests and performance tests."""
+
+    def __init__(self, op, correctness_tests, performance_tests):
+        self.op = op
+        self.correctness_tests, self.performance_tests = (
+            correctness_tests,
+            performance_tests,
+        )
+
+
+class TestSuite:
+    """A named collection of op tests; iterating yields each one in order."""
+
+    def __init__(self, name, optests):
+        self.optests = optests
+        self.name = name
+
+    def __iter__(self):
+        yield from self.optests
+
+
+# Minimal suite with a single op (aten relu): one 2-element input for
+# correctness and one 2**28-element input for performance. The inputs are
+# wrapped by randn(), so no tensor is created when this module is imported;
+# they are only built when a Test's args are read.
+SmokeTestSuite = TestSuite(
+    "smoke",
+    [
+        OpTest(
+            torch.ops.aten.relu.default,
+            [
+                Test(randn(2, device="cuda")),
+            ],
+            [
+                Test(randn(2**28, device="cuda")),
+            ],
+        )
+    ],
+)
diff --git a/README.md b/README.md
@@ -0,0 +1,26 @@
+# Usage:
+
+Run a simple smoke test (relu) with the default ATen backend:
+```bash
+python scripts/main.py --suite smoke --backend aten
+```
+
+Run the smoke test with FlagGems:
+```bash
+python scripts/main.py --suite smoke --backend flag_gems
+```
+
+Run opinfo tests (correctness only) with ATen:
+```bash
+python scripts/main.py --suite opinfo --backend aten
+```
+
+Run a filtered set of opinfo tests with FlagGems:
+```bash
+python scripts/main.py --suite opinfo --backend flag_gems --ops "add,sub"
+```
+
+Run all the opinfo tests with FlagGems (takes a few minutes):
+```bash
+python scripts/main.py --suite opinfo --backend flag_gems
+```
diff --git a/scripts/main.py b/scripts/main.py
@@ -0,0 +1,81 @@
+import logging
+import sys
+
+import BackendBench.backends as backends
+import BackendBench.eval as eval
+import click
+import torch
+from BackendBench.opinfo_suite import OpInfoTestSuite
+from BackendBench.suite import SmokeTestSuite
+
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.option(
+    "--suite",
+    default="smoke",
+    type=click.Choice(["smoke", "opinfo"]),
+    help="Which suite to run",
+)
+@click.option(
+    "--backend",
+    default="aten",
+    type=click.Choice(["aten", "flag_gems"]),
+    help="Which backend to run",
+)
+@click.option(
+    "--ops",
+    default=None,
+    type=str,
+    help="Comma-separated list of ops to run",
+)
+def cli(suite, backend, ops):
+    """Run a test suite against a backend and print aggregate scores."""
+    if ops:
+        # Accept input such as "add, sub": strip surrounding whitespace and
+        # ignore empty entries so every name can match an op name exactly.
+        # If nothing remains, the list is empty and no filter is applied.
+        ops = [name.strip() for name in ops.split(",") if name.strip()]
+
+    # Replace the option strings with the objects they select.
+    backend = {
+        "aten": backends.AtenBackend,
+        "flag_gems": backends.FlagGemsBackend,
+    }[backend]()
+
+    # Suites are wrapped in lambdas so only the selected one is built.
+    suite = {
+        "smoke": lambda: SmokeTestSuite,
+        "opinfo": lambda: OpInfoTestSuite(
+            "opinfo_cuda_bfloat16",
+            "cuda",
+            torch.bfloat16,
+            filter=ops,
+        ),
+    }[suite]()
+
+    overall_correctness = []
+    overall_performance = []
+
+    for test in suite:
+        # Only ops contained in the backend are evaluated.
+        if test.op not in backend:
+            continue
+
+        logger.debug(test.op)
+
+        correctness, perf = eval.eval_one_op(
+            test.op,
+            backend[test.op],
+            test.correctness_tests,
+            test.performance_tests,
+        )
+        overall_correctness.append(correctness)
+        overall_performance.append(perf)
+
+        logger.debug(f"max memory allocated: {torch.cuda.max_memory_allocated():,}")
+
+    # Arithmetic mean of per-op pass rates; geometric mean of per-op scores.
+    mean_correctness = torch.tensor(overall_correctness).mean().item()
+    geomean_perf = torch.tensor(overall_performance).log().mean().exp().item()
+    print(
+        f"correctness score (mean pass rate over all operators): {mean_correctness:.2f}"
+    )
+    print(f"performance score (geomean speedup over all operators): {geomean_perf:.2f}")
+
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    cli()