Add proper test harness (#32)

msaroufim · web-flow · commit 86d96b1e9f08 · 2025-07-22T10:05:02.000-04:00
diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
@@ -14,18 +14,11 @@ jobs:
       with:
         python-version: '3.x'
     
-    - name: Cache pip dependencies
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-pip-
-    
     - name: Install dependencies
       run: |
         pip install -r requirements.txt
+        pip install -r requirements-dev.txt
     
     - name: Run smoke test
       run: |
-        PYTHONPATH=. python scripts/main.py --suite smoke --backend aten 
+        PYTHONPATH=. pytest test/
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,6 @@ __pycache__/
 .claude/
 .vscode/
 .ruff_cache/
-generated_kernels/
+generated_kernels/
+venv/
+CLAUDE.md
diff --git a/BackendBench/eval.py b/BackendBench/eval.py
@@ -28,14 +28,17 @@ def format_kwargs(kwargs):
 
 
 def format_exception(e, op, args, kwargs):
-    return EXC_MSG.format(op=op, args=format_args(args), kwargs=format_kwargs(kwargs), exc=e)
+    op_name = getattr(op, "__name__", str(op))
+    return EXC_MSG.format(op=op_name, args=format_args(args), kwargs=format_kwargs(kwargs), exc=e)
 
 
 def allclose(a, b):
     if isinstance(a, torch.Tensor):
         torch.testing.assert_close(a, b, equal_nan=True, atol=1e-2, rtol=1e-2)
         return True
     if isinstance(a, (list, tuple)):
+        if len(a) != len(b):
+            raise ValueError(f"Length mismatch: {len(a)} vs {len(b)}")
         return all(allclose(x, y) for x, y in zip(a, b))
     return a == b
 
@@ -92,7 +95,7 @@ def eval_performance(op, impl, tests):
             test_times.append(base_times[-1])
             continue
         test_times.append(bench_fn(lambda: impl(*test.args, **test.kwargs)))
-    speedups = torch.tensor(test_times) / torch.tensor(base_times)
+    speedups = torch.tensor(base_times) / torch.tensor(test_times)
     return speedups.log().mean().exp()
 
 
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,43 @@
+[pytest]
+# Pytest configuration for BackendBench
+
+# Test discovery patterns
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+
+# Test directories
+testpaths = test
+
+# Output options
+addopts = 
+    -v
+    --tb=short
+    --strict-markers
+    --disable-warnings
+    -p no:warnings
+
+# Markers for categorizing tests
+markers =
+    smoke: Basic smoke tests that should always pass
+    unit: Unit tests for individual components
+    integration: Integration tests that test multiple components
+    slow: Tests that take a long time to run
+    requires_cuda: Tests that require CUDA/GPU
+    requires_api_key: Tests that require API keys (e.g., for LLM backends)
+
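+# Coverage settings. NOTE: coverage.py does not read pytest.ini by default; run pytest with --cov-config=pytest.ini (pytest-cov) or move these sections to setup.cfg or tox.ini for them to take effect.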
+[coverage:run]
+source = BackendBench
+omit = 
+    */test/*
+    */tests/*
+    setup.py
+
+[coverage:report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    raise AssertionError
+    raise NotImplementedError
+    if __name__ == .__main__.:
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,5 @@
+pytest
+pytest-cov
+pytest-mock
+pytest-timeout
+ruff==0.12.1
diff --git a/test/test_backends.py b/test/test_backends.py
@@ -0,0 +1,215 @@
+import pytest
+import torch
+from unittest.mock import Mock, patch
+from BackendBench.backends import AtenBackend, FlagGemsBackend, LLMBackend, KernelAgentBackend
+
+try:
+    import importlib.util
+
+    HAS_FLAG_GEMS = importlib.util.find_spec("flag_gems") is not None
+except ImportError:
+    HAS_FLAG_GEMS = False
+
+try:
+    import sys
+    import os
+    import importlib.util
+
+    kernel_agent_path = os.path.join(os.path.dirname(__file__), "..", "KernelAgent")
+    sys.path.insert(0, os.path.abspath(kernel_agent_path))
+    HAS_KERNEL_AGENT = importlib.util.find_spec("triton_kernel_agent") is not None
+except ImportError:
+    HAS_KERNEL_AGENT = False
+
+
+class TestAtenBackend:
+    def test_aten_backend_initialization(self):
+        backend = AtenBackend()
+        assert backend.name == "aten"
+
+    def test_aten_backend_contains_op(self):
+        backend = AtenBackend()
+
+        assert torch.ops.aten.relu.default in backend
+        assert torch.ops.aten.add.Tensor in backend
+
+        fake_op = Mock()
+        fake_op.__module__ = "fake_module"
+        assert fake_op in backend  # AtenBackend contains everything
+
+    def test_aten_backend_getitem(self):
+        backend = AtenBackend()
+
+        relu_op = torch.ops.aten.relu.default
+        assert backend[relu_op] == relu_op
+
+        fake_op = Mock()
+        fake_op.__module__ = "fake_module"
+        assert backend[fake_op] == fake_op  # AtenBackend returns the op itself
+
+
+class TestFlagGemsBackend:
+    @pytest.mark.skipif(not HAS_FLAG_GEMS, reason="flag_gems not available")
+    @patch("BackendBench.backends.flag_gems")
+    def test_flag_gems_backend_initialization(self, mock_flag_gems):
+        backend = FlagGemsBackend()
+        assert backend.name == "flaggems"
+        assert isinstance(backend.ops, dict)
+
+    @pytest.mark.skipif(not HAS_FLAG_GEMS, reason="flag_gems not available")
+    @patch("BackendBench.backends.flag_gems")
+    def test_flag_gems_backend_contains_op(self, mock_flag_gems):
+        mock_flag_gems.abs = Mock()
+
+        backend = FlagGemsBackend()
+
+        assert torch.ops.aten.abs.default in backend
+
+        fake_op = Mock()
+        fake_op.__str__ = Mock(return_value="fake_op")
+        assert fake_op not in backend
+
+    @pytest.mark.skipif(not HAS_FLAG_GEMS, reason="flag_gems not available")
+    @patch("BackendBench.backends.flag_gems")
+    def test_flag_gems_backend_getitem(self, mock_flag_gems):
+        mock_abs_impl = Mock()
+        mock_flag_gems.abs = mock_abs_impl
+
+        backend = FlagGemsBackend()
+
+        assert backend[torch.ops.aten.abs.default] == mock_abs_impl
+
+        fake_op = Mock()
+        fake_op.__str__ = Mock(return_value="fake_op")
+        with pytest.raises(KeyError):
+            _ = backend[fake_op]
+
+
+class TestLLMBackend:
+    def test_llm_backend_initialization(self):
+        with (
+            patch("os.makedirs"),
+            patch("builtins.open"),
+            patch("datetime.datetime") as mock_datetime,
+        ):
+            mock_datetime.now.return_value.strftime.return_value = "20250721_204542"
+            backend = LLMBackend()
+            assert backend.name == "llm"
+            assert "generated_kernels/run_" in backend.kernels_dir
+            assert isinstance(backend.compiled_kernels, dict)
+
+    @pytest.mark.skip(
+        reason="Complex file I/O mocking needed - test requires full file system interaction"
+    )
+    def test_llm_backend_add_kernel(self):
+        with (
+            patch("os.makedirs"),
+            patch("builtins.open"),
+            patch("datetime.datetime") as mock_datetime,
+        ):
+            mock_datetime.now.return_value.strftime.return_value = "20250721_204542"
+            backend = LLMBackend()
+
+            mock_op = Mock()
+            mock_op.__name__ = "test_op"
+
+            kernel_code = """
+def test_kernel(x):
+    return x + 1
+"""
+
+            with patch("builtins.open", create=True) as mock_open:
+                backend.add_kernel(mock_op, kernel_code, "test_op")
+
+            mock_open.assert_called()
+
+            assert mock_op in backend
+
+    @pytest.mark.skip(
+        reason="Complex file I/O mocking needed - test requires full file system interaction"
+    )
+    def test_llm_backend_test_kernel_correctness(self):
+        with (
+            patch("os.makedirs"),
+            patch("builtins.open"),
+            patch("datetime.datetime") as mock_datetime,
+        ):
+            mock_datetime.now.return_value.strftime.return_value = "20250721_204542"
+            backend = LLMBackend()
+
+            mock_op = Mock(return_value=torch.tensor([2.0]))
+
+            kernel_code = """
+def generated_kernel(x):
+    return x + 1
+"""
+
+            mock_test = Mock()
+            mock_test.args = [torch.tensor([1.0])]
+            mock_test.kwargs = {}
+
+            with patch("builtins.open", create=True):
+                is_correct, feedback = backend.test_kernel_correctness(
+                    mock_op, kernel_code, [mock_test], attempt=1
+                )
+
+            assert is_correct is True
+
+
+class TestKernelAgentBackend:
+    @pytest.mark.skipif(not HAS_KERNEL_AGENT, reason="KernelAgent not available")
+    def test_kernel_agent_backend_initialization(self):
+        with patch("os.makedirs"):
+            backend = KernelAgentBackend()
+            assert backend.name == "kernel_agent"
+            assert "kernel_agent_run_" in backend.kernels_dir
+            assert backend.num_workers == 4  # default value
+            assert backend.max_rounds == 10  # default value
+
+    @pytest.mark.skipif(not HAS_KERNEL_AGENT, reason="KernelAgent not available")
+    def test_kernel_agent_backend_set_config(self):
+        with patch("os.makedirs"):
+            backend = KernelAgentBackend()
+
+            backend.set_config(num_workers=8, max_rounds=20)
+
+            assert backend.num_workers == 8
+            assert backend.max_rounds == 20
+
+    @pytest.mark.skipif(not HAS_KERNEL_AGENT, reason="KernelAgent not available")
+    def test_kernel_agent_backend_generate_kernel(self):
+        with (
+            patch("os.makedirs"),
+            patch("triton_kernel_agent.TritonKernelAgent") as mock_kernel_agent_class,
+        ):
+            backend = KernelAgentBackend()
+
+            mock_agent = Mock()
+            mock_kernel_agent_class.return_value = mock_agent
+
+            mock_agent.generate_kernel.return_value = (True, "def kernel(): pass")
+
+            mock_op = Mock()
+            mock_op.__str__ = Mock(return_value="test_op")
+            with patch("builtins.open", create=True):
+                kernel_code, success = backend.generate_kernel_with_agent(mock_op, "test_op")
+            assert success is True
+            assert kernel_code == "def kernel(): pass"
+            mock_kernel_agent_class.assert_called_once()
+
+
+class TestBackendIntegration:
+    @pytest.mark.skipif(not HAS_FLAG_GEMS, reason="flag_gems not available")
+    def test_backend_polymorphism(self):
+        backends = []
+        backends.append(AtenBackend())
+        with patch("BackendBench.backends.flag_gems"):
+            backends.append(FlagGemsBackend())
+        with patch("os.makedirs"):
+            backends.append(LLMBackend())
+            backends.append(KernelAgentBackend())
+        for backend in backends:
+            assert hasattr(backend, "name")
+            assert hasattr(backend, "__contains__")
+            assert hasattr(backend, "__getitem__")
+            assert isinstance(backend.name, str)
diff --git a/test/test_eval.py b/test/test_eval.py
diff --git a/test/test_smoke.py b/test/test_smoke.py
diff --git a/test/test_suite.py b/test/test_suite.py