diff --git a/.gitignore b/.gitignore
index 1592432..4e3f765 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@ __pycache__/
 .claude/
 .vscode/
 .ruff_cache/
-generated_kernels/
 backendbench.egg-info/
 CLAUDE.md
 venv/
@@ -10,3 +9,6 @@ ops/
 uv.lock
 pytorch_operator_coverage.csv
 .pre-commit-cache/
+generated_kernels/
+internal_operators.csv
+torchbench_operator_folder_mapping.csv
\ No newline at end of file
diff --git a/BackendBench/__init__.py b/BackendBench/__init__.py
index f59deee..cbac6f5 100644
--- a/BackendBench/__init__.py
+++ b/BackendBench/__init__.py
@@ -5,125 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-BackendBench: A PyTorch backend evaluation framework with monkey patching support.
-
-Import this module to automatically monkey patch PyTorch operations with custom backends.
+BackendBench: A PyTorch backend evaluation framework.
 """
 
-import os
-
-from .backends import AtenBackend, FlagGemsBackend
-
-
-class BackendRegistry:
-    """Registry for managing different PyTorch backends."""
-
-    def __init__(self):
-        self._current_backend = None
-        self._original_ops = {}
-        self._patched = False
-
-    def register_backend(self, backend_name: str, backend_instance=None):
-        """Register and activate a backend."""
-        if backend_instance is None:
-            backend_instance = self._create_backend(backend_name)
-
-        if self._patched:
-            self.unpatch()
-
-        self._current_backend = backend_instance
-        self._patch_torch_ops()
-
-    def _create_backend(self, backend_name: str):
-        """Create a backend instance."""
-        backends = {"aten": AtenBackend, "flag_gems": FlagGemsBackend}
-
-        if backend_name not in backends:
-            raise ValueError(f"Unknown backend: {backend_name}. Available: {list(backends.keys())}")
-
-        return backends[backend_name]()
-
-    def _patch_torch_ops(self):
-        """Monkey patch torch operations with current backend."""
-        if self._current_backend is None:
-            return
-
-        # Get all torch ops that the backend supports
-        if hasattr(self._current_backend, "ops"):
-            for torch_op, backend_impl in self._current_backend.ops.items():
-                if torch_op not in self._original_ops:
-                    self._original_ops[torch_op] = torch_op.default
-                torch_op.default = backend_impl
-
-        self._patched = True
-        print(
-            f"BackendBench: Monkey patched {len(self._original_ops)} operations with {self._current_backend.name} backend"
-        )
-
-    def unpatch(self):
-        """Restore original torch operations."""
-        if not self._patched:
-            return
-
-        for torch_op, original_impl in self._original_ops.items():
-            torch_op.default = original_impl
-
-        self._original_ops.clear()
-        self._patched = False
-        print("BackendBench: Restored original PyTorch operations")
-
-    def get_current_backend(self):
-        """Get the currently active backend."""
-        return self._current_backend
-
-    def is_patched(self):
-        """Check if operations are currently patched."""
-        return self._patched
-
-
-# Global registry instance
-_registry = BackendRegistry()
-
-
-def use_backend(backend_name: str, backend_instance=None):
-    """
-    Switch to a different backend.
-
-    Args:
-        backend_name: Name of the backend ('aten', 'flag_gems')
-        backend_instance: Optional pre-configured backend instance
-    """
-    _registry.register_backend(backend_name, backend_instance)
-
-
-def get_backend():
-    """Get the currently active backend."""
-    return _registry.get_current_backend()
-
-
-def restore_pytorch():
-    """Restore original PyTorch operations."""
-    _registry.unpatch()
-
-
-def is_patched():
-    """Check if BackendBench is currently patching operations."""
-    return _registry.is_patched()
-
-
-# Auto-configuration based on environment variables
-def _auto_configure():
-    """Auto-configure backend based on environment variables."""
-    backend_name = os.getenv("BACKENDBENCH_BACKEND", "aten")
-
-    try:
-        use_backend(backend_name)
-    except Exception as e:
-        print(f"Warning: Failed to initialize {backend_name} backend: {e}")
-        print("Falling back to aten backend")
-        use_backend("aten")
-
-
-# Auto-configure on import unless explicitly disabled
-if os.getenv("BACKENDBENCH_NO_AUTO_PATCH", "").lower() not in ("1", "true", "yes"):
-    _auto_configure()
+__version__ = "0.1.0"
diff --git a/BackendBench/backends/directory.py b/BackendBench/backends/directory.py
index 6da0956..ef70eb7 100644
--- a/BackendBench/backends/directory.py
+++ b/BackendBench/backends/directory.py
@@ -34,22 +34,28 @@ def _load_kernels(self):
             if not os.path.isdir(op_dir):
                 continue
 
-            impl_files = [f for f in os.listdir(op_dir) if f.endswith(".py")]
+            impl_files = [
+                f
+                for f in os.listdir(op_dir)
+                if f.endswith(".py") and f.startswith(f"{op_name}_implementation")
+            ]
             if not impl_files:
-                logger.warning(f"No Python files found in {op_dir}")
+                logger.debug(f"No implementation files found in {op_dir}")
                 continue
 
             # Use the first implementation file
-            impl_file = impl_files[0]
+            impl_file = sorted(impl_files)[0]  # Sort to ensure consistent selection
             impl_path = os.path.join(op_dir, impl_file)
 
             try:
                 # Load the implementation and map to PyTorch operation
                 kernel_func = self._load_kernel_from_file(impl_path, op_name)
-                pytorch_op = self._find_pytorch_op(op_name)
-                if pytorch_op:
-                    self.compiled_kernels[pytorch_op] = kernel_func
-                    logger.info(f"Loaded {op_name} from {impl_file}")
+                pytorch_ops = self._find_pytorch_ops(op_name)
+
+                if pytorch_ops:
+                    for pytorch_op in pytorch_ops:
+                        self.compiled_kernels[pytorch_op] = kernel_func
+                        logger.info(f"Loaded {op_name} from {impl_file} -> {pytorch_op}")
                     loaded_count += 1
                 else:
                     logger.warning(f"Could not map {op_name} to PyTorch operation")
@@ -68,23 +74,44 @@ def _load_kernel_from_file(self, file_path: str, op_name: str) -> Callable:
         if hasattr(module, kernel_func_name):
             return getattr(module, kernel_func_name)
         else:
-            raise ValueError(f"No callable function found in {file_path}")
-
-    def _find_pytorch_op(self, op_name: str):
-        """Map operation name to PyTorch operation."""
-        # Try common patterns
-        try:
-            return getattr(torch.ops.aten, op_name).default
-        except AttributeError:
-            pass
-
-        try:
-            return getattr(torch.ops.aten, op_name).Tensor
-        except AttributeError:
-            pass
-
-        # Not 100% sure this is right, will need to iterate over all ops
-        return None
+            raise ValueError(f"No function named {kernel_func_name} found in {file_path}")
+
+    def _find_pytorch_ops(self, op_name: str):
+        """Map operation name to PyTorch operations.
+
+        Returns a list of PyTorch operations that match the directory name.
+        This handles the common case where a directory name like 'add' should map
+        to multiple overloads like add.default, add.Tensor, etc.
+        """
+        matched_ops = []
+
+        # Handle suffixed directory names (e.g., add_out -> add.out)
+        base_name = op_name
+        suffix = None
+        if "_" in op_name:
+            parts = op_name.rsplit("_", 1)
+            if parts[1] in ["out", "inplace", "scalar"]:
+                base_name = parts[0]
+                suffix = parts[1]
+
+        # Try to find the operation in torch.ops.aten
+        if hasattr(torch.ops.aten, base_name):
+            aten_op = getattr(torch.ops.aten, base_name)
+
+            # If we have a specific suffix, try to get that overload
+            if suffix and hasattr(aten_op, suffix):
+                matched_ops.append(getattr(aten_op, suffix))
+            else:
+                # Otherwise, try common overloads
+                for overload in ["default", "Tensor", "Scalar", "int", "float"]:
+                    if hasattr(aten_op, overload):
+                        op = getattr(aten_op, overload)
+                        matched_ops.append(op)
+
+        # Also check for operations that might be in other namespaces
+        # This could be extended based on actual usage patterns
+
+        return matched_ops
 
     def __getitem__(self, key):
         if key in self.compiled_kernels:
@@ -93,4 +120,4 @@ def __getitem__(self, key):
         return key
 
     def __contains__(self, key):
-        return key in self.compiled_kernels or True  # Always claim to contain ops for fallback
+        return key in self.compiled_kernels
diff --git a/BackendBench/scripts/create_simple_test_ops.py b/BackendBench/scripts/create_simple_test_ops.py
index e26fd4f..7a8d04d 100644
--- a/BackendBench/scripts/create_simple_test_ops.py
+++ b/BackendBench/scripts/create_simple_test_ops.py
@@ -19,7 +19,7 @@
 
 def create_relu():
     os.makedirs("generated_kernels/relu", exist_ok=True)
-    with open("generated_kernels/relu/relu_implementation_1.py", "w") as f:
+    with open("generated_kernels/relu/relu_implementation_v1.py", "w") as f:
         f.write('''import torch
 
 def relu_kernel_impl(input):
@@ -37,7 +37,7 @@ def relu_kernel_impl(input):
 
 def create_add():
     os.makedirs("generated_kernels/add", exist_ok=True)
-    with open("generated_kernels/add/add_implementation_1.py", "w") as f:
+    with open("generated_kernels/add/add_implementation_v1.py", "w") as f:
         f.write('''import torch
 
 def add_kernel_impl(input, other):
@@ -56,7 +56,7 @@ def add_kernel_impl(input, other):
 
 def create_mul():
     os.makedirs("generated_kernels/mul", exist_ok=True)
-    with open("generated_kernels/mul/mul_implementation_1.py", "w") as f:
+    with open("generated_kernels/mul/mul_implementation_v1.py", "w") as f:
         f.write('''import torch
 
 def mul_kernel_impl(input, other):
@@ -75,7 +75,7 @@ def mul_kernel_impl(input, other):
 
 def create_abs():
     os.makedirs("generated_kernels/abs", exist_ok=True)
-    with open("generated_kernels/abs/abs_implementation_1.py", "w") as f:
+    with open("generated_kernels/abs/abs_implementation_v1.py", "w") as f:
         f.write('''import torch
 
 def abs_kernel_impl(input):
@@ -93,7 +93,7 @@ def abs_kernel_impl(input):
 
 def create_sum():
     os.makedirs("generated_kernels/sum", exist_ok=True)
-    with open("generated_kernels/sum/sum_implementation_1.py", "w") as f:
+    with open("generated_kernels/sum/sum_implementation_v1.py", "w") as f:
         f.write('''import torch
 
 def sum_kernel_impl(input, *args, **kwargs):
@@ -122,8 +122,8 @@ def main():
 
     logger.info("Created 5 simple kernel implementations in generated_kernels/")
     logger.info("Test them individually:")
-    logger.info("  python generated_kernels/relu/relu_implementation_1.py")
-    logger.info("  python generated_kernels/add/add_implementation_1.py")
+    logger.info("  python generated_kernels/relu/relu_implementation_v1.py")
+    logger.info("  python generated_kernels/add/add_implementation_v1.py")
     logger.info("  etc.")
     logger.info("Or test all with the backend:")
     logger.info("  python test/test_simple_directory_backend.py")
diff --git a/BackendBench/scripts/create_watermarked_operators.py b/BackendBench/scripts/create_watermarked_operators.py
new file mode 100755
index 0000000..282c226
--- /dev/null
+++ b/BackendBench/scripts/create_watermarked_operators.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Create watermarked operator implementations that return constant tensors.
+These implementations will verify monkey patching works but will fail correctness tests.
+"""
+
+import os
+import argparse
+from pathlib import Path
+
+
+WATERMARK_VALUE = 42.0
+
+
+def create_watermarked_impl(op_name: str, watermark_value: float = WATERMARK_VALUE) -> str:
+    """Generate a watermarked implementation that returns a constant tensor."""
+
+    return f'''# Watermarked implementation for {op_name} operator
+# This implementation returns a constant tensor to verify monkey patching
+
+import torch
+
+def {op_name}_kernel_impl(*args, **kwargs):
+    """Watermarked implementation of {op_name}.
+    
+    Returns a tensor filled with {watermark_value} to verify the operator
+    is being called through DirectoryBackend. This will fail correctness
+    tests but confirms the monkey patching mechanism is working.
+    """
+    # Find the first tensor argument to determine output shape and device
+    tensor_arg = None
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            tensor_arg = arg
+            break
+    
+    if tensor_arg is not None:
+        # Return a tensor with same shape, dtype, and device as input
+        result = torch.full_like(tensor_arg, {watermark_value})
+        return result
+    else:
+        # Fallback for operators without tensor inputs
+        # Return a scalar tensor
+        return torch.tensor({watermark_value})
+'''
+
+
+def create_watermarked_operators(
+    base_dir: str = "generated_kernels",
+    watermark_value: float = WATERMARK_VALUE,
+    overwrite: bool = False,
+):
+    """Create watermarked implementations for all operators in the directory structure."""
+
+    base_path = Path(base_dir)
+    if not base_path.exists():
+        print(f"Error: Directory {base_path} does not exist.")
+        print("Please run setup_operator_directories.py first.")
+        return
+
+    created_count = 0
+    skipped_count = 0
+
+    # Iterate through all operator directories
+    for op_dir in base_path.iterdir():
+        if not op_dir.is_dir() or op_dir.name == "__pycache__":
+            continue
+
+        op_name = op_dir.name
+        impl_file = op_dir / f"{op_name}_implementation_v1.py"
+
+        # Skip if file exists and overwrite is False
+        if impl_file.exists() and not overwrite:
+            skipped_count += 1
+            continue
+
+        # Create watermarked implementation
+        impl_content = create_watermarked_impl(op_name, watermark_value)
+        impl_file.write_text(impl_content)
+        created_count += 1
+
+    print("\nWatermarked operator creation complete:")
+    print(f"- Created {created_count} watermarked implementations")
+    print(f"- Skipped {skipped_count} existing implementations")
+    print(f"- Watermark value: {watermark_value}")
+    print(f"- Base directory: {base_path.absolute()}")
+
+    # Create a verification script
+    verification_script = base_path / "verify_watermarks.py"
+    verification_content = f'''#!/usr/bin/env python3
+"""Verify that watermarked operators are being loaded correctly."""
+
+import torch
+from BackendBench.backends import DirectoryBackend
+
+# Expected watermark value
+WATERMARK_VALUE = {watermark_value}
+
+# Load the backend
+backend = DirectoryBackend("{base_dir}")
+
+# Test a few operators
+test_ops = ["relu", "add", "mul", "sub", "div"]
+
+print(f"Testing watermarked operators (expected value: {{WATERMARK_VALUE}})...")
+print(f"Loaded {{len(backend.compiled_kernels)}} operators\\n")
+
+for op_name in test_ops:
+    # Try to find the operator
+    found = False
+    for torch_op in backend.compiled_kernels:
+        if op_name in str(torch_op):
+            # Test the operator
+            try:
+                x = torch.tensor([1.0, 2.0, 3.0])
+                result = backend[torch_op](x)
+                
+                if torch.allclose(result, torch.full_like(x, WATERMARK_VALUE)):
+                    print(f"✓ {{op_name}}: Watermark detected correctly")
+                else:
+                    print(f"✗ {{op_name}}: Unexpected result {{result}}")
+                
+                found = True
+                break
+            except Exception as e:
+                print(f"✗ {{op_name}}: Error - {{e}}")
+                found = True
+                break
+    
+    if not found:
+        print(f"? {{op_name}}: Not found in loaded operators")
+'''
+
+    verification_script.write_text(verification_content)
+    os.chmod(verification_script, 0o755)
+
+    print(f"\nCreated verification script: {verification_script}")
+    print("\nTo verify watermarks are working:")
+    print(f"  python {verification_script}")
+    print("\nTo test with evaluation harness (should fail correctness):")
+    print("  python -m BackendBench.scripts.main --backend directory --ops relu,add --suite smoke")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Create watermarked operator implementations for testing"
+    )
+    parser.add_argument(
+        "--base-dir",
+        default="generated_kernels",
+        help="Base directory containing operator subdirectories",
+    )
+    parser.add_argument(
+        "--watermark-value",
+        type=float,
+        default=WATERMARK_VALUE,
+        help=f"Value to use for watermarking (default: {WATERMARK_VALUE})",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing implementation files"
+    )
+
+    args = parser.parse_args()
+
+    create_watermarked_operators(args.base_dir, args.watermark_value, args.overwrite)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/BackendBench/scripts/debug_operator_mapping.py b/BackendBench/scripts/debug_operator_mapping.py
new file mode 100644
index 0000000..936940a
--- /dev/null
+++ b/BackendBench/scripts/debug_operator_mapping.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+"""
+Debug script to show how TorchBench operator names map to DirectoryBackend folder names.
+Creates a CSV file showing the mapping for debugging purposes.
+
+Usage:
+    python -m BackendBench.scripts.debug_operator_mapping
+
+Output:
+    torchbench_operator_folder_mapping.csv - CSV file with operator mappings
+"""
+
+import csv
+from pathlib import Path
+from BackendBench.backends.directory import DirectoryBackend
+
+
+def get_operator_mapping():
+    """Get the mapping from TorchBench operators to folder names."""
+    mappings = []
+
+    # Create a DirectoryBackend to see what operators it loads
+    backend = DirectoryBackend("generated_kernels")
+
+    print(f"DirectoryBackend loaded {len(backend.compiled_kernels)} operators")
+
+    # Get all the folder names that exist
+    generated_kernels = Path("generated_kernels")
+    if generated_kernels.exists():
+        folder_names = [d.name for d in generated_kernels.iterdir() if d.is_dir()]
+        print(f"Found {len(folder_names)} folders in generated_kernels/")
+    else:
+        print("No generated_kernels directory found")
+        return []
+
+    # For each loaded operator, find its folder
+    for pytorch_op in sorted(backend.compiled_kernels.keys(), key=str):
+        op_str = str(pytorch_op)
+
+        # Extract the base name (e.g., "add" from "aten.add.Tensor")
+        if "aten." in op_str:
+            base_name = op_str.split("aten.")[1].split(".")[0]
+        else:
+            base_name = "unknown"
+
+        # Find the folder that maps to this operator by checking which folder
+        # the DirectoryBackend actually uses for this operator
+        folder_name = None
+
+        # Check each folder to see which one would produce this operator
+        for folder in folder_names:
+            test_backend = DirectoryBackend.__new__(DirectoryBackend)
+            test_ops = test_backend._find_pytorch_ops(folder)
+            if pytorch_op in test_ops:
+                folder_name = folder
+                break
+
+        # Get overload info
+        overload = "unknown"
+        if "." in op_str and "aten." in op_str:
+            parts = op_str.split(".")
+            if len(parts) >= 3:
+                overload = parts[2]
+
+        mappings.append(
+            {
+                "pytorch_operator": op_str,
+                "base_name": base_name,
+                "overload": overload,
+                "folder_name": folder_name or "NOT_FOUND",
+                "is_mapped": folder_name is not None,
+            }
+        )
+
+    return mappings
+
+
+def create_mapping_csv():
+    """Create a CSV file with the operator mapping."""
+    mappings = get_operator_mapping()
+
+    csv_file = "torchbench_operator_folder_mapping.csv"
+
+    with open(csv_file, "w", newline="") as f:
+        if mappings:
+            writer = csv.DictWriter(f, fieldnames=mappings[0].keys())
+            writer.writeheader()
+            writer.writerows(mappings)
+
+    print(f"\nCreated {csv_file} with {len(mappings)} operator mappings")
+
+    # Print some statistics
+    mapped_count = sum(1 for m in mappings if m["is_mapped"])
+    print(f"Successfully mapped: {mapped_count}/{len(mappings)} operators")
+
+    # Show some examples
+    print("\nExample mappings:")
+    for i, mapping in enumerate(mappings[:10]):
+        print(f"  {mapping['pytorch_operator']} -> {mapping['folder_name']}")
+
+    if len(mappings) > 10:
+        print(f"  ... and {len(mappings) - 10} more (see CSV file)")
+
+    return csv_file
+
+
+if __name__ == "__main__":
+    print("Creating TorchBench operator to folder mapping...")
+    csv_file = create_mapping_csv()
+    print(f"\nDebug CSV created: {csv_file}")
+    print("This file shows how PyTorch operators map to generated_kernels/ folder names")
diff --git a/BackendBench/scripts/setup_operator_directories.py b/BackendBench/scripts/setup_operator_directories.py
new file mode 100755
index 0000000..a9ec61c
--- /dev/null
+++ b/BackendBench/scripts/setup_operator_directories.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Setup script to create directory structure for all PyTorch operators.
+This creates empty directories that LLM researchers can fill with generated kernels.
+"""
+
+import os
+import csv
+import argparse
+from pathlib import Path
+
+# Import the generate_coverage_csv functionality
+from .generate_operator_coverage_csv import generate_coverage_csv
+
+
+def clean_op_name_for_directory(op_name: str) -> str:
+    """Convert operator name to valid directory name.
+
+    Examples:
+    - aten::add.Tensor -> add
+    - aten::add.out -> add_out
+    - aten::native_batch_norm -> native_batch_norm
+    - torch.ops.aten.add.default -> add
+    """
+    # Remove aten:: prefix
+    if op_name.startswith("aten::"):
+        op_name = op_name[6:]
+
+    # Remove torch.ops.aten. prefix
+    if op_name.startswith("torch.ops.aten."):
+        op_name = op_name[15:]
+
+    # Handle .default, .Tensor, .out suffixes
+    if "." in op_name:
+        parts = op_name.split(".")
+        base = parts[0]
+        suffix = parts[1] if len(parts) > 1 else ""
+
+        # For common suffixes, we might want to keep them to distinguish overloads
+        if suffix in ["out", "inplace", "scalar"]:
+            op_name = f"{base}_{suffix}"
+        else:
+            # For .default, .Tensor, etc., just use the base name
+            op_name = base
+
+    # Replace any remaining invalid characters
+    op_name = op_name.replace(":", "_").replace("/", "_").replace("\\", "_")
+
+    return op_name
+
+
+def create_readme_for_op(
+    op_dir: Path, op_name: str, is_core: bool, is_opinfo: bool, is_torchbench: bool
+):
+    """Create a README.md file for each operator directory."""
+    readme_path = op_dir / "README.md"
+
+    status = []
+    if is_core:
+        status.append("Core PyTorch operator")
+    if is_opinfo:
+        status.append("Has OpInfo tests")
+    if is_torchbench:
+        status.append("Used in TorchBench")
+
+    content = f"""# {op_name}
+
+Status: {", ".join(status) if status else "Regular operator"}
+
+## Implementation
+
+Place your generated kernel implementation in this directory as:
+- `{clean_op_name_for_directory(op_name)}_implementation_v1.py`
+- `{clean_op_name_for_directory(op_name)}_implementation_v2.py`
+- etc.
+
+Each implementation file should contain a function named:
+```python
+def {clean_op_name_for_directory(op_name)}_kernel_impl(*args, **kwargs):
+    # Your implementation here
+    pass
+```
+
+## Testing
+
+The DirectoryBackend will automatically load the first implementation file found in this directory.
+"""
+
+    readme_path.write_text(content)
+
+
+def setup_operator_directories(base_dir: str = "generated_kernels", include_all: bool = False):
+    """Set up directory structure for PyTorch operators."""
+
+    # First, generate the coverage CSV if it doesn't exist
+    csv_path = "pytorch_operator_coverage.csv"
+    if not os.path.exists(csv_path):
+        print("Generating operator coverage CSV...")
+        csv_path = generate_coverage_csv()
+
+    # Create base directory
+    base_path = Path(base_dir)
+    base_path.mkdir(exist_ok=True)
+
+    # Read operator data from CSV
+    operators = []
+    with open(csv_path, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            operators.append(
+                {
+                    "name": row["op_name"],
+                    "is_core": row["is_core"] == "True",
+                    "is_opinfo": row["is_in_opinfo"] == "True",
+                    "is_torchbench": row["is_in_torchbench"] == "True",
+                }
+            )
+
+    # Filter operators based on criteria
+    if not include_all:
+        # By default, only include operators that are in TorchBench
+        operators = [op for op in operators if op["is_torchbench"]]
+        print(f"Setting up directories for {len(operators)} TorchBench operators")
+    else:
+        print(f"Setting up directories for all {len(operators)} operators")
+
+    # Create directories
+    created_count = 0
+    skipped_count = 0
+
+    for op in operators:
+        op_name = op["name"]
+        dir_name = clean_op_name_for_directory(op_name)
+
+        if not dir_name:  # Skip if we couldn't clean the name
+            print(f"Skipping operator with invalid name: {op_name}")
+            skipped_count += 1
+            continue
+
+        op_dir = base_path / dir_name
+
+        if op_dir.exists():
+            skipped_count += 1
+            continue
+
+        op_dir.mkdir(exist_ok=True)
+        create_readme_for_op(op_dir, op_name, op["is_core"], op["is_opinfo"], op["is_torchbench"])
+        created_count += 1
+
+    print("\nDirectory setup complete:")
+    print(f"- Created {created_count} new directories")
+    print(f"- Skipped {skipped_count} existing directories")
+    print(f"- Base directory: {base_path.absolute()}")
+
+    # Create a main README
+    main_readme = base_path / "README.md"
+    main_readme.write_text("""# Generated Kernels Directory
+
+This directory contains subdirectories for PyTorch operators that need kernel implementations.
+
+## Structure
+
+Each subdirectory corresponds to a PyTorch operator and should contain:
+- Implementation files: `{op_name}_implementation_*.py`
+- README.md with operator information
+
+## Usage
+
+1. Navigate to the operator directory you want to implement
+2. Create your kernel implementation following the template in the README
+3. Test with DirectoryBackend: `python -m BackendBench.scripts.main --backend directory --ops {op_name}`
+
+## Operator Mapping
+
+The DirectoryBackend maps directory names to PyTorch operations as follows:
+- Directory `add` → `torch.ops.aten.add.default`
+- Directory `mul` → `torch.ops.aten.mul.default`
+- etc.
+
+For operators with multiple overloads (e.g., add.out), use suffixes:
+- Directory `add_out` → `torch.ops.aten.add.out`
+""")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Set up directory structure for PyTorch operator implementations"
+    )
+    parser.add_argument(
+        "--base-dir",
+        default="generated_kernels",
+        help="Base directory for operator implementations (default: generated_kernels)",
+    )
+    parser.add_argument(
+        "--include-all",
+        action="store_true",
+        help="Include all operators, not just TorchBench operators",
+    )
+    parser.add_argument(
+        "--regenerate-csv",
+        action="store_true",
+        help="Force regeneration of the operator coverage CSV",
+    )
+
+    args = parser.parse_args()
+
+    # Remove existing CSV if regeneration is requested
+    if args.regenerate_csv and os.path.exists("pytorch_operator_coverage.csv"):
+        os.remove("pytorch_operator_coverage.csv")
+        print("Removed existing CSV, will regenerate...")
+
+    setup_operator_directories(args.base_dir, args.include_all)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/test_backend_evaluation.py b/test/test_backend_evaluation.py
new file mode 100644
index 0000000..3412ae0
--- /dev/null
+++ b/test/test_backend_evaluation.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Comprehensive test for BackendBench evaluation system.
+
+Tests:
+1. DirectoryBackend loads operators correctly
+2. Watermarked implementations fail correctness (proving monkey patching works)
+3. Main script evaluation works end-to-end
+4. eval.py integration works properly
+"""
+
+import sys
+import unittest
+import subprocess
+from pathlib import Path
+
+import torch
+
+# Add BackendBench to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from BackendBench.backends import DirectoryBackend
+from BackendBench.eval import eval_correctness, eval_one_op
+from BackendBench.suite import Test
+
+
+class TestBackendEvaluation(unittest.TestCase):
+    """Comprehensive test for backend evaluation system."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Generate required directory structure and operators."""
+        # Generate the directory structure
+        subprocess.run(
+            [sys.executable, "-m", "BackendBench.scripts.setup_operator_directories"], check=True
+        )
+        # Create watermarked implementations
+        subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "BackendBench.scripts.create_watermarked_operators",
+                "--overwrite",
+            ],
+            check=True,
+        )
+
+    def test_1_directory_backend_loads_operators(self):
+        """Test 1: Verify DirectoryBackend loads operators correctly."""
+        print("\n" + "=" * 60)
+        print("TEST 1: DirectoryBackend Operator Loading")
+        print("=" * 60)
+
+        backend = DirectoryBackend("generated_kernels")
+        operator_count = len(backend.compiled_kernels)
+
+        print(f"\n📊 Loaded {operator_count} operators")
+
+        # List some examples
+        print("\n📋 Sample operators:")
+        for i, op in enumerate(list(backend.compiled_kernels.keys())[:5]):
+            print(f"   {i + 1}. {op}")
+        print(f"   ... and {operator_count - 5} more")
+
+        # Verify we loaded a substantial number
+        self.assertGreater(operator_count, 100, "Should load many operators from generated_kernels")
+
+        print(f"\n✅ SUCCESS: DirectoryBackend loaded {operator_count} total operators")
+
+    def test_2_watermarked_implementations_fail_correctness(self):
+        """Test 2: Verify watermarked operators fail eval_correctness (proving monkey patching)."""
+        print("\n" + "=" * 60)
+        print("TEST 2: Watermarked Implementation Correctness")
+        print("=" * 60)
+
+        backend = DirectoryBackend("generated_kernels")
+
+        print("\n🧪 Testing watermarked operators with eval_correctness:")
+
+        failed_count = 0
+        total_tested = 0
+
+        # Test several operators that should have watermarked implementations
+        test_ops = [
+            (
+                torch.ops.aten.bitwise_and.Tensor,
+                lambda: torch.tensor([1, 2, 3]),
+                lambda: torch.tensor([2, 3, 4]),
+            ),
+            (
+                torch.ops.aten.fmod.Tensor,
+                lambda: torch.tensor([5.0, 7.0]),
+                lambda: torch.tensor([2.0, 3.0]),
+            ),
+        ]
+
+        for op, *arg_generators in test_ops:
+            if op in backend:
+                try:
+                    impl = backend[op]
+                    test = Test(*arg_generators)
+                    correctness = eval_correctness(op, impl, [test])
+
+                    total_tested += 1
+                    if correctness == 0.0:
+                        failed_count += 1
+                        print(f"  ✓ {str(op).split('.')[-2]}: Failed correctness (watermarked)")
+                    else:
+                        print(f"  ✗ {str(op).split('.')[-2]}: Passed correctness unexpectedly")
+
+                except Exception as e:
+                    print(f"  ? {str(op).split('.')[-2]}: Error testing - {e}")
+
+        print(f"\n📊 Results: {failed_count}/{total_tested} operators failed correctness")
+        print("   This proves our watermarked implementations are being used!")
+
+        self.assertGreater(failed_count, 0, "At least some watermarked ops should fail")
+
+    def test_3_main_script_evaluation(self):
+        """Test 3: Verify main.py script works with DirectoryBackend."""
+        print("\n" + "=" * 60)
+        print("TEST 3: Main Script Evaluation")
+        print("=" * 60)
+
+        cmd = [
+            sys.executable,
+            "-m",
+            "BackendBench.scripts.main",
+            "--backend",
+            "directory",
+            "--suite",
+            "smoke",
+            "--log-level",
+            "ERROR",
+        ]
+
+        print("\n🚀 Running: " + " ".join(cmd))
+        print("   (This uses eval.py internally for correctness evaluation)")
+
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+
+        print("\n📊 Evaluation Results:")
+        if result.stdout:
+            lines = result.stdout.strip().split("\n")
+            for line in lines:
+                if "score" in line:
+                    print(f"   {line}")
+
+        # Should complete without crashing
+        self.assertEqual(result.returncode, 0, "Main script should complete successfully")
+
+        print("\n✅ SUCCESS: Main script evaluation completed")
+
+    def test_4_eval_integration(self):
+        """Test 4: Verify eval.py functions work correctly."""
+        print("\n" + "=" * 60)
+        print("TEST 4: eval.py Integration")
+        print("=" * 60)
+
+        backend = DirectoryBackend("generated_kernels")
+
+        print("\n🔧 Testing eval_one_op function:")
+
+        # Find a watermarked operator to test
+        test_op = None
+        for op in backend.compiled_kernels.keys():
+            if "bitwise_and" in str(op) and "Tensor" in str(op):
+                test_op = op
+                break
+
+        if test_op:
+            impl = backend[test_op]
+            test = Test(lambda: torch.tensor([1, 2, 3]), lambda: torch.tensor([2, 3, 4]))
+
+            correctness, performance = eval_one_op(test_op, impl, [test], [test])
+
+            print(f"   Operation: {test_op}")
+            print(f"   Correctness: {correctness}")
+            print(f"   Performance: {performance}")
+
+            # Watermarked implementation should fail correctness
+            self.assertEqual(correctness, 0.0, "Watermarked implementation should fail correctness")
+
+            print("   ✓ eval_one_op works correctly with watermarked implementation")
+        else:
+            print("   ! No suitable test operator found, skipping detailed test")
+
+        print("\n✅ SUCCESS: eval.py integration verified")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/test_directory_backend.py b/test/test_directory_backend.py
index 19856d6..220dd8c 100644
--- a/test/test_directory_backend.py
+++ b/test/test_directory_backend.py
@@ -20,15 +20,12 @@
 
 @pytest.fixture(scope="module")
 def backend():
-    expected_dirs = ["relu", "add", "mul", "abs", "sum"]
-    missing_dirs = [d for d in expected_dirs if not os.path.isdir(f"generated_kernels/{d}")]
-
-    if missing_dirs:
-        import subprocess
+    # Always create correct test implementations, overriding any watermarked ones
+    import subprocess
 
-        subprocess.run(
-            [sys.executable, "BackendBench/scripts/create_simple_test_ops.py"], check=True
-        )
+    subprocess.run(
+        [sys.executable, "-m", "BackendBench.scripts.create_simple_test_ops"], check=True
+    )
 
     return DirectoryBackend(ops_dir="generated_kernels")