Skip to content

Commit 7d3dcd9

Browse files
committed
Cherry pick Neuron support for kernels
This change does not add support for `build2cmake`. For building kernels, use the `main` branch until the next major release.
1 parent 2e940cc commit 7d3dcd9

File tree

8 files changed

+133
-24
lines changed

8 files changed

+133
-24
lines changed

builder/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ See [dockerfiles/README.md](./dockerfiles/README.md) for more options, including
6363
| XPU |||| 2 |
6464
| Metal |||| 2 |
6565
| Huawei NPU |||| 3 |
66+
| Neuron | ✅ | ❌ | ❌ | 3 |
67+
68+
**Warning:** Neuron support is experimental and currently requires pre-release packages.
6669

6770
# 📚 Documentation
6871

kernels/src/kernels/layer/kernelize.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
274274

275275
def _validate_device_type(device_type: str) -> None:
276276
"""Validate that the device type is supported."""
277-
supported_devices = {"cpu", "cuda", "mps", "npu", "rocm", "xpu"}
277+
supported_devices = {"cpu", "cuda", "mps", "neuron", "npu", "rocm", "xpu"}
278278
if device_type not in supported_devices:
279279
raise ValueError(
280280
f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}"
@@ -310,3 +310,9 @@ def _is_rocm_platform():
310310
import torch
311311

312312
return torch.version.hip is not None
313+
314+
315+
def _has_neuron_ops():
316+
import torch
317+
318+
return hasattr(torch, "neuron")

kernels/src/kernels/layer/repos.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def create_repo(device: Device) -> "DeviceRepos":
3636
return _XPURepos()
3737
elif device.type == "npu":
3838
return _NPURepos()
39+
elif device.type == "neuron":
40+
return _NeuronRepos()
3941
else:
4042
raise ValueError(f"Unknown device type: {device.type}")
4143

@@ -93,6 +95,26 @@ def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]):
9395
self._repos = repos
9496

9597

98+
class _NeuronRepos(DeviceRepos):
99+
_repos: dict[Mode, RepositoryProtocol]
100+
101+
def __init__(self):
102+
super().__init__()
103+
self._repos = {}
104+
105+
@property
106+
def repos(
107+
self,
108+
) -> dict[Mode, RepositoryProtocol] | None:
109+
return self._repos
110+
111+
def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]):
112+
if device.type != "neuron":
113+
raise ValueError(f"Device type must be 'neuron', got {device.type}")
114+
115+
self._repos = repos
116+
117+
96118
class _NPURepos(DeviceRepos):
97119
_repos: dict[Mode, RepositoryProtocol]
98120

kernels/src/kernels/python_depends.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
}
1515
},
1616
"metal": {},
17+
"neuron": {
18+
"nki": {
19+
"nix": [],
20+
"python": ["nki"]
21+
}
22+
},
1723
"rocm": {},
1824
"xpu": {
1925
"onednn": {

kernels/src/kernels/utils.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ def _get_cache_dir() -> str | None:
2828
"""Returns the kernels cache directory."""
2929
cache_dir = os.environ.get("HF_KERNELS_CACHE", None)
3030
if cache_dir is not None:
31-
logging.warning(
32-
"HF_KERNELS_CACHE will be removed in the future, use KERNELS_CACHE instead"
33-
)
31+
logging.warning("HF_KERNELS_CACHE will be removed in the future, use KERNELS_CACHE instead")
3432
return cache_dir
3533

3634
return os.environ.get("KERNELS_CACHE", None)
@@ -50,7 +48,11 @@ def _get_privateuse_backend_name() -> str | None:
5048
def backend() -> str:
5149
import torch
5250

53-
if torch.version.cuda is not None:
51+
if hasattr(torch, "neuron"):
52+
# Needs to be sorted before specific Torch builds, since Neuron
53+
# extension can be loaded into e.g. CUDA Torch builds.
54+
return "neuron"
55+
elif torch.version.cuda is not None:
5456
return "cuda"
5557
elif torch.version.hip is not None:
5658
return "hip"
@@ -104,7 +106,11 @@ def build_variant() -> str:
104106
def build_variant_noarch() -> str:
105107
import torch
106108

107-
if torch.version.cuda is not None:
109+
if hasattr(torch, "neuron"):
110+
# Needs to be sorted before specific Torch builds, since Neuron
111+
# extension can be loaded into e.g. CUDA Torch builds.
112+
return "torch-neuron"
113+
elif torch.version.cuda is not None:
108114
return "torch-cuda"
109115
elif torch.version.hip is not None:
110116
return "torch-rocm"
@@ -197,9 +203,7 @@ def install_kernel(
197203
try:
198204
return _find_kernel_in_repo_path(repo_path, package_name, variant_locks)
199205
except FileNotFoundError:
200-
raise FileNotFoundError(
201-
f"Cannot install kernel from repo {repo_id} (revision: {revision})"
202-
)
206+
raise FileNotFoundError(f"Cannot install kernel from repo {repo_id} (revision: {revision})")
203207

204208

205209
def _find_kernel_in_repo_path(
@@ -264,9 +268,7 @@ def install_kernel_all_variants(
264268
if variant_lock is None:
265269
raise ValueError(f"No lock found for build variant: {variant}")
266270

267-
validate_kernel(
268-
repo_path=repo_path, variant=variant, hash=variant_lock.hash
269-
)
271+
validate_kernel(repo_path=repo_path, variant=variant, hash=variant_lock.hash)
270272

271273
return repo_path / "build"
272274

@@ -309,9 +311,7 @@ def get_kernel(
309311
```
310312
"""
311313
revision = select_revision_or_version(repo_id, revision=revision, version=version)
312-
package_name, variant_path = install_kernel(
313-
repo_id, revision=revision, user_agent=user_agent
314-
)
314+
package_name, variant_path = install_kernel(repo_id, revision=revision, user_agent=user_agent)
315315
return _import_from_path(package_name, variant_path)
316316

317317

@@ -344,9 +344,7 @@ def get_local_kernel(repo_path: Path, package_name: str) -> ModuleType:
344344
raise FileNotFoundError(f"Could not find package '{package_name}' in {repo_path}")
345345

346346

347-
def has_kernel(
348-
repo_id: str, revision: str | None = None, version: int | str | None = None
349-
) -> bool:
347+
def has_kernel(repo_id: str, revision: str | None = None, version: int | str | None = None) -> bool:
350348
"""
351349
Check whether a kernel build exists for the current environment (Torch version and compute framework).
352350
@@ -419,9 +417,7 @@ def load_kernel(repo_id: str, *, lockfile: Path | None) -> ModuleType:
419417
)
420418

421419
try:
422-
package_name, variant_path = _find_kernel_in_repo_path(
423-
repo_path, package_name, variant_locks=None
424-
)
420+
package_name, variant_path = _find_kernel_in_repo_path(repo_path, package_name, variant_locks=None)
425421
return _import_from_path(package_name, variant_path)
426422
except FileNotFoundError:
427423
raise FileNotFoundError(
@@ -447,9 +443,7 @@ def get_locked_kernel(repo_id: str, local_files_only: bool = False) -> ModuleTyp
447443
if locked_sha is None:
448444
raise ValueError(f"Kernel `{repo_id}` is not locked")
449445

450-
package_name, variant_path = install_kernel(
451-
repo_id, locked_sha, local_files_only=local_files_only
452-
)
446+
package_name, variant_path = install_kernel(repo_id, locked_sha, local_files_only=local_files_only)
453447

454448
return _import_from_path(package_name, variant_path)
455449

kernels/tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
and torch.version.cuda is not None
1111
and torch.cuda.device_count() > 0
1212
)
13+
14+
has_neuron = hasattr(torch, "neuron") and torch.neuron.device_count() > 0
15+
1316
has_rocm = (
1417
hasattr(torch.version, "hip")
1518
and torch.version.hip is not None
@@ -46,6 +49,8 @@ def device():
4649
def pytest_runtest_setup(item):
4750
if "cuda_only" in item.keywords and not has_cuda:
4851
pytest.skip("skipping CUDA-only test on host without CUDA")
52+
if "neuron_only" in item.keywords and not has_neuron:
53+
pytest.skip("skipping Neuron-only test on host without Neuron")
4954
if "rocm_only" in item.keywords and not has_rocm:
5055
pytest.skip("skipping ROCm-only test on host without ROCm")
5156
if "darwin_only" in item.keywords and not sys.platform.startswith("darwin"):

kernels/tests/test_basic.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,13 @@ def test_flattened_build(repo_revision, device):
199199
torch.testing.assert_close(kernel.silu_and_mul(x), silu_and_mul_torch(x))
200200

201201

202+
@pytest.mark.neuron_only
203+
def test_neuron():
204+
relu = get_kernel("kernels-test/relu-nki", version=1)
205+
x = torch.randn((16, 16), dtype=torch.float16).to(device="neuron")
206+
torch.testing.assert_close(relu.relu(x), x.relu())
207+
208+
202209
def silu_and_mul_torch(x: torch.Tensor):
203210
d = x.shape[-1] // 2
204211
return F.silu(x[..., :d]) * x[..., d:]

kernels/tests/test_layer.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,23 @@ class RMSNormWithKernel(RMSNorm):
8484
pass
8585

8686

87+
class ReLU(nn.Module):
88+
def __init__(self):
89+
super().__init__()
90+
# Used to check that we called hub kernel.
91+
self.n_calls = 0
92+
93+
def forward(self, input: torch.Tensor) -> torch.Tensor:
94+
self.n_calls += 1
95+
d = input.shape[-1] // 2
96+
return F.relu(input)
97+
98+
99+
@use_kernel_forward_from_hub("ReLU")
100+
class ReLUWithKernel(ReLU):
101+
pass
102+
103+
87104
class SiluAndMul(nn.Module):
88105
def __init__(self):
89106
super().__init__()
@@ -188,6 +205,55 @@ def test_hub_func(cls):
188205
assert silu_and_mul_with_kernel.n_calls == 0
189206

190207

208+
@pytest.mark.neuron_only
209+
def test_hub_forward_neuron():
210+
torch.manual_seed(0)
211+
212+
mapping = {
213+
"ReLU": {
214+
"neuron": LayerRepository(
215+
repo_id="kernels-test/relu-nki", version=1, layer_name="ReLU"
216+
)
217+
}
218+
}
219+
220+
relu = ReLU()
221+
X = torch.randn((16, 16), device="neuron")
222+
Y = relu(X)
223+
224+
with use_kernel_mapping(mapping):
225+
relu_with_kernel = kernelize(
226+
ReLUWithKernel(), device="neuron", mode=Mode.INFERENCE
227+
)
228+
Y_kernel = relu_with_kernel(X)
229+
230+
torch.testing.assert_close(Y_kernel, Y)
231+
232+
assert relu.n_calls == 1
233+
assert relu_with_kernel.n_calls == 0
234+
235+
# Check that the device type can be determined automatically.
236+
class SMOL(nn.Module):
237+
def __init__(self):
238+
super().__init__()
239+
self.linear = nn.Linear(16, 16)
240+
self.relu = ReLUWithKernel()
241+
242+
def forward(self, x):
243+
return self.relu(self.linear(x))
244+
245+
smol = SMOL().to("neuron")
246+
247+
Y = smol(X)
248+
249+
with use_kernel_mapping(mapping):
250+
smol = kernelize(smol, mode=Mode.INFERENCE)
251+
Y_kernel = smol(X)
252+
253+
torch.testing.assert_close(Y, Y_kernel)
254+
assert smol.relu.n_calls == 1
255+
256+
191257
@pytest.mark.rocm_only
192258
def test_hub_forward_rocm():
193259
torch.manual_seed(0)

0 commit comments

Comments
 (0)