kernels: add support for Neuron and NKI (huggingface#285)

danieldk · web-flow · commit d8bb02b838c7 · 2026-03-04T10:02:23.000+01:00
* Add basic Neuron + NKI support to `kernels`

* Add Neuron layer support

* Fix accidental import removal

* Exclude Neuron in init test

* Sync with latest neuronx

* build2cmake: add Neuron support

Also add an example kernel using NKI.

* Add Neuron to the builder README

* Fix typo
diff --git a/build2cmake/src/config/mod.rs b/build2cmake/src/config/mod.rs
@@ -44,6 +44,7 @@ pub struct General {
     pub python_depends: Option<Vec<String>>,
 
     pub cuda: Option<CudaGeneral>,
+    pub neuron: Option<NeuronGeneral>,
     pub xpu: Option<XpuGeneral>,
 }
 
@@ -106,6 +107,10 @@ pub struct XpuGeneral {
     pub python_depends: Option<Vec<String>>,
 }
 
+pub struct NeuronGeneral {
+    pub python_depends: Option<Vec<String>>,
+}
+
 pub struct Hub {
     pub repo_id: Option<String>,
     pub branch: Option<String>,
@@ -237,16 +242,18 @@ pub enum Backend {
     Cpu,
     Cuda,
     Metal,
+    Neuron,
     Rocm,
     Xpu,
 }
 
 impl Backend {
-    pub const fn all() -> [Backend; 5] {
+    pub const fn all() -> [Backend; 6] {
         [
             Backend::Cpu,
             Backend::Cuda,
             Backend::Metal,
+            Backend::Neuron,
             Backend::Rocm,
             Backend::Xpu,
         ]
@@ -259,6 +266,7 @@ impl Display for Backend {
             Backend::Cpu => write!(f, "cpu"),
             Backend::Cuda => write!(f, "cuda"),
             Backend::Metal => write!(f, "metal"),
+            Backend::Neuron => write!(f, "neuron"),
             Backend::Rocm => write!(f, "rocm"),
             Backend::Xpu => write!(f, "xpu"),
         }
@@ -273,6 +281,7 @@ impl FromStr for Backend {
             "cpu" => Ok(Backend::Cpu),
             "cuda" => Ok(Backend::Cuda),
             "metal" => Ok(Backend::Metal),
+            "neuron" => Ok(Backend::Neuron),
             "rocm" => Ok(Backend::Rocm),
             "xpu" => Ok(Backend::Xpu),
             _ => Err(format!("Unknown backend: {s}")),
diff --git a/build2cmake/src/config/v1.rs b/build2cmake/src/config/v1.rs
@@ -86,6 +86,7 @@ impl TryFrom<Build> for super::Build {
                 Backend::Cpu,
                 Backend::Cuda,
                 Backend::Metal,
+                Backend::Neuron,
                 Backend::Rocm,
                 Backend::Xpu,
             ]
@@ -102,6 +103,7 @@ impl TryFrom<Build> for super::Build {
                 license: None,
                 backends,
                 hub: None,
+                neuron: None,
                 python_depends: None,
                 cuda: None,
                 xpu: None,
diff --git a/build2cmake/src/config/v2.rs b/build2cmake/src/config/v2.rs
@@ -132,6 +132,7 @@ impl TryFrom<Build> for super::Build {
                 Backend::Cpu,
                 Backend::Cuda,
                 Backend::Metal,
+                Backend::Neuron,
                 Backend::Rocm,
                 Backend::Xpu,
             ]
@@ -168,6 +169,7 @@ impl General {
             backends,
             cuda,
             hub: general.hub.map(Into::into),
+            neuron: None,
             python_depends: None,
             xpu: None,
         }
diff --git a/build2cmake/src/config/v3.rs b/build2cmake/src/config/v3.rs
@@ -31,6 +31,8 @@ pub struct General {
 
     pub hub: Option<Hub>,
 
+    pub neuron: Option<NeuronGeneral>,
+
     pub python_depends: Option<Vec<String>>,
 
     pub xpu: Option<XpuGeneral>,
@@ -44,6 +46,12 @@ pub struct CudaGeneral {
     pub python_depends: Option<Vec<String>>,
 }
 
+#[derive(Debug, Deserialize, Serialize)]
+#[serde(deny_unknown_fields, rename_all = "kebab-case")]
+pub struct NeuronGeneral {
+    pub python_depends: Option<Vec<String>>,
+}
+
 #[derive(Debug, Deserialize, Serialize)]
 #[serde(deny_unknown_fields, rename_all = "kebab-case")]
 pub struct XpuGeneral {
@@ -121,6 +129,7 @@ pub enum Backend {
     Cpu,
     Cuda,
     Metal,
+    Neuron,
     Rocm,
     Xpu,
 }
@@ -150,6 +159,7 @@ impl From<General> for super::General {
             backends: general.backends.into_iter().map(Into::into).collect(),
             cuda: general.cuda.map(Into::into),
             hub: general.hub.map(Into::into),
+            neuron: general.neuron.map(Into::into),
             python_depends: general.python_depends,
             xpu: general.xpu.map(Into::into),
         }
@@ -166,6 +176,14 @@ impl From<CudaGeneral> for super::CudaGeneral {
     }
 }
 
+impl From<NeuronGeneral> for super::NeuronGeneral {
+    fn from(neuron: NeuronGeneral) -> Self {
+        Self {
+            python_depends: neuron.python_depends,
+        }
+    }
+}
+
 impl From<XpuGeneral> for super::XpuGeneral {
     fn from(xpu: XpuGeneral) -> Self {
         Self {
@@ -201,6 +219,7 @@ impl From<Backend> for super::Backend {
             Backend::Cpu => super::Backend::Cpu,
             Backend::Cuda => super::Backend::Cuda,
             Backend::Metal => super::Backend::Metal,
+            Backend::Neuron => super::Backend::Neuron,
             Backend::Rocm => super::Backend::Rocm,
             Backend::Xpu => super::Backend::Xpu,
         }
@@ -304,6 +323,7 @@ impl From<super::General> for General {
             backends: general.backends.into_iter().map(Into::into).collect(),
             cuda: general.cuda.map(Into::into),
             hub: general.hub.map(Into::into),
+            neuron: general.neuron.map(Into::into),
             python_depends: general.python_depends,
             xpu: general.xpu.map(Into::into),
         }
@@ -320,6 +340,14 @@ impl From<super::CudaGeneral> for CudaGeneral {
     }
 }
 
+impl From<super::NeuronGeneral> for NeuronGeneral {
+    fn from(neuron: super::NeuronGeneral) -> Self {
+        Self {
+            python_depends: neuron.python_depends,
+        }
+    }
+}
+
 impl From<super::XpuGeneral> for XpuGeneral {
     fn from(xpu: super::XpuGeneral) -> Self {
         Self {
@@ -355,6 +383,7 @@ impl From<super::Backend> for Backend {
             super::Backend::Cpu => Backend::Cpu,
             super::Backend::Cuda => Backend::Cuda,
             super::Backend::Metal => Backend::Metal,
+            super::Backend::Neuron => Backend::Neuron,
             super::Backend::Rocm => Backend::Rocm,
             super::Backend::Xpu => Backend::Xpu,
         }
diff --git a/build2cmake/src/python_dependencies.json b/build2cmake/src/python_dependencies.json
@@ -14,6 +14,12 @@
       }
     },
     "metal": {},
+    "neuron": {
+      "nki": {
+        "nix": [],
+        "python": ["nki"]
+      }
+    },
     "rocm": {},
     "xpu": {
       "onednn": {
diff --git a/build2cmake/src/templates/noarch/setup.py b/build2cmake/src/templates/noarch/setup.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python
 
-import shutil
-from pathlib import Path
 from typing import Any
+from pathlib import Path
+import shutil
+import sys
 
 from setuptools import setup
 from setuptools.command.build import build
@@ -30,7 +31,10 @@ def run(self) -> None:
         """Execute the build command."""
         project_root = Path(__file__).parent
 
-        import tomllib
+        if sys.version_info >= (3, 11):
+            import tomllib
+        else:
+            import tomli as tomllib
 
         with open(project_root / "build.toml", "rb") as f:
             build_toml: dict[str, Any] = tomllib.load(f)
diff --git a/builder/README.md b/builder/README.md
@@ -63,6 +63,9 @@ See [dockerfiles/README.md](./dockerfiles/README.md) for more options, including
 | XPU        | ✓               | ✓                      | ✗                       | 2    |
 | Metal      | ✓               | ✓                      | ✗                       | 2    |
 | Huawei NPU | ✓               | ✗                      | ✗                       | 3    |
+| Neuron     | ✓               | ✗                      | ✗                       | 3    |
+
+**Warning:** Neuron support is experimental and currently requires pre-release packages.
 
 # 📚 Documentation
 
diff --git a/builder/examples/relu-nki/build.toml b/builder/examples/relu-nki/build.toml
@@ -0,0 +1,9 @@
+[general]
+name = "relu-nki"
+version = 1
+backends = [
+    "neuron",
+]
+
+[general.neuron]
+python-depends = ["nki"]
diff --git a/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py b/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py
@@ -0,0 +1,27 @@
+import nki
+import nki.language as nl
+import nki.isa as nisa
+
+from ._ops import ops
+
+
+@nki.jit(platform_target="trn2")
+def relu(x):
+    # Check the first dimension's size to ensure it does not exceed on-chip
+    # memory tile size, since this simple kernel does not tile inputs.
+    assert x.shape[0] <= nl.tile_size.pmax
+    x_tile = sbuf.view(dtype=x.dtype, shape=x.shape)
+    nisa.dma_copy(dst=x_tile, src=x)
+    out_tile = sbuf.view(dtype=x.dtype, shape=x.shape)
+    nisa.tensor_scalar(dst=out_tile, data=x_tile, operand0=0, op0=nl.maximum)
+    c_output = hbm.view(dtype=x.dtype, shape=x.shape)
+    nisa.dma_copy(dst=c_output, src=out_tile)
+    return c_output
+
+
+from . import layers
+
+__all__ = [
+    "layers",
+    "relu",
+]
diff --git a/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py b/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py
@@ -0,0 +1,9 @@
+import torch
+import torch.nn as nn
+
+from .. import relu
+
+
+class ReLU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return relu(x)
diff --git a/kernels/src/kernels/layer/kernelize.py b/kernels/src/kernels/layer/kernelize.py
@@ -273,7 +273,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 def _validate_device_type(device_type: str) -> None:
     """Validate that the device type is supported."""
-    supported_devices = {"cpu", "cuda", "mps", "npu", "rocm", "xpu"}
+    supported_devices = {"cpu", "cuda", "mps", "neuron", "npu", "rocm", "xpu"}
     if device_type not in supported_devices:
         raise ValueError(
             f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}"
@@ -309,3 +309,9 @@ def _is_rocm_platform():
     import torch
 
     return torch.version.hip is not None
+
+
+def _has_neuron_ops():
+    import torch
+
+    return hasattr(torch, "neuron")
diff --git a/kernels/src/kernels/layer/repos.py b/kernels/src/kernels/layer/repos.py
@@ -35,6 +35,8 @@ def create_repo(device: Device) -> "DeviceRepos":
             return _XPURepos()
         elif device.type == "npu":
             return _NPURepos()
+        elif device.type == "neuron":
+            return _NeuronRepos()
         else:
             raise ValueError(f"Unknown device type: {device.type}")
 
@@ -92,6 +94,26 @@ def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]):
         self._repos = repos
 
 
+class _NeuronRepos(DeviceRepos):
+    _repos: dict[Mode, RepositoryProtocol]
+
+    def __init__(self):
+        super().__init__()
+        self._repos = {}
+
+    @property
+    def repos(
+        self,
+    ) -> dict[Mode, RepositoryProtocol] | None:
+        return self._repos
+
+    def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]):
+        if device.type != "neuron":
+            raise ValueError(f"Device type must be 'neuron', got {device.type}")
+
+        self._repos = repos
+
+
 class _NPURepos(DeviceRepos):
     _repos: dict[Mode, RepositoryProtocol]
 
diff --git a/kernels/src/kernels/python_depends.json b/kernels/src/kernels/python_depends.json
@@ -14,6 +14,12 @@
       }
     },
     "metal": {},
+    "neuron": {
+      "nki": {
+        "nix": [],
+        "python": ["nki"]
+      }
+    },
     "rocm": {},
     "xpu": {
       "onednn": {
diff --git a/kernels/src/kernels/utils.py b/kernels/src/kernels/utils.py
diff --git a/kernels/tests/conftest.py b/kernels/tests/conftest.py
diff --git a/kernels/tests/test_basic.py b/kernels/tests/test_basic.py
diff --git a/kernels/tests/test_init.py b/kernels/tests/test_init.py
diff --git a/kernels/tests/test_layer.py b/kernels/tests/test_layer.py