
Commit 58743c3

kernelize gelu.
1 parent 50c0b78 · commit 58743c3

3 files changed (+37 −5 lines)

src/diffusers/models/activations.py

Lines changed: 37 additions & 2 deletions
@@ -17,10 +17,12 @@
 import torch.nn.functional as F
 from torch import nn
 
-from ..utils import deprecate
-from ..utils.import_utils import is_torch_npu_available, is_torch_version
+from ..utils import deprecate, get_logger, is_kernels_available, is_torch_npu_available, is_torch_version
+from ..utils.constants import DIFFUSERS_ENABLE_HUB_KERNELS
 
 
+logger = get_logger(__name__)
+
 if is_torch_npu_available():
     import torch_npu
 
@@ -31,6 +33,7 @@
     "gelu": nn.GELU,
     "relu": nn.ReLU,
 }
+KERNELS_REPO_ID = "kernels-community/activation"
 
 
 def get_activation(act_fn: str) -> nn.Module:
@@ -90,6 +93,38 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+class CUDAOptimizedGELU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
+        if not torch.cuda.is_available():
+            raise NotImplementedError(f"{self.__class__.__name__} is implemented only for CUDA devices.")
+        if not DIFFUSERS_ENABLE_HUB_KERNELS:
+            raise RuntimeError(
+                f"{self.__class__.__name__} isn't usable because the `DIFFUSERS_ENABLE_HUB_KERNELS` env var isn't set. Please set it like `export DIFFUSERS_ENABLE_HUB_KERNELS=yes`."
+            )
+        if not is_kernels_available():
+            raise NotImplementedError(
+                f"{self.__class__.__name__} requires the `kernels` library to be installed. Install it with `pip install kernels`."
+            )
+
+        from kernels import get_kernel
+
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        activations = get_kernel(KERNELS_REPO_ID)
+        if approximate == "tanh":
+            self.act = activations.gelu_tanh_and_mul
+        elif approximate == "none":
+            self.act = activations.gelu_and_mul
+        else:
+            raise NotImplementedError
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        out = torch.empty_like(hidden_states)
+        output = self.act(out, hidden_states)
+        return output
+
+
 class GEGLU(nn.Module):
     r"""
     A [variant](https://huggingface.co/papers/2002.05202) of the gated linear unit activation function.
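
For reference, a minimal usage sketch of the new `CUDAOptimizedGELU` layer (not part of the commit). It assumes a CUDA device, `pip install kernels`, and that `DIFFUSERS_ENABLE_HUB_KERNELS` is exported before Python starts; tensor shapes are illustrative.

```python
# Hedged usage sketch for the new CUDAOptimizedGELU; not part of the commit.
# Assumes a CUDA GPU, `pip install kernels`, and `export DIFFUSERS_ENABLE_HUB_KERNELS=yes`
# done before launching Python so the diffusers constant picks it up.
import torch
from diffusers.models.activations import CUDAOptimizedGELU

# Linear projection 512 -> 2048, followed by the fused GELU kernel fetched from the
# kernels-community/activation Hub repo ("tanh" selects gelu_tanh_and_mul).
act = CUDAOptimizedGELU(dim_in=512, dim_out=2048, approximate="tanh").to(device="cuda", dtype=torch.float16)

x = torch.randn(2, 77, 512, device="cuda", dtype=torch.float16)  # illustrative shape
y = act(x)  # forward(): self.proj(x), then the fused activation kernel
```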

src/diffusers/models/attention.py

Lines changed: 0 additions & 2 deletions
@@ -20,7 +20,6 @@
 
 from ..utils import deprecate, logging
 from ..utils.import_utils import is_torch_npu_available, is_torch_xla_available, is_xformers_available
-from ..utils.kernels_utils import use_kernel_forward_from_hub
 from ..utils.torch_utils import maybe_allow_in_graph
 from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, LinearActivation, SwiGLU
 from .attention_processor import Attention, AttentionProcessor, JointAttnProcessor2_0
@@ -1670,7 +1669,6 @@ def forward(
         return hidden_states
 
 
-@use_kernel_forward_from_hub("MLP")
 class FeedForward(nn.Module):
     r"""
     A feed-forward layer.
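
For context on the removal above: the `use_kernel_forward_from_hub("MLP")` decorator tied `FeedForward` to the Hub layer registered under the `"MLP"` key in `kernels_utils.py` (also dropped in this commit, see below). A rough sketch of that decorator-based wiring, with a deliberately simplified `FeedForward` body; it assumes these helpers are exposed at the `kernels` top level and does not show the kernelize step that actually swaps the forward.

```python
# Rough sketch of the decorator-based path this commit removes; not part of the commit.
# Assumes LayerRepository, register_kernel_mapping and use_kernel_forward_from_hub are
# exposed by the `kernels` package; the kernelize step that performs the swap is not shown.
import torch.nn as nn
from kernels import LayerRepository, register_kernel_mapping, use_kernel_forward_from_hub

# Map the "MLP" layer name to a Hub kernel repository for CUDA devices
# (same structure as the _KERNEL_MAPPING entry removed below).
register_kernel_mapping(
    {"MLP": {"cuda": LayerRepository(repo_id="medmekk/triton-llama-mlp", layer_name="TritonLlamaMLP")}}
)

@use_kernel_forward_from_hub("MLP")  # tag the class so its forward can be replaced by the Hub layer
class FeedForward(nn.Module):
    # Simplified stand-in; the real diffusers FeedForward (and the Hub layer's
    # expectations about its attributes) are more involved.
    def __init__(self, dim: int, inner_dim: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU(), nn.Linear(inner_dim, dim))

    def forward(self, hidden_states):
        return self.net(hidden_states)
```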

src/diffusers/utils/kernels_utils.py

Lines changed: 0 additions & 1 deletion
@@ -38,7 +38,6 @@ def _get_fa3_from_hub():
     "RMSNorm": {
         "cuda": LayerRepository(repo_id="kernels-community/liger_kernels", layer_name="LigerRMSNorm"),
     },
-    "MLP": {"cuda": LayerRepository(repo_id="medmekk/triton-llama-mlp", layer_name="TritonLlamaMLP")},
 }
 
 register_kernel_mapping(_KERNEL_MAPPING)
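
With the `"MLP"` entry removed, only the RMSNorm mapping stays registered here; the new GELU path instead fetches its kernel directly with `get_kernel(KERNELS_REPO_ID)`. A hedged sketch of that direct fetch, assuming the repo follows the usual fused `*_and_mul` convention (the input's last dimension is split in half, GELU is applied to the first half and multiplied element-wise by the second); shapes are illustrative.

```python
# Hedged sketch of the direct get_kernel path used by CUDAOptimizedGELU; not part of the commit.
# Assumption: kernels-community/activation follows the fused *_and_mul convention, i.e.
# out = GELU(x[..., :d]) * x[..., d:], where x has last dim 2*d and out has last dim d.
import torch
from kernels import get_kernel

activation = get_kernel("kernels-community/activation")  # downloads/loads the compiled kernels

d = 1024
x = torch.randn(8, 2 * d, device="cuda", dtype=torch.float16)
out = torch.empty(8, d, device="cuda", dtype=torch.float16)
activation.gelu_and_mul(out, x)  # fused GELU + elementwise multiply, written into `out`
```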
