
Commit 314ca22

Starrysea996 authored and maxiaolong001 committed
[API compatibility] change gelu api for paddle (PaddlePaddle#74485)
* change gelu api
* simple gelu function and class
* replace str with literal
1 parent 0d490f7 commit 314ca22

File tree

3 files changed: +172 -37 lines changed

python/paddle/nn/functional/activation.py
python/paddle/nn/layer/activation.py
test/legacy_test/test_gelu_op.py
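In short, paddle.nn.functional.gelu and paddle.nn.GELU keep accepting the boolean approximate flag but additionally take the literal strings "tanh" and "none", which are folded onto the same flag. A minimal usage sketch (illustrative only; assumes a Paddle build that includes this commit):

# Illustrative sketch of the new call forms added by this commit.
import numpy as np
import paddle
import paddle.nn.functional as F

x = paddle.to_tensor([[-1.0, 0.5], [1.0, 1.5]])

# Pre-existing boolean form.
out_bool = F.gelu(x, approximate=True)

# New string form; "tanh" maps to True and "none" maps to False.
out_str = F.gelu(x, approximate="tanh")

# Both spellings select the same tanh approximation, so the outputs match.
np.testing.assert_allclose(out_bool.numpy(), out_str.numpy())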

python/paddle/nn/functional/activation.py

Lines changed: 23 additions & 4 deletions
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import paddle
 from paddle import _C_ops, in_dynamic_mode
@@ -150,14 +150,18 @@ def elu_(x: Tensor, alpha: float = 1.0, name: str | None = None) -> Tensor:
 
 
 def gelu(
-    x: Tensor, approximate: bool = False, name: str | None = None
+    x: Tensor,
+    approximate: Literal["tanh", "none"] | bool = False,
+    name: str | None = None,
 ) -> Tensor:
     r"""
     gelu activation.
 
     The activation function of Gelu is calculated element by element. More information refers to :ref: `Gaussian Error Linear Units`.
 
-    if approximate is True
+    approximate parameter must be True, False, "tanh", "none".
+
+    if approximate is True or "tanh"
 
     .. math::
 
@@ -171,7 +175,7 @@ def gelu(
 
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        approximate (bool, optional): Whether to enable approximation. Default is False.
+        approximate (str|bool, optional): Whether to enable approximation. Default is False.
         name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
@@ -194,8 +198,23 @@ def gelu(
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[-0.15880796,  0.34571400],
            [ 0.84119201,  1.39957154]])
+            >>> out3 = F.gelu(x, "none")
+            >>> print(out3)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15865529,  0.34573123],
+            [ 0.84134471,  1.39978933]])
+            >>> out4 = F.gelu(x, "tanh")
+            >>> print(out4)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15880796,  0.34571400],
+            [ 0.84119201,  1.39957154]])
     """
 
+    if approximate == "tanh":
+        approximate = True
+    elif approximate == "none":
+        approximate = False
+
     if in_dynamic_or_pir_mode():
         return _C_ops.gelu(x, approximate)
     else:
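The functional change is essentially the normalization visible above: the string spellings are mapped onto the existing boolean flag before the op is dispatched. A standalone sketch of that mapping (the helper name here is hypothetical; the patch inlines this logic inside F.gelu):

# Hypothetical helper mirroring the inline normalization added to F.gelu.
def _normalize_approximate(approximate):
    if approximate == "tanh":
        return True
    if approximate == "none":
        return False
    # Booleans pass through unchanged; any other value reaches the underlying
    # op unmodified and is expected to be rejected there (the new tests assert
    # a TypeError for inputs such as "tan" or 1234).
    return approximate

assert _normalize_approximate("tanh") is True
assert _normalize_approximate("none") is False
assert _normalize_approximate(False) is False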

python/paddle/nn/layer/activation.py

Lines changed: 26 additions & 4 deletions
@@ -15,7 +15,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from paddle.framework import get_default_dtype
 
@@ -176,7 +176,9 @@ class GELU(Layer):
     r"""
     GELU Activation.
 
-    If approximate is True
+    approximate parameter must be True, False, "tanh", "none".
+
+    If approximate is True or "tanh"
 
     .. math::
 
@@ -189,7 +191,7 @@ class GELU(Layer):
        GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}}))
 
     Parameters:
-        approximate (bool, optional): Whether to enable approximation. Default is False.
+        approximate (str|bool, optional): Whether to enable approximation. Default is False.
         name (str|None, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.
 
@@ -208,6 +210,24 @@ class GELU(Layer):
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[-0.15865529,  0.34573123],
            [ 0.84134471,  1.39978933]])
+            >>> m = paddle.nn.GELU(False)
+            >>> out = m(x)
+            >>> print(out)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15865529,  0.34573123],
+            [ 0.84134471,  1.39978933]])
+            >>> m = paddle.nn.GELU("none")
+            >>> out = m(x)
+            >>> print(out)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15865529,  0.34573123],
+            [ 0.84134471,  1.39978933]])
+            >>> m = paddle.nn.GELU("tanh")
+            >>> out = m(x)
+            >>> print(out)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15880796,  0.34571400],
+            [ 0.84119201,  1.39957154]])
            >>> m = paddle.nn.GELU(True)
            >>> out = m(x)
            >>> print(out)
@@ -217,7 +237,9 @@ class GELU(Layer):
     """
 
     def __init__(
-        self, approximate: bool = False, name: str | None = None
+        self,
+        approximate: Literal["tanh", "none"] | bool = False,
+        name: str | None = None,
     ) -> None:
         super().__init__()
         self._approximate = approximate
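A short layer-level sketch matching the docstring examples added above (illustrative; "none" reproduces the default erf-based GELU, while "tanh" reproduces GELU(True)):

import paddle

x = paddle.to_tensor([[-1.0, 0.5], [1.0, 1.5]])

exact = paddle.nn.GELU("none")    # same result as GELU() / GELU(False)
approx = paddle.nn.GELU("tanh")   # same result as GELU(True)

print(exact(x))
print(approx(x))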

test/legacy_test/test_gelu_op.py

Lines changed: 123 additions & 29 deletions
@@ -20,10 +20,14 @@
 import paddle
 import paddle.base.dygraph as dg
 import paddle.nn.functional as F
-from paddle import base
+from paddle import base, nn
 
 
 def gelu(x, approximate):
+    if approximate == "tanh":
+        approximate = True
+    if approximate == "none":
+        approximate = False
     if approximate:
         y_ref = (
             0.5
@@ -46,9 +50,14 @@ def _test_case1_cpu(self, approximate):
         place = base.CPUPlace()
         with dg.guard(place) as g:
             x_var = paddle.to_tensor(x)
-            y_var = F.gelu(x_var, approximate)
-            y_test = y_var.numpy()
-        np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
+            y_var1 = F.gelu(x_var, approximate)
+            y_test1 = y_var1.numpy()
+
+            func = nn.GELU(approximate)
+            y_var2 = func(x_var)
+            y_test2 = y_var2.numpy()
+        np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
+        np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)
 
     def _test_case1_gpu(self, approximate):
         x = np.random.uniform(-1, 1, size=(11, 17)).astype(np.float32)
@@ -57,12 +66,17 @@ def _test_case1_gpu(self, approximate):
         place = base.CUDAPlace(0)
         with dg.guard(place) as g:
             x_var = paddle.to_tensor(x)
-            y_var = F.gelu(x_var, approximate)
-            y_test = y_var.numpy()
-        np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
+            y_var1 = F.gelu(x_var, approximate)
+            y_test1 = y_var1.numpy()
+
+            func = nn.GELU(approximate)
+            y_var2 = func(x_var)
+            y_test2 = y_var2.numpy()
+        np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
+        np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)
 
     def test_cases(self):
-        for approximate in [True, False]:
+        for approximate in [True, False, "none", "tanh"]:
             self._test_case1_cpu(approximate)
             if base.is_compiled_with_cuda():
                 self._test_case1_gpu(approximate)
@@ -86,15 +100,36 @@ def run_gelu_op(approximate):
                 x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
             return y.numpy(), x_grad.numpy()
 
+        def run_gelu_class(approximate):
+            with dg.guard():
+                x = paddle.to_tensor(x_np)
+                x.stop_gradient = False
+                func = nn.GELU(approximate=approximate)
+                y = func(x)
+                x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
+            return y.numpy(), x_grad.numpy()
+
         use_fast_math(True)
-        y_fast_math, x_g_fast_math = run_gelu_op(True)
+        y_fast_math1, x_g_fast_math1 = run_gelu_op(True)
+        y_fast_math2, x_g_fast_math2 = run_gelu_class(True)
        use_fast_math(False)
 
-        y_ref, x_g_ref = run_gelu_op(True)
-        np.testing.assert_allclose(y_ref, y_fast_math, rtol=1e-05, atol=0.0005)
+        y_ref1, x_g_ref1 = run_gelu_op(True)
+        y_ref2, x_g_ref2 = run_gelu_class(True)
+        np.testing.assert_allclose(
+            y_ref1, y_fast_math1, rtol=1e-05, atol=0.0005
+        )
+
+        np.testing.assert_allclose(
+            x_g_ref1, x_g_fast_math1, rtol=1e-05, atol=0.0005
+        )
+
+        np.testing.assert_allclose(
+            y_ref2, y_fast_math2, rtol=1e-05, atol=0.0005
+        )
 
         np.testing.assert_allclose(
-            x_g_ref, x_g_fast_math, rtol=1e-05, atol=0.0005
+            x_g_ref2, x_g_fast_math2, rtol=1e-05, atol=0.0005
         )
 
 
@@ -105,38 +140,97 @@ def _test_case1_cpu(self, approximate):
 
         place = base.CPUPlace()
         with dg.guard(place) as g:
-            x_var = paddle.to_tensor(x)
-            x_var.stop_gradient = False
-            y_var = F.gelu(x_var, approximate)
-            y_test = y_var.numpy()
+            x_var1 = paddle.to_tensor(x)
+            x_var2 = paddle.to_tensor(x)
+
+            x_var1.stop_gradient = False
+            x_var2.stop_gradient = False
+
+            y_var1 = F.gelu(x_var1, approximate)
+            y_test1 = y_var1.numpy()
+
+            func = nn.GELU(approximate)
+            y_var2 = func(x_var2)
+            y_test2 = y_var2.numpy()
 
-            loss = paddle.sum(y_var)
-            loss.backward()
-        np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
-        np.testing.assert_allclose(x_var.grad.shape, x_var.shape)
+            loss1 = paddle.sum(y_var1)
+            loss1.backward()
+
+            loss2 = paddle.sum(y_var2)
+            loss2.backward()
+        np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
+        np.testing.assert_allclose(x_var1.grad.shape, x_var1.shape)
+
+        np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)
+        np.testing.assert_allclose(x_var2.grad.shape, x_var2.shape)
 
     def _test_case1_gpu(self, approximate):
         x = np.random.uniform(-1, 1, size=(0, 17)).astype(np.float32)
         y_ref = gelu(x, approximate)
 
         place = base.CUDAPlace(0)
         with dg.guard(place) as g:
-            x_var = paddle.to_tensor(x)
-            x_var.stop_gradient = False
-            y_var = F.gelu(x_var, approximate)
-            y_test = y_var.numpy()
+            x_var1 = paddle.to_tensor(x)
+            x_var2 = paddle.to_tensor(x)
+
+            x_var1.stop_gradient = False
+            x_var2.stop_gradient = False
+
+            y_var1 = F.gelu(x_var1, approximate)
+            y_test1 = y_var1.numpy()
 
-            loss = paddle.sum(y_var)
-            loss.backward()
-        np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
-        np.testing.assert_allclose(x_var.grad.shape, x_var.shape)
+            func = nn.GELU(approximate)
+            y_var2 = func(x_var2)
+            y_test2 = y_var2.numpy()
+
+            loss1 = paddle.sum(y_var1)
+            loss1.backward()
+
+            loss2 = paddle.sum(y_var2)
+            loss2.backward()
+        np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
+        np.testing.assert_allclose(x_var1.grad.shape, x_var1.shape)
+
+        np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)
+        np.testing.assert_allclose(x_var2.grad.shape, x_var2.shape)
 
     def test_cases(self):
-        for approximate in [True, False]:
+        for approximate in [True, False, "none", "tanh"]:
             self._test_case1_cpu(approximate)
             if base.is_compiled_with_cuda():
                 self._test_case1_gpu(approximate)
 
 
+class TestGeluError(unittest.TestCase):
+
+    def setUp(self):
+        x = np.random.uniform(-1, 1, size=(11, 17)).astype(np.float32)
+        self.x = paddle.to_tensor(x)
+
+    def test_gelu_op_error(self):
+
+        def test_type_error1():
+            y = F.gelu(self.x, "tan")
+
+        def test_type_error2():
+            y = F.gelu(self.x, 1234)
+
+        self.assertRaises(TypeError, test_type_error1)
+        self.assertRaises(TypeError, test_type_error2)
+
+    def test_gelu_class_error(self):
+
+        def test_type_error1():
+            func = nn.GELU("tan")
+            y = func(self.x)
+
+        def test_type_error2():
+            func = nn.GELU(1234)
+            y = func(self.x)
+
+        self.assertRaises(TypeError, test_type_error1)
+        self.assertRaises(TypeError, test_type_error2)
+
+
 if __name__ == '__main__':
     unittest.main()
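For reference, a NumPy sketch of what the test's gelu helper computes for each spelling, written from the formulas in the docstrings above; the scipy.special.erf import is an assumption for the exact branch, since the helper's erf-based branch is only partly visible in this diff:

import numpy as np
from scipy.special import erf  # assumption: erf backs the exact (non-approximate) branch

def gelu_reference(x, approximate=False):
    # Fold the new string spellings onto the boolean flag, as the patch does.
    if approximate == "tanh":
        approximate = True
    if approximate == "none":
        approximate = False
    if approximate:
        # tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))
    # exact form: 0.5 * x * (1 + erf(x / sqrt(2)))
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

x = np.array([[-1.0, 0.5], [1.0, 1.5]], dtype=np.float32)
print(gelu_reference(x, "tanh"))
print(gelu_reference(x, "none"))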
