
Commit f9f9fcf

maajidkhann authored and amathewc committed
Enable SVE ACLE implementation for tanH Aten op for FP32 dType. (pytorch#143741)
In deep learning models, the tanh (hyperbolic tangent) function is a widely used activation function, appearing in feedforward networks, recurrent neural networks (RNNs), and many other architectures. It is also common in **Physics-Informed Neural Networks (PINNs)**, a class of machine learning models that solve partial differential equations (PDEs) by incorporating the governing physics directly into the loss function alongside data-driven terms. In PINNs, activations like tanh let the network learn complex mappings between inputs (such as spatial and temporal coordinates) and outputs (such as field variables); a short usage sketch follows this message.

**Operator:** tanh()

**Current implementation in the OSS ATen backend (SVE flow):** uses SVE Sleef when available, otherwise the std implementation.

**With this PR (SVE flow):** uses the SVE ACLE implementation, which is faster.

**Here are the performance improvements (single-core numbers):**

![image](https://github.com/user-attachments/assets/c2f4bcb6-11bc-4af1-b5eb-278a4cc4a69d)

**Metric:** average CPU time per iteration (in ms)

With both gcc and clang, the SVE ACLE implementation shows a significant performance gain over the current OSS implementation (Sleef) and also over NEON.

**Hardware:** m7g.8xlarge (Graviton 3 instance)

**Script used in benchmarking:**

```python
import os
#os.environ["ATEN_CPU_CAPABILITY"] = "default"
os.environ["ATEN_CPU_CAPABILITY"] = "sve256"

import torch
import torch.nn as nn

# Set the random seed for reproducibility
torch.manual_seed(1)

# Create a tensor of shape (8521, 50)
x = torch.randn(8521, 50)

# Warm up
for i in range(10):
    output = x.tanh()

# Perform the tanh operation 1000 times and profile the performance
print("### CPU tanh")
with torch.autograd.profiler.profile(record_shapes=True) as prof:
    for i in range(1000):
        output = x.tanh()

# Print the profiling results sorted by self CPU time
print(prof.key_averages().table(sort_by="self_cpu_time_total"))

# Optionally print the final output (uncomment the following line if needed)
#print(output)
```

Pull Request resolved: pytorch#143741

Approved by: https://github.com/malfet
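For context, here is a minimal sketch of how tanh typically shows up in a PINN-style feedforward network. The module name and layer sizes are illustrative assumptions, not part of this PR; the point is that every hidden layer routes through the tanh kernel this commit accelerates.

```python
import torch
import torch.nn as nn

# Illustrative PINN-style MLP (sizes are arbitrary, not taken from this PR):
# tanh follows every hidden layer, so each forward pass exercises ATen's
# vectorized tanh kernel.
class TinyPINN(nn.Module):
    def __init__(self, in_dim=2, hidden=50, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
            nn.Linear(hidden, out_dim),
        )

    def forward(self, xt):
        # xt holds (x, t) coordinates; the output is the predicted field value.
        return self.net(xt)

model = TinyPINN()
u = model(torch.randn(8, 2))
print(u.shape)  # torch.Size([8, 1])
```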
1 parent fa3b05d commit f9f9fcf

File tree

2 files changed: +90 −1 lines changed

aten/src/ATen/cpu/vec/sve/vec_float.h

Lines changed: 79 additions & 1 deletion
```diff
@@ -85,6 +85,58 @@ template <> class Vectorized<float> {
     }
     return b;
   }
+  // Implementation is picked from https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L105
+  inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const {
+    const auto c1 = svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
+    const auto c2 = svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
+    const auto c3 = svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f
+    const auto c4 = svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f
+    const auto c5 = svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); // x^5: 0x1.0e4020p-7f
+    const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+    const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+    const auto neg_ln2_hi =
+        svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+    const auto neg_ln2_lo =
+        svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+    const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity());
+    const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
+    const auto zero = svdup_n_f32(0.f);
+    const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
+    // Range reduction:
+    //   e^x = 2^n * e^r
+    // where:
+    //   n = floor(x / ln(2))
+    //   r = x - n * ln(2)
+    //
+    // By adding x / ln(2) with 2^23 + 127 (shift):
+    //   * As the FP32 fraction part only has 23 bits, the addition of 2^23 + 127 forces the decimal part
+    //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
+    //     the whole fraction part of z in FP32 format.
+    //     Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
+    //     (i.e. n) because the decimal part has been pushed out and lost.
+    //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
+    //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
+    const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
+    const auto n = svsub_f32_z(pg, z, shift);
+    const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
+    // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
+    // This outperforms longer Taylor series (3-4 tabs) both in terms of accuracy and performance.
+    const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi);
+    const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
+    // Compute the truncated Taylor series of e^r.
+    //   poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
+    const auto r2 = svmul_f32_z(pg, r, r);
+    const auto p1 = svmul_f32_z(pg, c1, r);
+    const auto p23 = svmla_f32_z(pg, c2, c3, r);
+    const auto p45 = svmla_f32_z(pg, c4, c5, r);
+    const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
+    const auto p12345 = svmla_f32_z(pg, p1, p2345, r2);
+    auto poly = svmla_f32_z(pg, scale, p12345, scale);
+    // Handle underflow and overflow.
+    poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly);
+    poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
+    return poly;
+  }
   static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
     if (count == size())
       return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
```
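To make the range-reduction comments above concrete, here is a scalar NumPy re-derivation of the same exp algorithm. This is a hedged sketch: the helper names are mine, the polynomial is evaluated directly rather than with the kernel's fused even/odd scheme (equivalent up to rounding), and the underflow/overflow selects are omitted for brevity.

```python
import numpy as np

def bits_to_f32(u):
    # Reinterpret a 32-bit pattern as an IEEE-754 float32.
    return np.array([u & 0xFFFFFFFF], dtype=np.uint32).view(np.float32)[0]

def f32_to_bits(f):
    # Reinterpret a float32 as its 32-bit pattern.
    return int(np.array([f], dtype=np.float32).view(np.uint32)[0])

def exp_f32_reference(x):
    x = np.float32(x)
    shift = bits_to_f32(0x4B00007F)       # 2^23 + 127
    inv_ln2 = bits_to_f32(0x3FB8AA3B)     # 1 / ln(2)
    neg_ln2_hi = bits_to_f32(0xBF317200)  # high part of -ln(2)
    neg_ln2_lo = bits_to_f32(0xB5BFBE8E)  # low part of -ln(2)
    c = [bits_to_f32(w) for w in
         (0x3F7FFFF6, 0x3EFFFEDB, 0x3E2AAF33, 0x3D2B9F17, 0x3C072010)]

    # Adding shift pushes the fractional part of x / ln(2) out of the FP32
    # mantissa, so z - shift recovers n = round(x / ln(2)).
    z = np.float32(x * inv_ln2 + shift)
    n = np.float32(z - shift)
    # The low mantissa bits of z now hold 127 + n; shifting them into the
    # exponent field yields the float 2^n.
    scale = bits_to_f32(f32_to_bits(z) << 23)

    # r = x - n * ln(2), computed in two steps for accuracy beyond FP32.
    r = np.float32(x + n * neg_ln2_hi)
    r = np.float32(r + n * neg_ln2_lo)

    # Truncated Taylor series of e^r.
    poly = np.float32(1.0) + c[0]*r + c[1]*r**2 + c[2]*r**3 + c[3]*r**4 + c[4]*r**5
    return np.float32(scale * poly)

print(exp_f32_reference(1.0), np.exp(np.float32(1.0)))  # both ≈ 2.7182817
```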
```diff
@@ -333,8 +385,34 @@ template <> class Vectorized<float> {
   Vectorized<float> tan() const {
     return USE_SLEEF(Vectorized<float>(Sleef_tanfx_u10sve(values)),map(std::tan));
   }
+  // Implementation is picked from https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
   Vectorized<float> tanh() const {
-    return USE_SLEEF(Vectorized<float>(Sleef_tanhfx_u10sve(values)),map(std::tanh));
+    // Constants used for the tanh calculation.
+    const svfloat32_t CONST_1 = svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
+    const svfloat32_t CONST_2 = svdup_n_f32(2.f); // Constant 2.0f for the tanh formula (used in exp(2x)).
+    const svfloat32_t CONST_MIN_TANH = svdup_n_f32(-10.f); // Minimum threshold for input values to prevent overflow.
+    const svfloat32_t CONST_MAX_TANH = svdup_n_f32(10.f); // Maximum threshold for input values to prevent overflow.
+
+    // Step 1: Clamp the values within the range [-10, 10] to prevent overflow during exponentiation.
+    // The tanh function approaches ±1 rapidly as the input grows large, so we limit the input range to avoid numerical instability.
+    // svmax_f32_z ensures values are greater than -10, and svmin_f32_z ensures they are less than 10.
+    svfloat32_t x = svmin_f32_z(ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
+
+    // Step 2: Calculate exp(2 * x), where x is the clamped value.
+    // svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of the result.
+    svfloat32_t exp2x = svexp_f32_z(ptrue, svmul_f32_z(ptrue, CONST_2, x));
+
+    // Step 3: Calculate the numerator of the tanh function, which is exp(2x) - 1.
+    svfloat32_t num = svsub_f32_z(ptrue, exp2x, CONST_1);
+
+    // Step 4: Calculate the denominator of the tanh function, which is exp(2x) + 1.
+    svfloat32_t den = svadd_f32_z(ptrue, exp2x, CONST_1);
+
+    // Step 5: Calculate the tanh function as the ratio num / den.
+    svfloat32_t tanh = svdiv_f32_z(ptrue, num, den);
+
+    // Return the calculated tanh values.
+    return tanh;
   }
   Vectorized<float> trunc() const {
     return svrintz_f32_x(ptrue, values);
```
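The tanh hunk above is a direct application of the identity tanh(x) = (e^(2x) − 1) / (e^(2x) + 1). Clamping to [−10, 10] is safe because tanh(±10) already rounds to ±1.0f in FP32. Below is a minimal NumPy mirror of that flow, with np.exp standing in for svexp_f32_z (an illustrative sketch, not the shipped code):

```python
import numpy as np

def tanh_f32_reference(x):
    # Step 1: clamp to [-10, 10] so exp(2x) cannot overflow.
    x = np.float32(np.clip(x, -10.0, 10.0))
    # Step 2: exp(2x); np.exp stands in for the vectorized svexp_f32_z.
    e2x = np.exp(np.float32(2.0) * x)
    # Steps 3-5: (exp(2x) - 1) / (exp(2x) + 1).
    return (e2x - np.float32(1.0)) / (e2x + np.float32(1.0))

for v in (-12.0, -0.5, 0.0, 0.5, 12.0):
    print(v, tanh_f32_reference(v), np.tanh(np.float32(v)))
```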

aten/src/ATen/test/vec_test_all_types.cpp

Lines changed: 11 additions & 0 deletions
```diff
@@ -371,11 +371,22 @@ namespace {
   }
   TYPED_TEST(Hyperbolic, Tanh) {
     using vec = TypeParam;
+    // NOTE: Because SVE uses ACL logic, the precision changes, hence the adjusted tolerance.
+#if defined(CPU_CAPABILITY_SVE)
+    using UVT = UvalueType<vec>;
+    UVT tolerance = getDefaultTolerance<UVT>();
+    test_unary<vec>(
+        NAME_INFO(tanH),
+        RESOLVE_OVERLOAD(std::tanh),
+        [](vec v) { return v.tanh(); },
+        createDefaultUnaryTestCase<vec>(TestSeed(), tolerance));
+#else
     test_unary<vec>(
         NAME_INFO(tanH),
         RESOLVE_OVERLOAD(std::tanh),
         [](vec v) { return v.tanh(); },
         createDefaultUnaryTestCase<vec>(TestSeed()));
+#endif
   }
   TYPED_TEST(Hyperbolic, Sinh) {
     using vec = TypeParam;
```
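Since the ACLE path trades a little precision for speed, the SVE branch of the test passes an explicit tolerance. Here is a rough Python analogue of the property being guarded, where the 1e-6 bound is an illustrative assumption rather than the tolerance the C++ test actually uses:

```python
import torch

# Compare the float32 tanh kernel against a float64 reference and bound the
# worst-case error. The 1e-6 bound is an illustrative assumption; it is not
# the tolerance used by vec_test_all_types.cpp.
torch.manual_seed(0)
x = torch.randn(4096)
err = (x.tanh().double() - x.double().tanh()).abs().max().item()
assert err < 1e-6, f"tanh max abs error too large: {err:.2e}"
print(f"max abs error: {err:.2e}")
```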
