Implement tanh

SuperFluffy · gnzlbg · commit 5719e4b6c15a · 2019-03-16T12:55:10.000+01:00
This implements tanh for packed vectors. This is primarily interesting when using sleef-sys for its SIMD implementations of tanh. Since LLVM does not contain tanh intrinsics, the libm implementation is used for primitives, and packed vectors are transmuted into arrays before applying the libm tanh to each of their elements.
diff --git a/Cargo.toml b/Cargo.toml
@@ -23,6 +23,7 @@ maintenance = { status = "experimental" }
 [dependencies]
 cfg-if = "^0.1.6"
 core_arch = { version = "^0.1.4", optional = true }
+libm = "0.1.2"
 
 [features]
 default = []
@@ -39,4 +40,4 @@ optional = true
 
 [target.wasm32-unknown-unknown.dev-dependencies]
 wasm-bindgen = "=0.2.19"
-wasm-bindgen-test = "=0.2.19"
+wasm-bindgen-test = "=0.2.19"
diff --git a/src/api.rs b/src/api.rs
@@ -213,6 +213,7 @@ macro_rules! impl_f {
         impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_sqrt!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_math_float_sqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
+        impl_math_float_tanh!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
         impl_cmp_vertical!(
             [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1., 0.)
                 | $test_tt
diff --git a/src/api/math/float.rs b/src/api/math/float.rs
@@ -39,6 +39,9 @@ mod sqrt;
 #[macro_use]
 mod sqrte;
 
+#[macro_use]
+mod tanh;
+
 macro_rules! impl_float_category {
     ([$elem_ty:ident; $elem_count:expr]: $id:ident, $mask_ty:ident) => {
         impl $id {
diff --git a/src/api/math/float/tanh.rs b/src/api/math/float/tanh.rs
@@ -0,0 +1,29 @@
+//! Implements vertical (lane-wise) floating-point `tanh`.
+
+macro_rules! impl_math_float_tanh {
+    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {
+        impl $id {
+            /// Returns the hyperbolic tangent of each lane of `self`.
+            #[inline]
+            pub fn tanh(self) -> Self {
+                use crate::codegen::math::float::tanh::Tanh;
+                Tanh::tanh(self)
+            }
+        }
+
+        test_if!{
+            $test_tt:
+            paste::item! {
+                pub mod [<$id _math_tanh>] {
+                    use super::*;
+                    #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+                    fn tanh() {
+                        let z = $id::splat(0 as $elem_ty);
+                        // tanh(0) == 0, so an all-zero vector must map to itself.
+                        assert_eq!(z, z.tanh());
+                    }
+                }
+            }
+        }
+    };
+}
diff --git a/src/codegen/math/float.rs b/src/codegen/math/float.rs
@@ -16,3 +16,4 @@ crate mod sin_cos_pi;
 crate mod sin_pi;
 crate mod sqrt;
 crate mod sqrte;
+crate mod tanh;
diff --git a/src/codegen/math/float/tanh.rs b/src/codegen/math/float/tanh.rs
@@ -0,0 +1,117 @@
+//! Vertical floating-point `tanh`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors tanh
+
+use crate::*;
+
+crate trait Tanh { // crate-internal dispatch trait for the vertical (lane-wise) `tanh`
+    fn tanh(self) -> Self;
+}
+
+macro_rules! define_tanh {
+    // Defines `fn $name($simdtype) -> $simdtype` applying the scalar `tanh` of `$trait` to each of the `$lanes` elements.
+    ($name:ident, $basetype:ty, $simdtype:ty, $lanes:expr, $trait:path) => {
+        fn $name(x: $simdtype) -> $simdtype {
+            use core::intrinsics::transmute;
+            let mut buf: [$basetype; $lanes] = unsafe { transmute(x) }; // size-checked at compile time; NOTE(review): assumes vector lanes are laid out like the array - confirm
+            for elem in &mut buf {
+                *elem = <$basetype as $trait>::tanh(*elem);
+            }
+            unsafe { transmute(buf) } // inverse of the conversion above
+        }
+    };
+    // Convenience arms: pick the `libm` extension trait that matches the element type.
+    (f32 => $name:ident, $type:ty, $lanes:expr) => {
+        define_tanh!($name, f32, $type, $lanes, libm::F32Ext);
+    };
+
+    (f64 => $name:ident, $type:ty, $lanes:expr) => {
+        define_tanh!($name, f64, $type, $lanes, libm::F64Ext);
+    };
+}
+
+// LLVM does not seem to expose intrinsics for the hyperbolic functions, so the
+// fallbacks defined here apply the scalar `libm` implementation to every lane.
+define_tanh!(f32 => tanh_v2f32, f32x2, 2);
+define_tanh!(f32 => tanh_v4f32, f32x4, 4);
+define_tanh!(f32 => tanh_v8f32, f32x8, 8);
+define_tanh!(f32 => tanh_v16f32, f32x16, 16);
+
+define_tanh!(f64 => tanh_v2f64, f64x2, 2);
+define_tanh!(f64 => tanh_v4f64, f64x4, 4);
+define_tanh!(f64 => tanh_v8f64, f64x8, 8);
+
+fn tanh_f32(x: f32) -> f32 {
+    libm::F32Ext::tanh(x) // scalar `f32` tanh provided by the `libm` crate
+}
+
+fn tanh_f64(x: f64) -> f64 {
+    libm::F64Ext::tanh(x) // scalar `f64` tanh provided by the `libm` crate
+}
+
+gen_unary_impl_table!(Tanh, tanh); // NOTE(review): presumably defines the `impl_unary!` helper used below - confirm
+
+cfg_if! {
+    if #[cfg(target_arch = "s390x")] {
+        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+        impl_unary!(f32x2[f32; 2]: tanh_f32);
+        impl_unary!(f32x4[f32; 4]: tanh_f32);
+        impl_unary!(f32x8[f32; 8]: tanh_f32);
+        impl_unary!(f32x16[f32; 16]: tanh_f32);
+
+        impl_unary!(f64x2[f64; 2]: tanh_f64);
+        impl_unary!(f64x4[f64; 4]: tanh_f64);
+        impl_unary!(f64x8[f64; 8]: tanh_f64);
+    } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+        use sleef_sys::*;
+        cfg_if! { // choose the Sleef kernels by enabled target feature: avx2, then avx, then sse4.2
+            if #[cfg(target_feature = "avx2")] {
+                impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10avx2128);
+                impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx2);
+                impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx2);
+
+                impl_unary!(f32x4: Sleef_tanhf4_u10avx2128);
+                impl_unary!(f32x8: Sleef_tanhf8_u10avx2);
+                impl_unary!(f64x2: Sleef_tanhd2_u10avx2128);
+                impl_unary!(f64x4: Sleef_tanhd4_u10avx2);
+            } else if #[cfg(target_feature = "avx")] {
+                impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4);
+                impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx);
+                impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx);
+
+                impl_unary!(f32x4: Sleef_tanhf4_u10sse4);
+                impl_unary!(f32x8: Sleef_tanhf8_u10avx);
+                impl_unary!(f64x2: Sleef_tanhd2_u10sse4);
+                impl_unary!(f64x4: Sleef_tanhd4_u10avx);
+            } else if #[cfg(target_feature = "sse4.2")] {
+                impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4);
+                impl_unary!(f32x16[q => f32x4]: Sleef_tanhf4_u10sse4);
+                impl_unary!(f64x8[q => f64x2]: Sleef_tanhd2_u10sse4);
+
+                impl_unary!(f32x4: Sleef_tanhf4_u10sse4);
+                impl_unary!(f32x8[h => f32x4]: Sleef_tanhf4_u10sse4);
+                impl_unary!(f64x2: Sleef_tanhd2_u10sse4);
+                impl_unary!(f64x4[h => f64x2]: Sleef_tanhd2_u10sse4);
+            } else { // none of avx2 / avx / sse4.2 enabled: use the libm-based fallbacks
+                impl_unary!(f32x2[f32; 2]: tanh_f32);
+                impl_unary!(f32x16: tanh_v16f32);
+                impl_unary!(f64x8: tanh_v8f64);
+
+                impl_unary!(f32x4: tanh_v4f32);
+                impl_unary!(f32x8: tanh_v8f32);
+                impl_unary!(f64x2: tanh_v2f64);
+                impl_unary!(f64x4: tanh_v4f64);
+            }
+        }
+    } else { // all remaining configurations: use the libm-based fallbacks
+        impl_unary!(f32x2[f32; 2]: tanh_f32);
+        impl_unary!(f32x4: tanh_v4f32);
+        impl_unary!(f32x8: tanh_v8f32);
+        impl_unary!(f32x16: tanh_v16f32);
+
+        impl_unary!(f64x2: tanh_v2f64);
+        impl_unary!(f64x4: tanh_v4f64);
+        impl_unary!(f64x8: tanh_v8f64);
+    }
+}