cos, sin and start of a*

andy-thomason · andy-thomason · commit 8db38850572c · 2022-01-25T14:25:34.000Z
diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs
@@ -117,6 +117,16 @@ pub trait StdFloat: Sealed + Sized {
     fn fract(self) -> Self;
 
     fn sin(self) -> Self;
+
+    fn cos(self) -> Self;
+
+    fn tan(self) -> Self;
+
+    fn asin(self) -> Self;
+
+    fn acos(self) -> Self;
+
+    fn atan(self) -> Self;
 }
 
 impl<const N: usize> Sealed for Simd<f32, N> where LaneCount<N>: SupportedLaneCount {}
@@ -135,6 +145,8 @@ where
     }
 
     /// Calculate the sine of the angle
+    /// Note: this is hand-edited from generated scalar code.
+    /// In an ideal world, we would generate this directly by code transformation.
     #[inline]
     fn sin(self) -> Self {
         #[allow(non_snake_case)]
@@ -150,6 +162,93 @@ where
             .mul_add(x * x, Self::splat(6.28318452581127506328))
             * x
     }
+
+    /// Calculate the cosine of the angle
+    fn cos(self) -> Self {
+        // 0.15915494 ~ 1/(2*pi): count whole turns, then keep the offset from the nearest one.
+        let turns = self * Self::splat(0.15915494);
+        let frac = turns - turns.round();
+        let frac_sq = frac * frac;
+        Self::splat(6.52865816174499269880)
+            .mul_add(frac_sq, Self::splat(-25.97327546890330396608))
+            .mul_add(frac_sq, Self::splat(60.17118230812820383560))
+            .mul_add(frac_sq, Self::splat(-85.45091743827674607508))
+            .mul_add(frac_sq, Self::splat(64.93918704099473042873))
+            .mul_add(frac_sq, Self::splat(-19.73920667935656472596))
+            .mul_add(frac_sq, Self::splat(1.00000000000000000000))
+    }
+
+    /// Calculate the tangent of the angle
+    fn tan(self) -> Self {
+        let half_turns = self * Self::splat(1.0 / core::f32::consts::PI);
+        let frac = half_turns - half_turns.round(); // offset from the nearest integer, in units of pi
+        let frac_sq = frac * frac;
+        Self::splat(0.01439730036301634345)
+            .mul_add(frac_sq, Self::splat(0.02101734538976238579))
+            .mul_add(frac_sq, Self::splat(0.05285888255895108345))
+            .mul_add(frac_sq, Self::splat(0.13475448281475060771))
+            .mul_add(frac_sq, Self::splat(0.55773663386075044866))
+            .mul_add(frac_sq, Self::splat(-0.78539816491781455948))
+            * frac
+            * (frac_sq - Self::splat(0.25)).recip() // divisor vanishes at frac = +/-0.5
+    }
+    
+    /// Calculate the arcsine of each lane
+    fn asin(self) -> Self {
+        use core::f32::consts::PI;
+        let inner = (self * self).lanes_lt(Self::splat(0.9) * Self::splat(0.9));
+        let t = inner.select(self, (Self::splat(1.0) - self * self).sqrt());
+        let poly = Self::splat(4374.97702992533695457424)
+            .mul_add(t * t, Self::splat(-13781.55764426881951685974))
+            .mul_add(t * t, Self::splat(17105.69475701115952774357))
+            .mul_add(t * t, Self::splat(-10486.64894150265898388567))
+            .mul_add(t * t, Self::splat(3231.76028705607279348342))
+            .mul_add(t * t, Self::splat(-447.56480696327035255708))
+            .mul_add(t * t, Self::splat(21.78206149264184872939))
+            .mul_add(t * t, Self::splat(0.84158415752395745675))
+            * t;
+        let offset = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-PI / 2.0), Self::splat(PI / 2.0));
+        inner.select(poly, offset - poly * self.lanes_lt(Self::splat(0.0)).select(Self::splat(-1.0), Self::splat(1.0)))
+    }
+    
+    /// Calculate the arccosine of each lane
+    fn acos(self) -> Self {
+        use core::f32::consts::PI;
+        let lim: Self = Self::splat(0.9);
+        let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(PI), Self::splat(0.0));
+        let s: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(1.0), Self::splat(-1.0));
+        let x: Self = (self * self).lanes_lt(lim * lim).select(self, (Self::splat(1.0) - self * self).sqrt());
+        let y: Self = Self::splat(4374.97702992533695457424)
+            .mul_add(x * x, Self::splat(-13781.55764426881951685974))
+            .mul_add(x * x, Self::splat(17105.69475701115952774357))
+            .mul_add(x * x, Self::splat(-10486.64894150265898388567))
+            .mul_add(x * x, Self::splat(3231.76028705607279348342))
+            .mul_add(x * x, Self::splat(-447.56480696327035255708))
+            .mul_add(x * x, Self::splat(21.78206149264184872939))
+            .mul_add(x * x, Self::splat(0.84158415752395745675))
+            * x;
+        // `y` is the same odd polynomial `asin` returns directly (it is zero at x = 0), so on the
+        // inner range the arccosine is PI/2 - y; returning `y` as-is would give acos(0) == 0.
+        (self * self).lanes_lt(lim * lim).select(Self::splat(PI / 2.0) - y, c - y * s)
+    }
+    
+    /// Calculate the arctangent of each lane
+    fn atan(self) -> Self {
+        use core::f32::consts::PI;
+        let inside_unit = self.abs().lanes_lt(Self::splat(1.0));
+        let t = inside_unit.select(self, self.recip()); // outside (-1, 1) the reciprocal is used
+        let offset = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-PI / 2.0), Self::splat(PI / 2.0));
+        let poly = Self::splat(95.70126383842530559360)
+            .mul_add(t * t, Self::splat(424.99907022806059540464))
+            .mul_add(t * t, Self::splat(-767.48259680040570156003))
+            .mul_add(t * t, Self::splat(714.51953012224223415829))
+            .mul_add(t * t, Self::splat(-354.32654395426962592865))
+            .mul_add(t * t, Self::splat(83.96179897148539189638))
+            .mul_add(t * t, Self::splat(-6.23958170715441509270))
+            .mul_add(t * t, Self::splat(1.05498514186427524914))
+            * t;
+        inside_unit.select(poly, offset - poly)
+    }
 }
 
 impl<const N: usize> StdFloat for Simd<f64, N>
@@ -167,6 +266,31 @@ where
     fn sin(self) -> Self {
         self
     }
+
+    #[inline]
+    fn cos(self) -> Self {
+        self // NOTE(review): placeholder; returns the input unchanged, no cosine is computed
+    }
+
+    #[inline]
+    fn tan(self) -> Self {
+        self // NOTE(review): placeholder; returns the input unchanged, no tangent is computed
+    }
+
+    #[inline]
+    fn asin(self) -> Self {
+        self // NOTE(review): placeholder; returns the input unchanged, no arcsine is computed
+    }
+
+    #[inline]
+    fn acos(self) -> Self {
+        self // NOTE(review): placeholder; returns the input unchanged, no arccosine is computed
+    }
+
+    #[inline]
+    fn atan(self) -> Self {
+        self // NOTE(review): placeholder; returns the input unchanged, no arctangent is computed
+    }
 }
 
 #[cfg(test)]
@@ -188,6 +312,8 @@ mod tests {
         let _ = x.sin();
     }
 
+    const NUM_ITER: usize = 0x10000; // step count: test_range! uses (max - min) / NUM_ITER as its increment
+
     macro_rules! test_range {
         (
                 min: $min: expr,
@@ -198,7 +324,6 @@ mod tests {
                 scalar_type: $scalar_type: ty,
                 vector_type: $vector_type: ty,
             ) => {{
-            const NUM_ITER: usize = 0x10000;
             let limit = <$vector_type>::splat($limit);
             let b = (($max) - ($min)) * (1.0 / NUM_ITER as $scalar_type);
             let a = $min;
@@ -213,49 +338,129 @@ mod tests {
                     (fi + 3.0) * b + a,
                 ]);
                 let yref = <$vector_type>::from_array([sf(x[0]), sf(x[1]), sf(x[2]), sf(x[3])]);
-                assert!(((vf(x) - yref).abs().lanes_le(limit)).all());
+                let y = vf(x);
+                let e = (y - yref);
+                if !(e.abs().lanes_le(limit)).all() {
+                    panic!("\nx     ={:20.16?}\ne     ={:20.16?}\nlimit ={:20.16?}\nvector={:20.16?}\nscalar={:20.16?}\nvector_fn={}", x, e, limit, y, yref, stringify!($vector_fn));
+                }
             }
         }};
     }
 
     #[test]
     fn sin_f32() {
         use core::f32::consts::PI;
-        let ulp = (2.0_f32).powi(-23);
+        let one_ulp = (2.0_f32).powi(-23);
 
-        // In the range +/- pi/4 the input has 1 ulp of error.
         test_range!(
             min: -PI/4.0,
             max: PI/4.0,
-            limit: ulp * 1.0,
+            limit: one_ulp * 1.0,
             scalar_fn: |x : f32| x.sin(),
             vector_fn: |x : f32x4| x.sin(),
             scalar_type: f32,
             vector_type: f32x4,
         );
 
-        // In the range +/- pi/2 the input and output has 2 ulp of error.
         test_range!(
             min: -PI/2.0,
             max: PI/2.0,
-            limit: ulp * 2.0,
+            limit: one_ulp * 2.0,
             scalar_fn: |x : f32| x.sin(),
             vector_fn: |x : f32x4| x.sin(),
             scalar_type: f32,
             vector_type: f32x4,
         );
 
-        // In the range +/- pi the input has 2 ulp of error and the output has 5.
-        // Note that the scalar sin also has this error but the implementation
-        // is different.
         test_range!(
             min: -PI,
             max: PI,
-            limit: ulp * 5.0,
+            limit: one_ulp * 8.0,
             scalar_fn: |x : f32| x.sin(),
             vector_fn: |x : f32x4| x.sin(),
             scalar_type: f32,
             vector_type: f32x4,
         );
     }
+
+    #[test]
+    fn cos_f32() { // compares the vector cos against scalar f32::cos over three ranges
+        use core::f32::consts::PI;
+        let one_ulp = (2.0_f32).powi(-23); // 2^-23; every limit below is an absolute error bound
+
+        // Range +/- pi/4: vector and scalar cos must agree to within 1 * one_ulp.
+        test_range!(
+            min: -PI/4.0,
+            max: PI/4.0,
+            limit: one_ulp * 1.0,
+            scalar_fn: |x : f32| x.cos(),
+            vector_fn: |x : f32x4| x.cos(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+
+        // Range +/- pi/2: the allowed difference is doubled to 2 * one_ulp.
+        test_range!(
+            min: -PI/2.0,
+            max: PI/2.0,
+            limit: one_ulp * 2.0,
+            scalar_fn: |x : f32| x.cos(),
+            vector_fn: |x : f32x4| x.cos(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+
+        // Range +/- pi: the allowed difference is widened to 8 * one_ulp.
+        // Note that the scalar cos used as the reference is not exact here
+        // either; it is simply a different implementation.
+        test_range!(
+            min: -PI,
+            max: PI,
+            limit: one_ulp * 8.0,
+            scalar_fn: |x : f32| x.cos(),
+            vector_fn: |x : f32x4| x.cos(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+    }
+
+    #[test]
+    fn tan_f32() { // compares the vector tan against scalar f32::tan over three ranges
+        use core::f32::consts::PI;
+        let one_ulp = (2.0_f32).powi(-23); // 2^-23; every limit below is an absolute error bound
+
+        // On the outer ranges the reciprocals (1 / tan) are compared, not tan itself.
+        // Note that the vector function correctly gets -inf for -PI/2
+        // but the scalar function does not.
+        test_range!(
+            min: -PI/2.0 + 0.00001,
+            max: -PI/4.0,
+            limit: one_ulp * 3.0,
+            scalar_fn: |x : f32| x.tan().recip(),
+            vector_fn: |x : f32x4| x.tan().recip(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+
+        // On the central range tan itself is compared, with an absolute limit.
+        test_range!(
+            min: -PI/4.0,
+            max: PI/4.0,
+            limit: one_ulp * 2.0,
+            scalar_fn: |x : f32| x.tan(),
+            vector_fn: |x : f32x4| x.tan(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+
+        test_range!(
+            min: PI/4.0,
+            max: PI/2.0 - 0.00001,
+            limit: one_ulp * 3.0,
+            scalar_fn: |x : f32| x.tan().recip(),
+            vector_fn: |x : f32x4| x.tan().recip(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+    }
 }