Optimized sinf and cosf and ensured the result lies within [-1, +1]. Optimized atan and atan2.

ZERICO2005 · ZERICO2005 · commit 184b2a92a54c · 2025-04-01T21:08:51.000-06:00
math opt 3, sin, cos, atan, atan2

part 2
diff --git a/src/libc/atan2f.c b/src/libc/atan2f.c
@@ -2,26 +2,18 @@
 #include <math.h>
 #include "__float32_constants.h"
 
-float _atan2f_c(float arg1, float arg2) {
-    float satan(float);
-
-    if ((arg1+arg2)==arg1) {
-        if (arg1 >= 0.0f) {
-            return (F32_PI2);
-        } else {
-            return (-F32_PI2);
-        }
-    } else if (arg2 < 0.0f) {
-        if(arg1 >= 0.0f) {
-            return (F32_PI - satan(-arg1/arg2));
-        } else {
-            return (-F32_PI + satan(arg1/arg2));
-        }
-    } else if (arg1 > 0.0f) {
-        return (satan(arg1/arg2));
-    } else {
-        return (-satan(-arg1/arg2));
+static float _positive_atan2f(float y, float x) { /* atan2f for y >= 0; the caller passes fabsf(y) and reapplies the sign */
+    float _f32_satan(float); /* local prototype; the definition lives in atanf.c */
+    if ((y+x)==y) { /* x is zero or too small to change y; note this also catches y == 0 with x == 0 */
+        return F32_PI2;
+    } else if (signbit(x)) { /* negative x: the angle lies beyond pi/2 */
+        return F32_PI - _f32_satan(-y / x); /* for y >= 0, -y / x is non-negative, as _f32_satan expects */
     }
+    return _f32_satan(y / x); /* remaining case: signbit(x) is clear */
+}
+
+float _atan2f_c(float y, float x) { /* atan2f(y, x): evaluate for |y|, then restore the sign of y */
+    return copysignf(_positive_atan2f(fabsf(y), x), y); /* the result always carries the sign bit of y */
 }
 
 double _atan2_c(double, double) __attribute__((alias("_atan2f_c")));
diff --git a/src/libc/atan2l.c b/src/libc/atan2l.c
@@ -2,24 +2,16 @@
 #include <math.h>
 #include "__float64_constants.h"
 
-long double atan2l(long double arg1, long double arg2) {
-    long double f64_satan(long double);
-
-    if ((arg1+arg2)==arg1) {
-        if (arg1 >= 0.0L) {
-            return (F64_PI2);
-        } else {
-            return (-F64_PI2);
-        }
-    } else if (arg2 < 0.0L) {
-        if(arg1 >= 0.0L) {
-            return (F64_PI - f64_satan(-arg1/arg2));
-        } else {
-            return (-F64_PI + f64_satan(arg1/arg2));
-        }
-    } else if (arg1 > 0.0L) {
-        return (f64_satan(arg1/arg2));
-    } else {
-        return (-f64_satan(-arg1/arg2));
+static long double _positive_atan2l(long double y, long double x) { /* atan2l for y >= 0; the caller passes fabsl(y) and reapplies the sign */
+    long double _f64_satan(long double); /* local prototype; the definition lives in atanl.c */
+    if ((y+x)==y) { /* x is zero or too small to change y; note this also catches y == 0 with x == 0 */
+        return F64_PI2;
+    } else if (signbit(x)) { /* negative x: the angle lies beyond pi/2 */
+        return F64_PI - _f64_satan(-y / x); /* for y >= 0, -y / x is non-negative, as _f64_satan expects */
     }
+    return _f64_satan(y / x); /* remaining case: signbit(x) is clear */
+}
+
+long double atan2l(long double y, long double x) { /* atan2l(y, x): evaluate for |y|, then restore the sign of y */
+    return copysignl(_positive_atan2l(fabsl(y), x), y); /* the result always carries the sign bit of y */
 }
diff --git a/src/libc/atanf.c b/src/libc/atanf.c
@@ -38,13 +38,8 @@
  * ulp of +4 at +0x1.a85846p-2
  */
 float _atanf_c(float arg) {
-    float satan(float);
-
-    if (signbit(arg)) {
-        return (-satan(-arg));
-    } else {
-        return (satan(arg));
-    }
+    float _f32_satan(float); /* forward declaration; defined further down in this file */
+    return copysignf(_f32_satan(fabsf(arg)), arg); /* atan is odd: evaluate on |arg|, then restore the sign of arg */
 }
 
 double _atan_c(double) __attribute__((alias("_atanf_c")));
@@ -58,8 +53,7 @@ double _atan_c(double) __attribute__((alias("_atanf_c")));
  * xatan evaluates a series valid in the
  * range [-0.414...,+0.414...].
  */
-
-static float xatan(float arg) {
+static float _f32_xatan(float arg) {
     float argsq;
     float value;
 
@@ -73,17 +67,16 @@ static float xatan(float arg) {
  * satan reduces its argument (known to be positive)
  * to the range [0,0.414...] and calls xatan.
  */
-
-float satan(float arg) {
+float _f32_satan(float arg) { /* not static: atan2f.c declares and calls it; arg is expected to be non-negative */
     if (arg < F32_SQRT2_MINUS_1) {
-        return (xatan(arg));
+        return (_f32_xatan(arg)); /* already inside the range the series handles */
     } else if (arg > F32_SQRT2_PLUS_1) {
         if (arg > 0x1.0p+25f) {
             /* rounds to pi/2 */
             return F32_PI2;
         }
-        return (F32_PI2 - xatan(1.0f/arg));
+        return (F32_PI2 - _f32_xatan(1.0f/arg)); /* atan(a) = pi/2 - atan(1/a) for a > 0 */
     } else {
-        return (F32_PI4 + xatan((arg-1.0f)/(arg+1.0f)));
+        return (F32_PI4 + _f32_xatan((arg-1.0f)/(arg+1.0f))); /* atan(a) = pi/4 + atan((a-1)/(a+1)) */
     }
 }
diff --git a/src/libc/atanl.c b/src/libc/atanl.c
@@ -35,13 +35,8 @@
  * 2^-46.95 at +2.438776493e+00
  */
 long double atanl(long double arg) {
-    long double f64_satan(long double);
-
-    if (signbit(arg)) {
-        return (-f64_satan(-arg));
-    } else {
-        return (f64_satan(arg));
-    }
+    long double _f64_satan(long double); /* forward declaration; defined further down in this file */
+    return copysignl(_f64_satan(fabsl(arg)), arg); /* atan is odd: evaluate on |arg|, then restore the sign of arg */
 }
 
 /**
@@ -54,7 +49,7 @@ long double atanl(long double arg) {
  * range [-0.414...,+0.414...].
  */
 
-static long double f64_xatan(long double arg) {
+static long double _f64_xatan(long double arg) {
     long double argsq;
     long double value;
 
@@ -69,16 +64,16 @@ static long double f64_xatan(long double arg) {
  * to the range [0,0.414...] and calls xatan.
  */
 
-long double f64_satan(long double arg) {
+long double _f64_satan(long double arg) { /* not static: atan2l.c declares and calls it; arg is expected to be non-negative */
     if (arg < F64_SQRT2_MINUS_1) {
-        return f64_xatan(arg);
+        return _f64_xatan(arg); /* already inside the range the series handles */
     } else if (arg > F64_SQRT2_PLUS_1) {
         if (arg > 0x1.0p+54L) {
             /* rounds to pi/2 */
             return F64_PI2;
         }
-        return (F64_PI2 - f64_xatan(1.0L / arg));
+        return (F64_PI2 - _f64_xatan(1.0L / arg)); /* atan(a) = pi/2 - atan(1/a) for a > 0 */
     } else {
-        return (F64_PI4 + f64_xatan((arg - 1.0L) / (arg + 1.0L)));
+        return (F64_PI4 + _f64_xatan((arg - 1.0L) / (arg + 1.0L))); /* atan(a) = pi/4 + atan((a-1)/(a+1)) */
     }
 }
diff --git a/src/libc/cosf.src b/src/libc/cosf.src
@@ -10,35 +10,20 @@ _cos := _cosf
 
 else
 
+; float _f32_sinus(unsigned char quad, float x)
 _cos:
 _cosf:
 	call	__frameset0
-	ld	hl,(ix+6)
-	ld	e,(ix+9)
-	ld	bc,0
-	xor	a,a
+	ld	e, (ix + 9) ; E = top byte of x: sign bit and upper exponent bits
+	ld	hl, (ix + 6) ; HL = low 24 bits of x: mantissa, exponent LSB in bit 23
+	res	7, e	; clear the sign bit: x = fabsf(x), valid because cos is even
 	push	de
 	push	hl
-	call	__fcmp
-	pop	bc
-	pop	de
-	ld	a,e
-	jp	p,l_1
-	call	__fneg
+	scf	; C (F bit 0) set, N (F bit 1) reset: the low two bits of F are now 01
+	push	af	; F is the low byte of this slot and is read as quad; _f32_sinus keeps only quad & 3, so quad = 1
+	jp	_sinf.hijack	; continue inside sinf at its call to __f32_sinus; sinf's epilogue restores the frame and clamps
 
-l_1:	ld	hl,1
-	push	hl
-	ld	l,a
-	push	hl
-	push	bc
-	call	_sinus
-	ld	sp,ix
-	pop	ix
-	ret
-
-	extern	_sinus
 	extern	__frameset0
-	extern	__fcmp
-	extern	__fneg
+	extern	_sinf.hijack
 
 end if
diff --git a/src/libc/floorf.c b/src/libc/floorf.c
@@ -10,17 +10,16 @@
 float _floorf_c(float d) {
     float fraction;
 
-    if (d<0.0) {
+    if (d < 0.0f) { /* negative input: work on |d| and round its magnitude up */
         d = -d;
-            fraction = modff(d, &d);
-        if (fraction != 0.0) {
-            d += 1;
+        fraction = modff(d, &d); /* d becomes the integer part of |d|; fraction keeps the remainder */
+        if (fraction != 0.0f) { /* a nonzero remainder means the magnitude must grow by one */
+            d += 1.0f;
         }
         d = -d;
-    } else {
-        fraction = modff(d, &d);
+        return d; /* the negative branch is complete here */
     }
-    return(d);
+    return truncf(d); /* d is not below zero: truncation toward zero gives the floor */
 }
 
 double _floor_c(double) __attribute__((alias("_floorf_c")));
diff --git a/src/libc/include/__math_def.h b/src/libc/include/__math_def.h
@@ -230,9 +230,9 @@ long        lround(double);
 long        lroundf(float);
 long        lroundl(long double);
 
-double      modf(double, double *);
-float       modff(float, float *);
-long double modfl(long double, long double *);
+double      modf(double, double *) __attribute__((nonnull(2))); /* argument 2 (the output pointer) must not be NULL */
+float       modff(float, float *) __attribute__((nonnull(2))); /* argument 2 (the output pointer) must not be NULL */
+long double modfl(long double, long double *) __attribute__((nonnull(2))); /* argument 2 (the output pointer) must not be NULL */
 
 double      nan(const char *);
 float       nanf(const char *);
diff --git a/src/libc/sinf.c b/src/libc/sinf.c
@@ -22,53 +22,31 @@
 #define q2           0.946309610153821e4f
 #define q3           0.132653490878614e3f
 
-float sinus(float arg, int quad)
-{
-    float e, f;
-    int k;
+/**
+ * Sine kernel shared by sinf and cosf: sin(x + quad * pi/2) for x >= 0.
+ * Minimum ulp: -5 at +0x1.fe2dd0p-9 (2^-10 < |x| < pi/2)
+ *
+ * @note positive arguments only; only the low two bits of quad are used
+ * @warning undefined behaviour if x * two_over_pi exceeds LONG_MAX
+ */
+float _f32_sinus(unsigned char quad, float x) {
+    float x_trunc;
     float ysq;
-    float x,y;
+    float y;
     float temp1, temp2;
 
-    x = arg;
-    if (x<0.0f) {
-        x = -x;
-        quad = quad + 2;
-    }
-    x = x * two_over_pi; /* underflow? */
-    if (x > 32764.0f) {
-        y = modff(x,&e);
-        e = e + quad;
-        modff(0.25f * e,&f);
-        quad = e - 4.0f * f;
-    } else {
-        k = x;
-        y = x - k;
-        quad = (quad + k) & 0x3;
-    }
+    x = x * two_over_pi; /* rescale so that one unit of x is one quadrant */
+    y = modff(x, &x_trunc); /* y = position inside the quadrant, x_trunc = whole quadrants */
+    quad = (quad + (unsigned char)(long)x_trunc) & 0x3; /* via long: a direct float -> unsigned char cast is undefined above 255 */
     if (quad & 0x1) {
         y = 1.0f - y;
     }
-    if (quad > 1) {
+    if (quad & 0x2) { /* quadrants 2 and 3: negate the argument of the approximation */
         y = -y;
     }
 
-    ysq = y*y;
+    ysq = y * y; /* both polynomials below are evaluated in y squared */
     temp1 = ((((p4*ysq+p3)*ysq+p2)*ysq+p1)*ysq+p0)*y;
     temp2 = ((((ysq+q3)*ysq+q2)*ysq+q1)*ysq+q0);
     return(temp1/temp2);
 }
-
-/**
- * @remarks Minimum ulp:
- * ulp of -5 at +0x1.fe2dd0p-9 (|x| < pi/2)
- */
-float _sinf_c(float arg) {
-    if (fabsf(arg) < 0x1.0p-11f) {
-        return arg;
-    }
-    return sinus(arg, 0);
-}
-
-
-double _sin_c(double) __attribute__((alias("_sinf_c")));
diff --git a/src/libc/sinf.src b/src/libc/sinf.src
@@ -1,6 +1,7 @@
 	assume	adl=1
 
 	section	.text
+
 	public	_sinf
 	public	_sin
 
@@ -11,10 +12,45 @@ _sin := _sinf
 
 else
 
-_sinf := __sinf_c
-_sin := __sin_c
+	public	_sinf.hijack
+
+; float _f32_sinus(unsigned char quad, float x)
+_sin:
+_sinf:
+	call	__frameset0
+	ld	e, (ix + 9) ; E = top byte of x: sign bit and upper exponent bits
+	ld	hl, (ix + 6) ; HL = low 24 bits of x: mantissa, exponent LSB in bit 23
+	ld	a, e
+	add	a, a	; shift the sign bit out of A
+	sub	a, 117	; carry iff top byte (sign cleared) <= 0x3a, i.e. bit pattern of |x| <= 0x3affffff (|x| < 2^-9)
+	jr	c, .small_arg	; tiny |x|: return x itself, still held in E:HL
+	ld	a, e	; keep the original top byte (with sign) for the quad computation
+	res	7, e	; x = fabsf(x)
+	push	de	; exponent
+	push	hl	; mantissa
+	rlca	; rotate the sign bit of x into bit 0
+	add	a, a	; ...and on into bit 1, clearing bit 0: A & 3 is 2 for negative x, else 0
+	ld	e, a
+	push	de	; quad argument (low byte E)
+.hijack:	; cosf jumps here after pushing fabsf(x) and a quad byte equal to 1 mod 4
+	call	__f32_sinus
+.small_arg:
+	ld	sp, ix	; discard anything pushed above
+	pop	ix
+	; you can ret here if clamping is not needed
+	; clamp the result to [-1.0, +1.0]
+	ld	a, e
+	add	a, a	; drop the sign bit of the result
+	sub	a, 126	; zero iff top byte (sign cleared) is 0x3f, i.e. 0.5 <= |y| < 2.0
+	ret	nz	; |y| < 0.5f
+	push	hl
+	add	hl, hl	; carry = bit 23 of HL, the exponent LSB
+	pop	hl	; restore HL; pop leaves the carry flag untouched
+	ret	nc	; |y| < 1.0f
+	ld	l, h	; L = H: zeroes the low mantissa byte provided H is 0 (overshoot confined to the low 8 bits)
+	ret
 
-	extern	__sinf_c
-	extern	__sin_c
+	extern	__frameset0
+	extern	__f32_sinus
 
 end if