From b9234d8d7b2070cf3a680c8c9f074528416639fc Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Mon, 22 Sep 2025 14:41:36 +0300
Subject: [PATCH 1/3] implement libm::exp and its variants for i586 with inline
 assembly to fix precision issues

---
 libm-test/src/precision.rs | 12 +++++++--
 libm/src/math/arch/i586.rs | 53 ++++++++++++++++++++++++++++++++++++++
 libm/src/math/arch/mod.rs  |  5 ++++
 libm/src/math/exp.rs       |  6 +++++
 libm/src/math/exp10.rs     |  6 +++++
 libm/src/math/exp10f.rs    |  6 +++++
 libm/src/math/exp2.rs      |  6 +++++
 libm/src/math/exp2f.rs     |  6 +++++
 libm/src/math/expf.rs      |  6 +++++
 9 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/libm-test/src/precision.rs b/libm-test/src/precision.rs
index c441922d3..ceaeeb3ac 100644
--- a/libm-test/src/precision.rs
+++ b/libm-test/src/precision.rs
@@ -83,6 +83,16 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
         Bn::Tgamma => 20,
     };
 
+    // These have a separate implementation on i586
+    if cfg!(x86_no_sse) {
+        match ctx.base_name {
+            Bn::Exp => ulp = 1,
+            Bn::Exp2 => ulp = 1,
+            Bn::Exp10 => ulp = 1,
+            _ => (),
+        }
+    }
+
     // There are some cases where musl's approximation is less accurate than ours. For these
     // cases, increase the ULP.
     if ctx.basis == Musl {
@@ -124,8 +134,6 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
             Id::Asinh => ulp = 3,
             Id::Asinhf => ulp = 3,
             Id::Cbrt => ulp = 1,
-            Id::Exp10 | Id::Exp10f => ulp = 1_000_000,
-            Id::Exp2 | Id::Exp2f => ulp = 10_000_000,
             Id::Log1p | Id::Log1pf => ulp = 2,
             Id::Tan => ulp = 2,
             _ => (),
diff --git a/libm/src/math/arch/i586.rs b/libm/src/math/arch/i586.rs
index b9a667620..982251ae0 100644
--- a/libm/src/math/arch/i586.rs
+++ b/libm/src/math/arch/i586.rs
@@ -60,3 +60,56 @@ pub fn floor(mut x: f64) -> f64 {
     }
     x
 }
+
+macro_rules! x87exp {
+    ($float_ty:ident, $word_size:literal, $fn_name:ident,  $load_op:literal) => {
+        pub fn $fn_name(mut x: $float_ty) -> $float_ty { unsafe {
+            core::arch::asm!(
+                // Prepare the register stack as
+                // ```
+                // st(0) = y = x*log2(base)
+                // st(1) = 1.0
+                // st(2) = round(y)
+                // ```
+                concat!($load_op, " ", $word_size, " ptr [{x}]"),
+                "fld1",
+                "fld st(1)",
+                "frndint",
+                "fxch st(2)",
+
+                // Compare y with round(y) to determine if y is finite and
+                // not an integer. If so, compute `exp2(y - round(y))` into
+                // st(1). Otherwise skip ahead with `st(1) = 1.0`
+                "fucom st(2)",
+                "fstsw ax",
+                "test ax, 0x4000",
+                "jnz 2f",
+                "fsub st(0), st(2)", // st(0) = y - round(y)
+                "f2xm1",             // st(0) = 2^st(0) - 1.0
+                "fadd st(1), st(0)", // st(1) = 1 + st(0) = exp2(y - round(y))
+                "2:",
+
+                // Finally, scale by `exp2(round(y))` and clear the stack.
+                "fstp st(0)",
+                "fscale",
+                concat!("fstp ", $word_size, " ptr [{x}]"),
+                "fstp st(0)",
+                x = in(reg) &mut x,
+                out("ax") _,
+                out("st(0)") _, out("st(1)") _,
+                out("st(2)") _, out("st(3)") _,
+                out("st(4)") _, out("st(5)") _,
+                out("st(6)") _, out("st(7)") _,
+                options(nostack),
+            );
+            x
+        }}
+    };
+}
+
+x87exp!(f32, "dword", x87_exp2f, "fld");
+x87exp!(f64, "qword", x87_exp2, "fld");
+x87exp!(f32, "dword", x87_exp10f, "fldl2t\nfmul");
+x87exp!(f64, "qword", x87_exp10, "fldl2t\nfmul");
+x87exp!(f32, "dword", x87_expf, "fldl2e\nfmul");
+x87exp!(f64, "qword", x87_exp, "fldl2e\nfmul");
diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs
index 984ae7f31..ba859c679 100644
--- a/libm/src/math/arch/mod.rs
+++ b/libm/src/math/arch/mod.rs
@@ -48,3 +48,8 @@ cfg_if! {
         pub use i586::{ceil, floor};
     }
 }
+cfg_if! {
+    if #[cfg(x86_no_sse)] {
+        pub use i586::{x87_exp10f, x87_exp10, x87_expf, x87_exp, x87_exp2f, x87_exp2};
+    }
+}
diff --git a/libm/src/math/exp.rs b/libm/src/math/exp.rs
index 78ce5dd13..cb939ad5d 100644
--- a/libm/src/math/exp.rs
+++ b/libm/src/math/exp.rs
@@ -83,6 +83,12 @@ const P5: f64 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp(mut x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 === 2 ^ 1023
     let x1p_149 = f64::from_bits(0x36a0000000000000); // 0x1p-149 === 2 ^ -149
 
diff --git a/libm/src/math/exp10.rs b/libm/src/math/exp10.rs
index 1f49f5e96..e0af1945b 100644
--- a/libm/src/math/exp10.rs
+++ b/libm/src/math/exp10.rs
@@ -9,6 +9,12 @@ const P10: &[f64] = &[
 /// Calculates 10 raised to the power of `x` (f64).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10(x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp10,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let (mut y, n) = modf(x);
     let u: u64 = n.to_bits();
     /* fabs(n) < 16 without raising invalid on nan */
diff --git a/libm/src/math/exp10f.rs b/libm/src/math/exp10f.rs
index 22a264211..f0a311c2d 100644
--- a/libm/src/math/exp10f.rs
+++ b/libm/src/math/exp10f.rs
@@ -9,6 +9,12 @@ const P10: &[f32] = &[
 /// Calculates 10 raised to the power of `x` (f32).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10f(x: f32) -> f32 {
+    select_implementation! {
+        name: x87_exp10f,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let (mut y, n) = modff(x);
     let u = n.to_bits();
     /* fabsf(n) < 8 without raising invalid on nan */
diff --git a/libm/src/math/exp2.rs b/libm/src/math/exp2.rs
index 6e4cbc29d..08b71587f 100644
--- a/libm/src/math/exp2.rs
+++ b/libm/src/math/exp2.rs
@@ -324,6 +324,12 @@ static TBL: [u64; TBLSIZE * 2] = [
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2(mut x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp2,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let redux = f64::from_bits(0x4338000000000000) / TBLSIZE as f64;
     let p1 = f64::from_bits(0x3fe62e42fefa39ef);
     let p2 = f64::from_bits(0x3fcebfbdff82c575);
diff --git a/libm/src/math/exp2f.rs b/libm/src/math/exp2f.rs
index 733d2f1a8..ceff6822c 100644
--- a/libm/src/math/exp2f.rs
+++ b/libm/src/math/exp2f.rs
@@ -75,6 +75,12 @@ static EXP2FT: [u64; TBLSIZE] = [
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2f(mut x: f32) -> f32 {
+    select_implementation! {
+        name: x87_exp2f,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;
     let p1 = f32::from_bits(0x3f317218);
     let p2 = f32::from_bits(0x3e75fdf0);
diff --git a/libm/src/math/expf.rs b/libm/src/math/expf.rs
index dbbfdbba9..5541ab79a 100644
--- a/libm/src/math/expf.rs
+++ b/libm/src/math/expf.rs
@@ -32,6 +32,12 @@ const P2: f32 = -2.7667332906e-3; /* -0xb55215.0p-32 */
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn expf(mut x: f32) -> f32 {
+    select_implementation! {
+        name: x87_expf,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
     let x1p_126 = f32::from_bits(0x800000); // 0x1p-126f === 2 ^ -126  /*original 0x1p-149f    ??????????? */
     let mut hx = x.to_bits();

From 6932f3f12cb92966cd24c193f79dee155fe26e99 Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Sat, 27 Sep 2025 03:07:25 +0300
Subject: [PATCH 2/3] remove overrides to actually check the precision

---
 libm-test/src/precision.rs | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/libm-test/src/precision.rs b/libm-test/src/precision.rs
index ceaeeb3ac..283718a78 100644
--- a/libm-test/src/precision.rs
+++ b/libm-test/src/precision.rs
@@ -85,10 +85,13 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
 
     // These have a separate implementation on i586
     if cfg!(x86_no_sse) {
-        match ctx.base_name {
-            Bn::Exp => ulp = 1,
-            Bn::Exp2 => ulp = 1,
-            Bn::Exp10 => ulp = 1,
+        match ctx.fn_ident {
+            Id::Exp => ulp = 1,
+            Id::Exp2 => ulp = 1,
+            Id::Exp10 => ulp = 1,
+            Id::Expf => ulp = 0,
+            Id::Exp2f => ulp = 0,
+            Id::Exp10f => ulp = 0,
             _ => (),
         }
     }
@@ -226,15 +229,6 @@ impl MaybeOverride<(f32,)> for SpecialCase {
             return XFAIL("expm1 representable numbers");
         }
 
-        if cfg!(x86_no_sse)
-            && ctx.base_name == BaseName::Exp2
-            && !expected.is_infinite()
-            && actual.is_infinite()
-        {
-            // We return infinity when there is a representable value. Test input: 127.97238
-            return XFAIL("586 exp2 representable numbers");
-        }
-
         if ctx.base_name == BaseName::Sinh && input.0.abs() > 80.0 && actual.is_nan() {
             // we return some NaN that should be real values or infinite
             if ctx.basis == CheckBasis::Musl {
@@ -286,14 +280,6 @@ impl MaybeOverride<(f64,)> for SpecialCase {
             return XFAIL("i586 rint rounding mode");
         }
 
-        if cfg!(x86_no_sse)
-            && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2)
-        {
-            // FIXME: i586 has very imprecise results with ULP > u32::MAX for these
-            // operations so we can't reasonably provide a limit.
-            return XFAIL_NOCHECK;
-        }
-
         if ctx.base_name == BaseName::J0 && input.0 < -1e300 {
             // Errors get huge close to -inf
             return XFAIL_NOCHECK;

From 984c6e15241f871e2b5b6139ca4e79e3e2b14ad5 Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Sat, 27 Sep 2025 03:35:34 +0300
Subject: [PATCH 3/3] allow musl to have its slight error again

---
 libm-test/src/precision.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libm-test/src/precision.rs b/libm-test/src/precision.rs
index 283718a78..968ac7170 100644
--- a/libm-test/src/precision.rs
+++ b/libm-test/src/precision.rs
@@ -111,6 +111,8 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
             Id::Cbrt => ulp = 2,
             // FIXME(#401): musl has an incorrect result here.
             Id::Fdim => ulp = 2,
+            Id::Exp2f => ulp = 1,
+            Id::Expf => ulp = 1,
             Id::Sincosf => ulp = 500,
             Id::Tgamma => ulp = 20,
             _ => (),