Skip to content

Commit facbc6f

Browse files
committed
optimize fmod performance
1 parent d78b5da commit facbc6f

File tree

4 files changed

+57
-13
lines changed

4 files changed

+57
-13
lines changed

libm/src/math/generic/fmod.rs

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
/* SPDX-License-Identifier: MIT OR Apache-2.0 */
2-
use crate::support::{CastFrom, Float, Int, MinInt};
2+
use crate::support::{CastFrom, CastInto, Float, HInt, Int, MinInt, NarrowingDiv};
33

44
#[inline]
5-
pub fn fmod<F: Float>(x: F, y: F) -> F {
5+
pub fn fmod<F: Float>(x: F, y: F) -> F
6+
where
7+
F::Int: HInt,
8+
<F::Int as HInt>::D: NarrowingDiv,
9+
{
610
let _1 = F::Int::ONE;
711
let sx = x.to_bits() & F::SIGN_MASK;
812
let ux = x.to_bits() & !F::SIGN_MASK;
@@ -29,7 +33,7 @@ pub fn fmod<F: Float>(x: F, y: F) -> F {
2933

3034
// To compute `(num << ex) % (div << ey)`, first
3135
// evaluate `rem = (num << (ex - ey)) % div` ...
32-
let rem = reduction(num, ex - ey, div);
36+
let rem = reduction::<F>(num, ex - ey, div);
3337
// ... so the result will be `rem << ey`
3438

3539
if rem.is_zero() {
@@ -58,11 +62,55 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
5862
}
5963

6064
/// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
61-
fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
62-
x %= y;
63-
for _ in 0..e {
64-
x <<= 1;
65-
x = x.checked_sub(y).unwrap_or(x);
65+
fn reduction<F>(mut x: F::Int, e: u32, y: F::Int) -> F::Int
66+
where
67+
F: Float,
68+
F::Int: HInt,
69+
<<F as Float>::Int as HInt>::D: NarrowingDiv,
70+
{
71+
// `f16` only has 5 exponent bits, so even `f16::MAX = 65504.0` is only
72+
// a 40-bit integer multiple of the smallest subnormal.
73+
if F::BITS == 16 {
74+
debug_assert!(F::EXP_MAX - F::EXP_MIN == 29);
75+
debug_assert!(e <= 29);
76+
let u: u16 = x.cast();
77+
let v: u16 = y.cast();
78+
let u = (u as u64) << e;
79+
let v = v as u64;
80+
return F::Int::cast_from((u % v) as u16);
6681
}
67-
x
82+
83+
// Ensure `x < 2y` for later steps
84+
if x >= (y << 1) {
85+
// This case is only reached with subnormal divisors,
86+
// but it might be better to just normalize all significands
87+
// to make this unnecessary. The further calls could potentially
88+
// benefit from assuming a specific fixed leading bit position.
89+
x %= y;
90+
}
91+
92+
// The simple implementation seems to be fastest for a short reduction
93+
// at this size. The limit here was chosen empirically on an Intel Nehalem.
94+
// Less old CPUs that have faster `u64 * u64 -> u128` might not benefit,
95+
// and 32-bit systems or architectures without hardware multipliers might
96+
// want to do this in more cases.
97+
if F::BITS == 64 && e < 32 {
98+
// Assumes `x < 2y`
99+
for _ in 0..e {
100+
x = x.checked_sub(y).unwrap_or(x);
101+
x <<= 1;
102+
}
103+
return x.checked_sub(y).unwrap_or(x);
104+
}
105+
106+
// Fast path for short reductions
107+
if e < F::BITS {
108+
let w = x.widen() << e;
109+
if let Some((_, r)) = w.checked_narrowing_div_rem(y) {
110+
return r;
111+
}
112+
}
113+
114+
// Assumes `x < 2y`
115+
crate::support::linear_mul_reduction(x, e, y)
68116
}

libm/src/math/support/int_traits/narrowing_div.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ use crate::support::{DInt, HInt, Int, MinInt, u256};
66
/// This is the inverse of widening multiplication:
77
/// - for any `x` and nonzero `y`: `x.widen_mul(y).checked_narrowing_div_rem(y) == Some((x, 0))`,
88
/// - and for any `r in 0..y`: `x.carrying_mul(y, r).checked_narrowing_div_rem(y) == Some((x, r))`,
9-
#[allow(dead_code)]
109
pub trait NarrowingDiv: DInt + MinInt<Unsigned = Self> {
1110
/// Computes `(self / n, self % n)`
1211
///

libm/src/math/support/mod.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,7 @@ pub use hex_float::hf16;
2929
pub use hex_float::hf128;
3030
#[allow(unused_imports)]
3131
pub use hex_float::{hf32, hf64};
32-
#[allow(unused_imports)]
3332
pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
34-
#[allow(unused_imports)]
3533
pub use modular::linear_mul_reduction;
3634

3735
/// Hint to the compiler that the current path is cold.

libm/src/math/support/modular.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ where
108108

109109
/// Compute the remainder `(x << e) % y` with unbounded integers.
110110
/// Requires `x < 2y` and `y.leading_zeros() >= 2`
111-
#[allow(dead_code)]
112111
pub fn linear_mul_reduction<U>(x: U, mut e: u32, y: U) -> U
113112
where
114113
U: HInt + Int<Unsigned = U>,

0 commit comments

Comments
 (0)