wip

tgross35 · tgross35 · commit d1aa79192eb4 · 2024-09-26T00:03:47.000-04:00
diff --git a/src/float/conv.rs b/src/float/conv.rs
@@ -14,25 +14,33 @@ use super::Float;
 /// - Calculate a base mantissa by shifting the integer into mantissa position
-/// - Figure out if rounding needs to occour by classifying truncated bits. Some patterns apply
+/// - Figure out if rounding needs to occur by classifying truncated bits. Some patterns apply
 ///   here, so they may be "squashed" into smaller numbers to simplify the classification.
+///
+/// # Terminology
+///
+/// - `i`: the original integer
+/// - `i_m`: the integer, shifted fully left (no leading zeros)
+/// - `n`: number of leading zeros
+/// - `e`: the resulting exponent
+/// - `m`: the resulting mantissa
+/// - `m_base`: the mantissa before adjusting for truncated bits
 mod int_to_float {
     use super::*;
 
     /// Calculate the exponent from the number of leading zeros.
     fn exp<I: Int, F: Float<Int: CastFrom<u32>>>(n: u32) -> F::Int {
-        F::Int::cast_from(I::BITS + F::EXPONENT_BIAS - 2 - n)
+        F::Int::cast_from(F::EXPONENT_BIAS - 1 + I::BITS - n)
     }
 
-    /// Shift the integer into the float's mantissa bits. Keep the lowest exponent bit intact.
-    fn m_base<I: Int, F: Float<Int: CastFrom<I>>>(i_m: I) -> F::Int {
+    /// Calculate the mantissa in cases where the float size is less than integer size. An
+    /// adjustment of the final mantissa will be needed, but it is calculated separately.
+    fn m_f_lt_i<I: Int, F: Float<Int: CastFrom<I>>>(i_m: I) -> F::Int {
+        // `i_m` already has no leading zeros. Just shift it into the float's mantissa bits,
+        // retaining the highest bits.
         F::Int::cast_from(i_m >> ((I::BITS - F::BITS) + F::EXPONENT_BITS))
     }
 
-    /// Calculate the mantissa in cases where the float size is greater than integer size
-    fn m_f_gt_i<I: Int, F: Float<Int: CastFrom<I>>>(i: I, n: u32) -> F::Int {
-        F::Int::cast_from(i) << (F::SIGNIFICAND_BITS - I::BITS + 1 + n)
-    }
-
-    /// Calculate the mantissa and a dropped bit adjustment  when `f` and `i` are equal sizes
+    /// Calculate the mantissa and a dropped bit adjustment when `f` and `i` are equal sizes.
+    /// Returns the mantissa and necessary adjustment.
     fn m_f_eq_i<I: Int + CastInto<F::Int>, F: Float<Int = I>>(i: I, n: u32) -> (F::Int, F::Int) {
         let base = (i << n) >> F::EXPONENT_BITS;
 
@@ -43,16 +51,23 @@ mod int_to_float {
         (base, adj)
     }
 
+    /// Calculate the mantissa in cases where the float size is greater than integer size
+    fn m_f_gt_i<I: Int, F: Float<Int: CastFrom<I>>>(i: I, n: u32) -> F::Int {
+        F::Int::cast_from(i) << (F::SIGNIFICAND_BITS - I::BITS + 1 + n)
+    }
+
     /// Adjust a mantissa with dropped bits
-    fn m_adj<F: Float>(m_base: F::Int, dropped_bits: F::Int) -> F::Int {
+    fn m_adj<F: Float<Int: CastInto<i32>>>(m_base: F::Int, dropped_bits: F::Int) -> F::Int {
+        // NOTE(review): was `fn m_adj<F: Float>(..)`; the added `CastInto<i32>` bound looks unused here.
         // Branchlessly extract a `1` if rounding up should happen
         let adj = (dropped_bits - (dropped_bits >> (F::BITS - 1) & !m_base)) >> (F::BITS - 1);
 
         // Add one when we need to round up. Break ties to even.
         m_base + adj
     }
 
-    /// Combine a final float repr from an exponent and mantissa.
+    /// Shift the exponent to its position and add the mantissa. Allows correcting an off-by-one
+    /// exponent via a mantissa that overflows into the exponent field.
     fn repr<F: Float>(e: F::Int, m: F::Int) -> F::Int {
         // + rather than | so the mantissa can overflow into the exponent
         (e << F::SIGNIFICAND_BITS) + m
@@ -77,7 +92,7 @@ mod int_to_float {
         let n = i.leading_zeros();
         let (m_base, adj) = m_f_eq_i::<u32, f32>(i, n);
         let m = m_adj::<f32>(m_base, adj);
-        let e = exp::<u32, f32>(n);
+        let e = exp::<u32, f32>(n) - 1;
         repr::<f32>(e, m)
     }
 
@@ -87,7 +102,7 @@ mod int_to_float {
         }
         let n = i.leading_zeros();
         let m = m_f_gt_i::<_, f64>(i, n);
-        let e = exp::<u32, f64>(n);
+        let e = exp::<u32, f64>(n) - 1;
         repr::<f64>(e, m)
     }
 
@@ -111,12 +126,12 @@ mod int_to_float {
     pub fn u64_to_f32_bits(i: u64) -> u32 {
         let n = i.leading_zeros();
         let i_m = i.wrapping_shl(n); // Mantissa, shifted so the first bit is nonzero
-        let m_base = m_base::<_, f32>(i_m);
+        let m_base = m_f_lt_i::<u64, f32>(i_m);
         // The entire lower half of `i` will be truncated (masked portion), plus the
         // next `EXPONENT_BITS` bits.
         let adj = (i_m >> f32::EXPONENT_BITS | i_m & 0xFFFF) as u32;
         let m = m_adj::<f32>(m_base, adj);
-        let e = if i == 0 { 0 } else { exp::<u64, f32>(n) };
+        let e = if i == 0 { 0 } else { exp::<u64, f32>(n) - 1 };
         repr::<f32>(e, m)
     }
 
@@ -127,7 +142,7 @@ mod int_to_float {
         let n = i.leading_zeros();
         let (m_base, adj) = m_f_eq_i::<u64, f64>(i, n);
         let m = m_adj::<f64>(m_base, adj);
-        let e = exp::<u64, f64>(n);
+        let e = exp::<u64, f64>(n) - 1;
         repr::<f64>(e, m)
     }
 
@@ -138,14 +153,14 @@ mod int_to_float {
         }
         let n = i.leading_zeros();
         let m = m_f_gt_i::<_, f128>(i, n);
-        let e = exp::<u64, f128>(n);
+        let e = exp::<u64, f128>(n) - 1;
         repr::<f128>(e, m)
     }
 
     pub fn u128_to_f32_bits(i: u128) -> u32 {
         let n = i.leading_zeros();
         let i_m = i.wrapping_shl(n); // Mantissa, shifted so the first bit is nonzero
-        let m_base = m_base::<_, f32>(i_m);
+        let m_base = m_f_lt_i::<u128, f32>(i_m);
 
-        // Within the upper `F::BITS`, everything except for the signifcand
+        // Within the upper `F::BITS`, everything except for the significand
         // gets truncated
@@ -157,19 +172,19 @@ mod int_to_float {
         let adj = d1 | d2;
 
         let m = m_adj::<f32>(m_base, adj);
-        let e = if i == 0 { 0 } else { exp::<u128, f32>(n) };
+        let e = if i == 0 { 0 } else { exp::<u128, f32>(n) - 1 };
         repr::<f32>(e, m)
     }
 
     pub fn u128_to_f64_bits(i: u128) -> u64 {
         let n = i.leading_zeros();
         let i_m = i.wrapping_shl(n); // Mantissa, shifted so the first bit is nonzero
-        let m_base = m_base::<_, f64>(i_m);
+        let m_base = m_f_lt_i::<u128, f64>(i_m);
         // The entire lower half of `i` will be truncated (masked portion), plus the
         // next `EXPONENT_BITS` bits.
         let adj = (i_m >> f64::EXPONENT_BITS | i_m & 0xFFFF_FFFF) as u64;
         let m = m_adj::<f64>(m_base, adj);
-        let e = if i == 0 { 0 } else { exp::<u128, f64>(n) };
+        let e = if i == 0 { 0 } else { exp::<u128, f64>(n) - 1 };
         repr::<f64>(e, m)
     }
 
@@ -181,7 +196,7 @@ mod int_to_float {
         let n = i.leading_zeros();
         let (m_base, adj) = m_f_eq_i::<u128, f128>(i, n);
         let m = m_adj::<f128>(m_base, adj);
-        let e = exp::<u128, f128>(n);
+        let e = exp::<u128, f128>(n) - 1;
         repr::<f128>(e, m)
     }
 }