Move schoolbook squaring implementation into a macro (#648)

andrewwhitehead · web-flow · commit a0b43e018312 · 2024-08-13T12:01:22.000-06:00
Signed-off-by: Andrew Whitehead <cywolf@gmail.com>
diff --git a/benches/boxed_uint.rs b/benches/boxed_uint.rs
@@ -43,6 +43,31 @@ fn bench_shifts(c: &mut Criterion) {
     group.finish();
 }
 
+fn bench_mul(c: &mut Criterion) {
+    let mut group = c.benchmark_group("wrapping ops");
+
+    group.bench_function("boxed_mul", |b| {
+        b.iter_batched(
+            || {
+                (
+                    BoxedUint::random_bits(&mut OsRng, UINT_BITS),
+                    BoxedUint::random_bits(&mut OsRng, UINT_BITS),
+                )
+            },
+            |(x, y)| black_box(x.mul(&y)),
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("boxed_square", |b| {
+        b.iter_batched(
+            || BoxedUint::random_bits(&mut OsRng, UINT_BITS),
+            |x| black_box(x.square()),
+            BatchSize::SmallInput,
+        )
+    });
+}
+
 fn bench_division(c: &mut Criterion) {
     let mut group = c.benchmark_group("wrapping ops");
 
@@ -156,6 +181,12 @@ fn bench_boxed_sqrt(c: &mut Criterion) {
     });
 }
 
-criterion_group!(benches, bench_division, bench_shifts, bench_boxed_sqrt);
+criterion_group!(
+    benches,
+    bench_mul,
+    bench_division,
+    bench_shifts,
+    bench_boxed_sqrt
+);
 
 criterion_main!(benches);
diff --git a/benches/uint.rs b/benches/uint.rs
@@ -1,7 +1,43 @@
 use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};
-use crypto_bigint::{Limb, NonZero, Odd, Random, Reciprocal, Uint, U128, U2048, U256};
+use crypto_bigint::{Limb, NonZero, Odd, Random, Reciprocal, Uint, U128, U2048, U256, U4096};
 use rand_core::OsRng;
 
+fn bench_mul(c: &mut Criterion) {
+    let mut group = c.benchmark_group("wrapping ops");
+
+    group.bench_function("split_mul, U256xU256", |b| {
+        b.iter_batched(
+            || (U256::random(&mut OsRng), U256::random(&mut OsRng)),
+            |(x, y)| black_box(x.split_mul(&y)),
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("split_mul, U4096xU4096", |b| {
+        b.iter_batched(
+            || (U4096::random(&mut OsRng), U4096::random(&mut OsRng)),
+            |(x, y)| black_box(x.split_mul(&y)),
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("square_wide, U256", |b| {
+        b.iter_batched(
+            || U256::random(&mut OsRng),
+            |x| black_box(x.square_wide()),
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("square_wide, U4096", |b| {
+        b.iter_batched(
+            || U4096::random(&mut OsRng),
+            |x| black_box(x.square_wide()),
+            BatchSize::SmallInput,
+        )
+    });
+}
+
 fn bench_division(c: &mut Criterion) {
     let mut group = c.benchmark_group("wrapping ops");
 
@@ -288,6 +324,7 @@ fn bench_sqrt(c: &mut Criterion) {
 
 criterion_group!(
     benches,
+    bench_mul,
     bench_division,
     bench_gcd,
     bench_shl,
diff --git a/src/uint/boxed/mul.rs b/src/uint/boxed/mul.rs
@@ -1,7 +1,8 @@
 //! [`BoxedUint`] multiplication operations.
 
 use crate::{
-    uint::mul::mul_limbs, BoxedUint, CheckedMul, Limb, WideningMul, Wrapping, WrappingMul, Zero,
+    uint::mul::{mul_limbs, square_limbs},
+    BoxedUint, CheckedMul, Limb, WideningMul, Wrapping, WrappingMul, Zero,
 };
 use core::ops::{Mul, MulAssign};
 use subtle::{Choice, CtOption};
@@ -23,8 +24,9 @@ impl BoxedUint {
 
     /// Multiply `self` by itself.
     pub fn square(&self) -> Self {
-        // TODO(tarcieri): more optimized implementation (shared with `Uint`?)
-        self.mul(self)
+        let mut limbs = vec![Limb::ZERO; self.nlimbs() * 2];
+        square_limbs(&self.limbs, &mut limbs);
+        limbs.into()
     }
 }
 
diff --git a/src/uint/mul.rs b/src/uint/mul.rs
@@ -8,7 +8,7 @@ use crate::{
 use core::ops::{Mul, MulAssign};
 use subtle::CtOption;
 
-/// Impl the core schoolbook multiplication algorithm.
+/// Implement the core schoolbook multiplication algorithm.
 ///
 /// This is implemented as a macro to abstract over `const fn` and boxed use cases, since the latter
 /// needs mutable references and thus the unstable `const_mut_refs` feature (rust-lang/rust#57349).
@@ -53,113 +53,136 @@ macro_rules! impl_schoolbook_multiplication {
     }};
 }
 
-impl<const LIMBS: usize> Uint<LIMBS> {
-    /// Multiply `self` by `rhs`, returning a concatenated "wide" result.
-    pub const fn widening_mul<const RHS_LIMBS: usize, const WIDE_LIMBS: usize>(
-        &self,
-        rhs: &Uint<RHS_LIMBS>,
-    ) -> Uint<WIDE_LIMBS>
-    where
-        Self: ConcatMixed<Uint<RHS_LIMBS>, MixedOutput = Uint<WIDE_LIMBS>>,
-    {
-        let (lo, hi) = self.split_mul(rhs);
-        Uint::concat_mixed(&lo, &hi)
-    }
-
-    /// Compute "wide" multiplication as a 2-tuple containing the `(lo, hi)` components of the product, whose sizes
-    /// correspond to the sizes of the operands.
-    pub const fn split_mul<const RHS_LIMBS: usize>(
-        &self,
-        rhs: &Uint<RHS_LIMBS>,
-    ) -> (Self, Uint<RHS_LIMBS>) {
-        let mut lo = Self::ZERO;
-        let mut hi = Uint::<RHS_LIMBS>::ZERO;
-        impl_schoolbook_multiplication!(&self.limbs, &rhs.limbs, lo.limbs, hi.limbs);
-        (lo, hi)
-    }
-
-    /// Perform wrapping multiplication, discarding overflow.
-    pub const fn wrapping_mul<const H: usize>(&self, rhs: &Uint<H>) -> Self {
-        self.split_mul(rhs).0
-    }
-
-    /// Perform saturating multiplication, returning `MAX` on overflow.
-    pub const fn saturating_mul<const RHS_LIMBS: usize>(&self, rhs: &Uint<RHS_LIMBS>) -> Self {
-        let (res, overflow) = self.split_mul(rhs);
-        Self::select(&res, &Self::MAX, overflow.is_nonzero())
-    }
-
-    /// Square self, returning a "wide" result in two parts as (lo, hi).
-    pub const fn square_wide(&self) -> (Self, Self) {
+/// Implement the schoolbook method for squaring.
+///
+/// Like schoolbook multiplication, but only considering half of the multiplication grid.
+// TODO: change this into a `const fn` when `const_mut_refs` is stable.
+macro_rules! impl_schoolbook_squaring {
+    ($limbs:expr, $lo:expr, $hi:expr) => {{
         // Translated from https://github.com/ucbrise/jedi-pairing/blob/c4bf151/include/core/bigint.hpp#L410
         //
         // Permission to relicense the resulting translation as Apache 2.0 + MIT was given
         // by the original author Sam Kumar: https://github.com/RustCrypto/crypto-bigint/pull/133#discussion_r1056870411
-        let mut lo = Self::ZERO;
-        let mut hi = Self::ZERO;
 
-        // Schoolbook multiplication, but only considering half of the multiplication grid
+        if $limbs.len() != $lo.len() || $lo.len() != $hi.len() {
+            panic!("schoolbook squaring length mismatch");
+        }
+
         let mut i = 1;
-        while i < LIMBS {
+        while i < $limbs.len() {
             let mut j = 0;
             let mut carry = Limb::ZERO;
 
             while j < i {
                 let k = i + j;
 
-                if k >= LIMBS {
-                    let (n, c) = hi.limbs[k - LIMBS].mac(self.limbs[i], self.limbs[j], carry);
-                    hi.limbs[k - LIMBS] = n;
+                if k >= $limbs.len() {
+                    let (n, c) = $hi[k - $limbs.len()].mac($limbs[i], $limbs[j], carry);
+                    $hi[k - $limbs.len()] = n;
                     carry = c;
                 } else {
-                    let (n, c) = lo.limbs[k].mac(self.limbs[i], self.limbs[j], carry);
-                    lo.limbs[k] = n;
+                    let (n, c) = $lo[k].mac($limbs[i], $limbs[j], carry);
+                    $lo[k] = n;
                     carry = c;
                 }
 
                 j += 1;
             }
 
-            if (2 * i) < LIMBS {
-                lo.limbs[2 * i] = carry;
+            if (2 * i) < $limbs.len() {
+                $lo[2 * i] = carry;
             } else {
-                hi.limbs[2 * i - LIMBS] = carry;
+                $hi[2 * i - $limbs.len()] = carry;
             }
 
             i += 1;
         }
 
         // Double the current result, this accounts for the other half of the multiplication grid.
-        // TODO: The top word is empty so we can also use a special purpose shl.
-        (lo, hi) = Self::overflowing_shl_vartime_wide((lo, hi), 1).expect("shift within range");
+        // The top word is empty, so we use a special purpose shl.
+        let mut carry = Limb::ZERO;
+        let mut i = 0;
+        while i < $limbs.len() {
+            ($lo[i].0, carry) = ($lo[i].0 << 1 | carry.0, $lo[i].shr(Limb::BITS - 1));
+            i += 1;
+        }
+        i = 0;
+        while i < $limbs.len() - 1 {
+            ($hi[i].0, carry) = ($hi[i].0 << 1 | carry.0, $hi[i].shr(Limb::BITS - 1));
+            i += 1;
+        }
+        $hi[$limbs.len() - 1] = carry;
 
         // Handle the diagonal of the multiplication grid, which finishes the multiplication grid.
         let mut carry = Limb::ZERO;
         let mut i = 0;
-        while i < LIMBS {
-            if (i * 2) < LIMBS {
-                let (n, c) = lo.limbs[i * 2].mac(self.limbs[i], self.limbs[i], carry);
-                lo.limbs[i * 2] = n;
+        while i < $limbs.len() {
+            if (i * 2) < $limbs.len() {
+                let (n, c) = $lo[i * 2].mac($limbs[i], $limbs[i], carry);
+                $lo[i * 2] = n;
                 carry = c;
             } else {
-                let (n, c) = hi.limbs[i * 2 - LIMBS].mac(self.limbs[i], self.limbs[i], carry);
-                hi.limbs[i * 2 - LIMBS] = n;
+                let (n, c) = $hi[i * 2 - $limbs.len()].mac($limbs[i], $limbs[i], carry);
+                $hi[i * 2 - $limbs.len()] = n;
                 carry = c;
             }
 
-            if (i * 2 + 1) < LIMBS {
-                let (n, c) = lo.limbs[i * 2 + 1].overflowing_add(carry);
-                lo.limbs[i * 2 + 1] = n;
+            if (i * 2 + 1) < $limbs.len() {
+                let (n, c) = $lo[i * 2 + 1].overflowing_add(carry);
+                $lo[i * 2 + 1] = n;
                 carry = c;
             } else {
-                let (n, c) = hi.limbs[i * 2 + 1 - LIMBS].overflowing_add(carry);
-                hi.limbs[i * 2 + 1 - LIMBS] = n;
+                let (n, c) = $hi[i * 2 + 1 - $limbs.len()].overflowing_add(carry);
+                $hi[i * 2 + 1 - $limbs.len()] = n;
                 carry = c;
             }
 
             i += 1;
         }
+    }};
+}
+
+impl<const LIMBS: usize> Uint<LIMBS> {
+    /// Multiply `self` by `rhs`, returning a concatenated "wide" result.
+    pub const fn widening_mul<const RHS_LIMBS: usize, const WIDE_LIMBS: usize>(
+        &self,
+        rhs: &Uint<RHS_LIMBS>,
+    ) -> Uint<WIDE_LIMBS>
+    where
+        Self: ConcatMixed<Uint<RHS_LIMBS>, MixedOutput = Uint<WIDE_LIMBS>>,
+    {
+        let (lo, hi) = self.split_mul(rhs);
+        Uint::concat_mixed(&lo, &hi)
+    }
+
+    /// Compute "wide" multiplication as a 2-tuple containing the `(lo, hi)` components of the product, whose sizes
+    /// correspond to the sizes of the operands.
+    pub const fn split_mul<const RHS_LIMBS: usize>(
+        &self,
+        rhs: &Uint<RHS_LIMBS>,
+    ) -> (Self, Uint<RHS_LIMBS>) {
+        let mut lo = Self::ZERO;
+        let mut hi = Uint::<RHS_LIMBS>::ZERO;
+        impl_schoolbook_multiplication!(&self.limbs, &rhs.limbs, lo.limbs, hi.limbs);
+        (lo, hi)
+    }
+
+    /// Perform wrapping multiplication, discarding overflow.
+    pub const fn wrapping_mul<const H: usize>(&self, rhs: &Uint<H>) -> Self {
+        self.split_mul(rhs).0
+    }
+
+    /// Perform saturating multiplication, returning `MAX` on overflow.
+    pub const fn saturating_mul<const RHS_LIMBS: usize>(&self, rhs: &Uint<RHS_LIMBS>) -> Self {
+        let (res, overflow) = self.split_mul(rhs);
+        Self::select(&res, &Self::MAX, overflow.is_nonzero())
+    }
 
+    /// Square self, returning a "wide" result in two parts as (lo, hi).
+    pub const fn square_wide(&self) -> (Self, Self) {
+        let mut lo = Self::ZERO;
+        let mut hi = Self::ZERO;
+        impl_schoolbook_squaring!(&self.limbs, lo.limbs, hi.limbs);
         (lo, hi)
     }
 }
@@ -280,6 +303,14 @@ pub(crate) fn mul_limbs(lhs: &[Limb], rhs: &[Limb], out: &mut [Limb]) {
     impl_schoolbook_multiplication!(lhs, rhs, lo, hi);
 }
 
+/// Wrapper function used by `BoxedUint`
+#[cfg(feature = "alloc")]
+pub(crate) fn square_limbs(limbs: &[Limb], out: &mut [Limb]) {
+    debug_assert_eq!(limbs.len() * 2, out.len());
+    let (lo, hi) = out.split_at_mut(limbs.len());
+    impl_schoolbook_squaring!(limbs, lo, hi);
+}
+
 #[cfg(test)]
 mod tests {
     use crate::{CheckedMul, Zero, U128, U192, U256, U64};