
Commit 90432fc

Boxed Bernstein-Yang: reduce allocations (#502)
Performs more operations in-place.

Benchmark results:

Boxed Montgomery arithmetic/invert, 4096-bit
    time:   [208.34 µs 208.70 µs 209.08 µs]
    change: [-23.092% -22.837% -22.579%] (p = 0.00 < 0.05)
    Performance has improved.
1 parent 6a293f9 commit 90432fc

File tree

2 files changed (+128, -116 lines)

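The gist of the change, restated as a short sketch (the lines are lifted from the fg() hunk in the diff below): instead of chaining allocating helpers, where mul, add, and shr each returned a freshly allocated BoxedInt62L, an accumulator is now allocated once and then updated in place via the new AddAssign and shr_assign operations.

    // Before: every step in the chain returns a freshly allocated BoxedInt62L.
    let f2 = f.mul(t[0][0]).add(&g.mul(t[0][1])).shr();

    // After: the accumulator is allocated once, then mutated in place.
    let mut f2 = &*f * t[0][0]; // Mul<i64> for &BoxedInt62L allocates the product
    f2 += &*g * t[0][1];        // AddAssign adds into the existing accumulator
    f2.shr_assign();            // in-place 62-bit arithmetic right shift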

benches/boxed_monty.rs

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ fn to_biguint(uint: &BoxedUint) -> BigUint {
 fn bench_montgomery_ops<M: Measurement>(group: &mut BenchmarkGroup<'_, M>) {
     let params = BoxedMontyParams::new(Odd::<BoxedUint>::random(&mut OsRng, UINT_BITS));

-    group.bench_function("invert, U256", |b| {
+    group.bench_function("invert, 4096-bit", |b| {
         b.iter_batched(
             || {
                 let modulus = NonZero::new(params.modulus().clone()).unwrap();

src/modular/bernstein_yang/boxed.rs

Lines changed: 127 additions & 115 deletions
@@ -6,6 +6,7 @@
 use super::{inv_mod2_62, jump, Matrix};
 use crate::{BoxedUint, Inverter, Limb, Odd, Word};
 use alloc::{boxed::Box, vec::Vec};
+use core::ops::{AddAssign, Mul, Neg};
 use subtle::{Choice, ConstantTimeEq, CtOption};

 /// Modular multiplicative inverter based on the Bernstein-Yang method.
@@ -40,15 +41,15 @@ impl BoxedBernsteinYangInverter {
     /// formula. The input integer lies in the interval (-2 * M, M).
     fn norm(&self, mut value: BoxedInt62L, negate: bool) -> BoxedInt62L {
         if value.is_negative() {
-            value = value.add(&self.modulus);
+            value += &self.modulus;
         }

         if negate {
             value = value.neg();
         }

         if value.is_negative() {
-            value = value.add(&self.modulus);
+            value += &self.modulus;
         }

         value
@@ -59,8 +60,8 @@ impl Inverter for BoxedBernsteinYangInverter {
     type Output = BoxedUint;

     fn invert(&self, value: &BoxedUint) -> CtOption<Self::Output> {
-        let mut d = BoxedInt62L::zero(self.modulus.0.len());
-        let mut g = BoxedInt62L::from(value).widen(d.0.len());
+        let mut d = BoxedInt62L::zero(self.modulus.nlimbs());
+        let mut g = BoxedInt62L::from(value).widen(d.nlimbs());
         let f = divsteps(&mut d, &self.adjuster, &self.modulus, &mut g, self.inverse);

         // At this point the absolute value of "f" equals the greatest common divisor of the
@@ -80,8 +81,8 @@ pub(crate) fn gcd(f: &BoxedUint, g: &BoxedUint) -> BoxedUint {
     let inverse = inv_mod2_62(f.as_words());
     let f = BoxedInt62L::from(f);
     let mut g = BoxedInt62L::from(g);
-    let mut d = BoxedInt62L::zero(f.0.len());
-    let e = BoxedInt62L::one(f.0.len());
+    let mut d = BoxedInt62L::zero(f.nlimbs());
+    let e = BoxedInt62L::one(f.nlimbs());

     let mut f = divsteps(&mut d, &e, &f, &mut g, inverse);

@@ -101,7 +102,7 @@ fn divsteps(
     g: &mut BoxedInt62L,
     inverse: i64,
 ) -> BoxedInt62L {
-    debug_assert_eq!(f_0.0.len(), g.0.len());
+    debug_assert_eq!(f_0.nlimbs(), g.nlimbs());

     let mut e = e.clone();
     let mut f = f_0.clone();
@@ -122,8 +123,14 @@
 /// "matrix * (f, g)' / 2^62", where "'" is the transpose operator.
 fn fg(f: &mut BoxedInt62L, g: &mut BoxedInt62L, t: Matrix) {
     // TODO(tarcieri): reduce allocations
-    let f2 = f.mul(t[0][0]).add(&g.mul(t[0][1])).shr();
-    let g2 = f.mul(t[1][0]).add(&g.mul(t[1][1])).shr();
+    let mut f2 = &*f * t[0][0];
+    f2 += &*g * t[0][1];
+    f2.shr_assign();
+
+    let mut g2 = &*f * t[1][0];
+    g2 += &*g * t[1][1];
+    g2.shr_assign();
+
     *f = f2;
     *g = g2;
 }
@@ -151,11 +158,18 @@ fn de(modulus: &BoxedInt62L, inverse: i64, t: Matrix, d: &mut BoxedInt62L, e: &m
     md -= (inverse.wrapping_mul(cd).wrapping_add(md)) & mask;
     me -= (inverse.wrapping_mul(ce).wrapping_add(me)) & mask;

-    let cd = d.mul(t[0][0]).add(&e.mul(t[0][1])).add(&modulus.mul(md));
-    let ce = d.mul(t[1][0]).add(&e.mul(t[1][1])).add(&modulus.mul(me));
+    let mut cd = d.mul(t[0][0]);
+    cd += &e.mul(t[0][1]);
+    cd += &modulus.mul(md);
+    cd.shr_assign();

-    *d = cd.shr();
-    *e = ce.shr();
+    let mut ce = d.mul(t[1][0]);
+    ce += &e.mul(t[1][1]);
+    ce += &modulus.mul(me);
+    ce.shr_assign();
+
+    *d = cd;
+    *e = ce;
 }

 /// "Bigint"-like (62 * LIMBS)-bit signed integer type, whose variables store numbers in the two's
@@ -216,7 +230,7 @@ impl BoxedInt62L {
         }

         debug_assert_eq!(
-            self.0.len(),
+            self.nlimbs(),
             bernstein_yang_nlimbs!(bits_precision as usize)
         );
         assert!(
@@ -240,99 +254,15 @@ impl BoxedInt62L {
         ret
     }

-    /// Add.
-    #[must_use]
-    pub fn add(&self, other: &Self) -> Self {
-        let nlimbs = self.0.len();
-        debug_assert_eq!(nlimbs, other.0.len());
-
-        let mut ret = Self::zero(nlimbs);
-        let mut carry = 0;
-        let mut i = 0;
-
-        while i < nlimbs {
-            let sum = self.0[i] + other.0[i] + carry;
-            ret.0[i] = sum & Self::MASK;
-            carry = sum >> Self::LIMB_BITS;
-            i += 1;
-        }
-
-        ret
-    }
-
-    /// Mul.
-    #[must_use]
-    pub fn mul(&self, other: i64) -> Self {
-        let nlimbs = self.0.len();
-        let mut ret = Self::zero(nlimbs);
-
-        // If the short multiplicand is non-negative, the standard multiplication algorithm is
-        // performed. Otherwise, the product of the additively negated multiplicands is found as
-        // follows.
-        //
-        // Since for the two's complement code the additive negation is the result of adding 1 to
-        // the bitwise inverted argument's representation, for any encoded integers x and y we have
-        // x * y = (-x) * (-y) = (!x + 1) * (-y) = !x * (-y) + (-y), where "!" is the bitwise
-        // inversion and arithmetic operations are performed according to the rules of the code.
-        //
-        // If the short multiplicand is negative, the algorithm below uses this formula by
-        // substituting the short multiplicand for y and turns into the modified standard
-        // multiplication algorithm, where the carry flag is initialized with the additively
-        // negated short multiplicand and the chunks of the long multiplicand are bitwise inverted.
-        let (other, mut carry, mask) = if other < 0 {
-            (-other, -other as u64, Self::MASK)
-        } else {
-            (other, 0, 0)
-        };
-
-        let mut i = 0;
-        while i < nlimbs {
-            let sum = (carry as u128) + ((self.0[i] ^ mask) as u128) * (other as u128);
-            ret.0[i] = sum as u64 & Self::MASK;
-            carry = (sum >> Self::LIMB_BITS) as u64;
-            i += 1;
-        }
-
-        ret
-    }
-
-    /// Negate.
-    #[must_use]
-    pub fn neg(&self) -> Self {
-        // For the two's complement code the additive negation is the result of adding 1 to the
-        // bitwise inverted argument's representation.
-        let nlimbs = self.0.len();
-        let mut ret = Self::zero(nlimbs);
-        let mut carry = 1;
-        let mut i = 0;
+    /// Apply 62-bit right arithmetical shift in-place.
+    pub fn shr_assign(&mut self) {
+        let is_negative = self.is_negative();

-        while i < nlimbs {
-            let sum = (self.0[i] ^ Self::MASK) + carry;
-            ret.0[i] = sum & Self::MASK;
-            carry = sum >> Self::LIMB_BITS;
-            i += 1;
-        }
-
-        ret
-    }
-
-    /// Returns the result of applying 62-bit right arithmetical shift to the current number.
-    #[must_use]
-    pub fn shr(&self) -> Self {
-        let nlimbs = self.0.len();
-        let mut ret = Self::zero(nlimbs);
-
-        if self.is_negative() {
-            ret.0[nlimbs - 1] = Self::MASK;
+        for i in 0..(self.nlimbs() - 1) {
+            self.0[i] = self.0[i + 1];
         }

-        let mut i = 0;
-        while i < nlimbs - 1 {
-            ret.0[i] = self.0[i + 1];
-            i += 1;
-        }
-
-        ret
+        self.0[self.nlimbs() - 1] = if is_negative { Self::MASK } else { 0 };
     }

     /// Get the value zero for the given number of limbs.
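For intuition, the new shr_assign drops the lowest 62-bit limb and sign-extends the top one, i.e. an arithmetic division by 2^62 in the two's complement limb representation. A standalone sketch of the same limb movement (illustrative only; MASK here stands in for BoxedInt62L::MASK, assumed to be 2^62 - 1):

    const MASK: u64 = (1 << 62) - 1;

    /// Arithmetic shift right by one 62-bit limb, mirroring shr_assign above.
    fn shr_62(limbs: &mut [u64], is_negative: bool) {
        // Move every limb down one position (divide by 2^62)...
        for i in 0..limbs.len() - 1 {
            limbs[i] = limbs[i + 1];
        }
        // ...and fill the vacated top limb with the sign extension.
        let top = limbs.len() - 1;
        limbs[top] = if is_negative { MASK } else { 0 };
    }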
@@ -364,7 +294,7 @@ impl BoxedInt62L {

     /// Returns "true" iff the current number is negative.
     pub fn is_negative(&self) -> bool {
-        self.0[self.0.len() - 1] > (Self::MASK >> 1)
+        self.0[self.nlimbs() - 1] > (Self::MASK >> 1)
     }

     /// Is the current value zero?
@@ -383,6 +313,86 @@ impl BoxedInt62L {
     pub fn lowest(&self) -> u64 {
         self.0[0]
     }
+
+    /// Returns the number of limbs used by this integer.
+    pub fn nlimbs(&self) -> usize {
+        self.0.len()
+    }
+}
+
+impl AddAssign<BoxedInt62L> for BoxedInt62L {
+    fn add_assign(&mut self, rhs: BoxedInt62L) {
+        self.add_assign(&rhs);
+    }
+}
+
+impl AddAssign<&BoxedInt62L> for BoxedInt62L {
+    fn add_assign(&mut self, rhs: &BoxedInt62L) {
+        debug_assert_eq!(self.nlimbs(), rhs.nlimbs());
+        let mut carry = 0;
+
+        for i in 0..self.nlimbs() {
+            let sum = self.0[i] + rhs.0[i] + carry;
+            self.0[i] = sum & Self::MASK;
+            carry = sum >> Self::LIMB_BITS;
+        }
+    }
+}
+
+impl Mul<i64> for &BoxedInt62L {
+    type Output = BoxedInt62L;
+
+    fn mul(self, other: i64) -> BoxedInt62L {
+        let nlimbs = self.nlimbs();
+        let mut ret = BoxedInt62L::zero(nlimbs);
+
+        // If the short multiplicand is non-negative, the standard multiplication algorithm is
+        // performed. Otherwise, the product of the additively negated multiplicands is found as
+        // follows.
+        //
+        // Since for the two's complement code the additive negation is the result of adding 1 to
+        // the bitwise inverted argument's representation, for any encoded integers x and y we have
+        // x * y = (-x) * (-y) = (!x + 1) * (-y) = !x * (-y) + (-y), where "!" is the bitwise
+        // inversion and arithmetic operations are performed according to the rules of the code.
+        //
+        // If the short multiplicand is negative, the algorithm below uses this formula by
+        // substituting the short multiplicand for y and turns into the modified standard
+        // multiplication algorithm, where the carry flag is initialized with the additively
+        // negated short multiplicand and the chunks of the long multiplicand are bitwise inverted.
+        let (other, mut carry, mask) = if other < 0 {
+            (-other, -other as u64, BoxedInt62L::MASK)
+        } else {
+            (other, 0, 0)
+        };
+
+        for i in 0..nlimbs {
+            let sum = (carry as u128) + ((self.0[i] ^ mask) as u128) * (other as u128);
+            ret.0[i] = sum as u64 & BoxedInt62L::MASK;
+            carry = (sum >> BoxedInt62L::LIMB_BITS) as u64;
+        }
+
+        ret
+    }
+}
+
+impl Neg for BoxedInt62L {
+    type Output = Self;
+
+    fn neg(self) -> Self {
+        // For the two's complement code the additive negation is the result of adding 1 to the
+        // bitwise inverted argument's representation.
+        let nlimbs = self.nlimbs();
+        let mut ret = Self::zero(nlimbs);
+        let mut carry = 1;
+
+        for i in 0..nlimbs {
+            let sum = (self.0[i] ^ Self::MASK) + carry;
+            ret.0[i] = sum & Self::MASK;
+            carry = sum >> Self::LIMB_BITS;
+        }
+
+        ret
+    }
 }

 impl PartialEq for BoxedInt62L {
@@ -395,6 +405,7 @@ impl PartialEq for BoxedInt62L {
 mod tests {
     use super::BoxedInt62L;
     use crate::{modular::bernstein_yang::Int62L, BoxedUint, Inverter, PrecomputeInverter, U256};
+    use core::ops::Neg;
     use proptest::prelude::*;

     #[test]
@@ -499,8 +510,8 @@ mod tests {
     }

     #[test]
-    fn int62l_shr() {
-        let n = BoxedInt62L(
+    fn int62l_shr_assign() {
+        let mut n = BoxedInt62L(
             vec![
                 0,
                 1211048314408256470,
@@ -511,9 +522,10 @@
             ]
             .into(),
         );
+        n.shr_assign();

         assert_eq!(
-            &*n.shr().0,
+            &*n.0,
             &[
                 1211048314408256470,
                 1344008336933394898,
@@ -536,12 +548,12 @@
         fn boxed_int62l_add(x in u256(), y in u256()) {
             let x_ref = Int62L::<{ bernstein_yang_nlimbs!(256usize) }>::from_uint(&x);
             let y_ref = Int62L::<{ bernstein_yang_nlimbs!(256usize) }>::from_uint(&y);
-            let x_boxed = BoxedInt62L::from(&x.into());
+            let mut x_boxed = BoxedInt62L::from(&x.into());
             let y_boxed = BoxedInt62L::from(&y.into());

             let expected = x_ref.add(&y_ref);
-            let actual = x_boxed.add(&y_boxed);
-            prop_assert_eq!(&expected.0, &*actual.0);
+            x_boxed += &y_boxed;
+            prop_assert_eq!(&expected.0, &*x_boxed.0);
         }

         #[test]
@@ -550,7 +562,7 @@
             let x_boxed = BoxedInt62L::from(&x.into());

             let expected = x_ref.mul(y);
-            let actual = x_boxed.mul(y);
+            let actual = &x_boxed * y;
             prop_assert_eq!(&expected.0, &*actual.0);
         }

@@ -567,11 +579,11 @@
         #[test]
         fn boxed_int62l_shr(x in u256()) {
             let x_ref = Int62L::<{ bernstein_yang_nlimbs!(256usize) }>::from_uint(&x);
-            let x_boxed = BoxedInt62L::from(&x.into());
+            let mut x_boxed = BoxedInt62L::from(&x.into());
+            x_boxed.shr_assign();

             let expected = x_ref.shr();
-            let actual = x_boxed.shr();
-            prop_assert_eq!(&expected.0, &*actual.0);
+            prop_assert_eq!(&expected.0, &*x_boxed.0);
         }

         #[test]
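The comment carried over into the new Mul<i64> impl leans on the two's complement identity x * y = !x * (-y) + (-y) (since -x = !x + 1), which is why a negative short multiplicand is handled by bitwise-inverting the long multiplicand's limbs and seeding the carry with -y. A quick standalone check of that identity in wrapping 64-bit arithmetic (not part of the commit, just an illustration):

    fn main() {
        let x: i64 = 0x1234_5678_9abc_def0;
        let y: i64 = -42;

        // x * y == !x * (-y) + (-y), all modulo 2^64.
        let lhs = x.wrapping_mul(y);
        let rhs = (!x).wrapping_mul(y.wrapping_neg()).wrapping_add(y.wrapping_neg());
        assert_eq!(lhs, rhs);
    }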

0 commit comments

Comments
 (0)