Skip to content

Commit b42fcc9

Browse files
committed
Merge branch 'feature/even-better-widening-mul' into 'main'
Even better windowed widening multiplication See merge request tokend/alpenlabs/fastmul!8
2 parents 892b45d + 227a1b9 commit b42fcc9

File tree

13 files changed

+3564
-57
lines changed

13 files changed

+3564
-57
lines changed

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ lazy_static = "1.4.0"
2525
bitcoin-script-stack = { git = "https://github.com/FairgateLabs/rust-bitcoin-script-stack"}
2626
prettytable-rs = "0.10.0"
2727
paste = "1.0"
28+
seq-macro = "0.3.5"
2829

2930
[dev-dependencies]
3031
rand_chacha = "0.3.1"

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ specify where you can find the corresponding unit test in the project.
1616

1717
| Command | Description | Location |
1818
| --- | --- | --- |
19-
| `cargo test -- --nocapture test_254_bit_windowed_widening_mul` | Test our widening multiplication algorithm | [`test.rs`](src/bigint/arithmetics/test.rs#L517) |
20-
| `cargo test -- --nocapture test_mul_w_width_254bit` | Test our narrow multiplication algorithm | [`test.rs`](src/bigint/arithmetics/test.rs#L487) |
21-
| `cargo test -- --nocapture test_254_bit_widening_mul` | Test _BitVM_'s widening multiplication algorithm (extended by us) | [`test.rs`](src/bigint/arithmetics/test.rs#L457) |
22-
| `cargo test -- --nocapture test_64_and_254_bit_mul` | Test _BitVM_'s narrow multiplication algorithm (a bit optimized by us) | [`test.rs`](src/bigint/arithmetics/test.rs#L414) |
19+
| `cargo test -- --nocapture test_254_bit_windowed_widening_optimized_mul` | Test our widening multiplication algorithm | [`test.rs`](src/bigint/arithmetics/test.rs#L517) |
20+
| `cargo test -- --nocapture test_254_bit_narrow_mul_w_width` | Test our narrow multiplication algorithm | [`test.rs`](src/bigint/arithmetics/test.rs#L487) |
21+
| `cargo test -- --nocapture test_254_bit_windowed_lazy_widening_mul` | Test _BitVM_'s widening multiplication algorithm (extended by us) | [`test.rs`](src/bigint/arithmetics/test.rs#L457) |
22+
| `cargo test -- --nocapture test_254_bit_naive_widening_mul` | Test _BitVM_'s narrow multiplication algorithm (a bit optimized by us) | [`test.rs`](src/bigint/arithmetics/test.rs#L414) |
2323
| `cargo test -- --nocapture test_255_bit_cmpeq_widening_mul` | Test [`cmpeq`](https://bitcointalk.org/index.php?topic=5477449.0)'s widening multiplication algorithm | [`test.rs`](src/bigint/cmpeq/test.rs#L56) |
2424
| `cargo test -- --nocapture --ignored debug_mul_performance_comparison` | Compare the performance of several multiplication algorithms used | [`test.rs`](src/bigint/performance.rs#L14) |
2525

sage/optimized.sage

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import random
2+
3+
def to_window_w_form(n: Integer, w: Integer) -> Integer:
4+
"""
5+
Converts the given integer n into the w-width representation
6+
"""
7+
8+
decomposition = []
9+
while n >= 1:
10+
c = n % (1 << w)
11+
decomposition.append(c)
12+
n = n - c
13+
n = n // (1 << w)
14+
15+
return decomposition
16+
17+
def mul_window_w_form(x: Integer, y: Integer, w: Integer) -> Integer:
18+
"""
19+
Multiplies two integers using window-w form
20+
"""
21+
22+
d = to_window_w_form(y, w)
23+
if len(d) == 63: # Padding the decomposition so that it has exactly 64 windows (d[63] is read below)
24+
d.append(0)
25+
26+
precompute_table = [i*x for i in range(1<<w)]
27+
28+
r = precompute_table[d[63]]
29+
print(r.nbits())
30+
for i in range(1, 64):
31+
k = 256 + 4*i
32+
for _ in range(4):
33+
r = 2*r
34+
r = r + precompute_table[d[63-i]]
35+
print(r.nbits())
36+
assert k == r.nbits()
37+
38+
return r
39+
40+
a = (1<<254)-1 # First operand: the largest 254-bit integer (fixed, not random)
41+
b = (1<<254)-1 # Second operand: the largest 254-bit integer (fixed, not random)
42+
43+
w = 4 # Window width
44+
45+
# Asserting that the decomposition is correct
46+
b_decomposition = to_window_w_form(b, w)
47+
if len(b_decomposition) == 63:
48+
b_decomposition.append(0)
49+
assert sum([2**(w*i)*c for i, c in enumerate(b_decomposition)]) == b, 'decomposition is wrong'
50+
print('decomposition is correct')
51+
52+
# Asserting that the multiplication is correct
53+
c = a * b
54+
assert mul_window_w_form(a, b, 4) == c, 'multiplication is wrong'
55+
print('multiplication is correct')

sage/u29x9.sage

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import random
2+
3+
N_BITS = 254
4+
LIMB_SIZE = 29 # 29-bit limbs
5+
6+
def to_limbs(a: Integer) -> list[Integer]:
7+
"""
8+
Converts the given integer a into a list of LIMB_SIZE-bit limbs (least significant limb first)
9+
"""
10+
limbs = []
11+
while a >= 1:
12+
c = a % (1 << LIMB_SIZE)
13+
limbs.append(c)
14+
a = a - c
15+
a = a // (1 << LIMB_SIZE)
16+
17+
return limbs
18+
19+
# Validating the correctness of the conversion
20+
a = Integer(random.randint(0, (1<<N_BITS)-1))
21+
b = Integer(random.randint(0, (1<<N_BITS)-1))
22+
c = a * b
23+
24+
print('a', to_limbs(a))
25+
print('b', to_limbs(b))
26+
print('c', to_limbs(c))

src/bigint/arithmetics/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
pub mod add;
22
pub mod mul;
3+
pub mod u29x9;
34

45
#[cfg(test)]
56
pub mod test;

src/bigint/arithmetics/mul.rs

Lines changed: 165 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1+
use bitcoin::opcodes::all::{OP_ADD, OP_FROMALTSTACK, OP_SUB, OP_SWAP};
2+
use bitcoin_script_stack::debugger::pushable::Builder;
3+
use seq_macro::seq;
4+
15
use crate::bigint::window::precompute::WindowedPrecomputeTable;
26
use crate::bigint::window::NonNativeWindowedBigIntImpl;
3-
use crate::traits::integer::NonNativeLimbInteger;
7+
use crate::bigint::{U254, U508};
8+
use crate::pseudo::OP_4MUL;
9+
use crate::traits::arithmeticable::Arithmeticable;
10+
use crate::traits::integer::{NonNativeInteger, NonNativeLimbInteger};
411
use crate::traits::window::Windowable;
512
use crate::{
613
bigint::NonNativeBigIntImpl,
@@ -99,12 +106,18 @@ where
99106
{ <Self as Windowable>::OP_TOBEWINDOWEDFORM_TOALTSTACK() }
100107

101108
// Precomputing {0*z, 1*z, ..., ((1<<WIDTH)-1)*z}
102-
{ WindowedPrecomputeTable::<T, WIDTH>::initialize() }
109+
{ WindowedPrecomputeTable::<T, WIDTH, false>::initialize() }
103110

104111
// We initialize the result
105-
{ T::OP_0() }
112+
// Note that we can simply pick the precomputed value
113+
// since 0*16 is still 0, so we omit the doubling :)
114+
OP_FROMALTSTACK 1 OP_ADD
115+
{ 1<<WIDTH }
116+
OP_SWAP
117+
OP_SUB
118+
{ T::OP_PICKSTACK() }
106119

107-
for _ in 0..Self::DECOMPOSITION_SIZE {
120+
for _ in 1..Self::DECOMPOSITION_SIZE {
108121
// Double the result WIDTH times
109122
for _ in 0..WIDTH {
110123
{ T::OP_2MUL(0) }
@@ -137,7 +150,9 @@ where
137150
/// Multiplies the top two big integers on the stack
138151
/// represented as little-endian 32-bit limbs
139152
/// using w-width decomposition to get twice as large integer.
140-
pub(in super::super) fn handle_OP_WIDENINGMUL<Q>() -> Script
153+
/// Note: this is done lazily, that is, all operations are performed
154+
/// over U508 from the very beginning.
155+
pub(in super::super) fn handle_lazy_OP_WIDENINGMUL<Q>() -> Script
141156
where
142157
Q: NonNativeLimbInteger,
143158
{
@@ -149,12 +164,22 @@ where
149164
{ T::OP_EXTEND::<Q>() }
150165

151166
// Precomputing {0*z, 1*z, ..., ((1<<WIDTH)-1)*z}
152-
{ WindowedPrecomputeTable::<Q, WIDTH>::initialize() }
167+
{ WindowedPrecomputeTable::<Q, WIDTH, true>::initialize() }
153168

154-
// We initialize the result
155-
{ Q::OP_0() }
169+
// Picking di from the stack
170+
OP_FROMALTSTACK 1 OP_ADD
171+
172+
// Add the precomputed value to the result.
173+
// Since currently stack looks like:
174+
// {0*z, 1*z, ..., ((1<<WIDTH)-1)*z, di} with
175+
// r being the result, we need to copy
176+
// (1<<WIDTH - di)th element to the top of the stack.
177+
{ 1<<WIDTH }
178+
OP_SWAP
179+
OP_SUB
180+
{ Q::OP_PICKSTACK() }
156181

157-
for _ in 0..Self::DECOMPOSITION_SIZE {
182+
for _ in 1..Self::DECOMPOSITION_SIZE {
158183
// Double the result WIDTH times
159184
for _ in 0..WIDTH {
160185
{ Q::OP_2MUL_NOOVERFLOW(0) }
@@ -183,4 +208,135 @@ where
183208
{ Q::OP_FROMALTSTACK() }
184209
}
185210
}
211+
212+
/// Multiplies the top two big integers on the stack
213+
/// represented as little-endian 32-bit limbs
214+
/// using w-width decomposition to get twice as large integer. Chooses
215+
/// the optimized implementation when one exists for the bit size, otherwise the lazy one.
216+
pub(in super::super) fn handle_OP_WIDENINGMUL<Q>() -> Script
217+
where Q: NonNativeLimbInteger,
218+
{
219+
match Self::N_BITS {
220+
U254::N_BITS => NonNativeWindowedBigIntImpl::<U254, 4>::handle_optimized_OP_WIDENINGMUL(),
221+
_ => Self::handle_lazy_OP_WIDENINGMUL::<Q>(),
222+
}
223+
}
224+
}
225+
226+
/// Special optimized implementation for U254 Windowed method
227+
#[allow(non_snake_case)]
228+
impl NonNativeWindowedBigIntImpl<U254, 4> {
229+
/// Since copy operation requires input depth to be equal to
230+
/// `Self::TOP_STACK_INT_LIMBS + Self::OTHER_LIMBS * depth`, this function normalizes the depth
231+
/// to the required value.
232+
fn normalize_stack_depth<Q>() -> Script
233+
where Q: NonNativeLimbInteger{
234+
let n_limbs = (Q::N_BITS + Q::LIMB_SIZE - 1) / Q::LIMB_SIZE;
235+
236+
script! {
237+
OP_DUP OP_4MUL {crate::pseudo::OP_2MUL()} // Multiplying depth by 8
238+
OP_ADD // Adding depth to 8*depth to get 9*depth
239+
{ n_limbs }
240+
OP_ADD
241+
}
242+
}
243+
244+
/// Copies the big integer located at depth to the top of the stack.
245+
/// Works similarly to `OP_PICK`, but for big integers.
246+
///
247+
/// For example, calling `copy(0)` will copy the top element to the top of the stack, while
248+
/// calling `copy(1)` will copy the second element to the top of the stack.
249+
pub(in super::super) fn handle_OP_PICKSTACK<Q: NonNativeLimbInteger>() -> Script {
250+
let n_limbs = (Self::N_BITS + Self::LIMB_SIZE - 1) / Self::LIMB_SIZE;
251+
252+
script! {
253+
{ Self::normalize_stack_depth::<Q>() }
254+
255+
for _ in 0..n_limbs - 1 {
256+
OP_DUP OP_PICK OP_SWAP
257+
}
258+
OP_1SUB OP_PICK
259+
}
260+
}
261+
262+
/// Multiplies the top two big integers on the stack
263+
/// represented as little-endian 32-bit limbs
264+
/// using w-width decomposition to get twice as large integer.
265+
pub(in super::super) fn handle_optimized_OP_WIDENINGMUL() -> Script {
266+
// The main loop script, see explanation in the returned script down below
267+
let main_loop_script = {
268+
let mut script_var = Vec::new();
269+
// Iterating 63 times (omitting the first iteration, we have already done it)
270+
seq!(N in 1..64 { #(
271+
let next_script = Builder::new()
272+
// Extending the result to 256+4*N bits from 256+4*(N-1) bits
273+
.push_expression(NonNativeBigIntImpl::<{ 256 + 4*(N-1) }, 30>::OP_EXTEND::<NonNativeBigIntImpl::<{ 256 + 4*N }, 30>>())
274+
// First, multiply by 16 without caring for overflow
275+
.push_expression({
276+
let mut script_var = Vec::new();
277+
for _ in 0..4 {
278+
let next_script = Builder::new()
279+
.push_expression(NonNativeBigIntImpl::<{ 256 + 4*N }, 30>::OP_2MUL_NOOVERFLOW(0))
280+
.0
281+
.into_script();
282+
script_var.extend_from_slice(next_script.as_bytes());
283+
}
284+
Script::from(script_var)
285+
})
286+
// Taking coefficient, finding 16-coefficient and picking it
287+
.push_opcode(OP_FROMALTSTACK)
288+
.push_expression(1<<4)
289+
.push_opcode(OP_SWAP)
290+
.push_opcode(OP_SUB)
291+
.push_expression(Self::handle_OP_PICKSTACK::<NonNativeBigIntImpl::<{ 256 + 4*N }, 30>>())
292+
// Since we need to only care about last limbs,
293+
// we do not extend the result
294+
.push_expression(NonNativeBigIntImpl::<256, 30>::OP_ADD_NOOVERFLOW(0, 1))
295+
.0
296+
.into_script();
297+
script_var.extend_from_slice(next_script.as_bytes());
298+
)* });
299+
300+
Script::from(script_var)
301+
};
302+
303+
pushable::Builder::new()
304+
// Push w-width form to the stack
305+
.push_expression(Self::OP_TOBEWINDOWEDFORM_TOALTSTACK())
306+
// Initialize precompute table to the stack
307+
// Since 256 bits fit in 9x30 limbs, we do not need
308+
// to extend anything
309+
.push_expression(WindowedPrecomputeTable::<Self, 4, true>::initialize())
310+
// Making the first iteration of the loop (without the initial doubling step)
311+
// Taking coefficient, finding 16-coefficient and picking
312+
// corresponding precomputed value
313+
.push_opcode(OP_FROMALTSTACK)
314+
.push_expression(1)
315+
.push_opcode(OP_ADD)
316+
.push_expression(1<<4)
317+
.push_opcode(OP_SWAP)
318+
.push_opcode(OP_SUB)
319+
.push_expression(Self::OP_PICKSTACK())
320+
// At this point, we have a 256-bit number in the stack
321+
// Now the interesting part: the loop
322+
.push_expression(main_loop_script)
323+
// Moving result to the altstack
324+
.push_expression(U508::OP_TOALTSTACK())
325+
.push_expression({
326+
// Removing precomputed values from the stack
327+
let mut script_var = Vec::new();
328+
for _ in 0..1<<4 {
329+
let next_script = Builder::new()
330+
.push_expression(Self::OP_DROP())
331+
.0
332+
.into_script();
333+
script_var.extend_from_slice(next_script.as_bytes());
334+
}
335+
Script::from(script_var)
336+
})
337+
// Returning our element to the stack
338+
.push_expression(U508::OP_FROMALTSTACK())
339+
.0
340+
.into_script()
341+
}
186342
}

0 commit comments

Comments
 (0)