Skip to content

Commit bae8e98

Browse files
move arch-dependent impls to block_mix module
Signed-off-by: eternal-flame-AD <[email protected]>
1 parent 804c622 commit bae8e98

File tree

10 files changed

+403
-263
lines changed

10 files changed

+403
-263
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scrypt/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ edition = "2024"
1414
rust-version = "1.85"
1515

1616
[dependencies]
17+
cfg-if = "1.0"
1718
pbkdf2 = { version = "0.13.0-rc.0", path = "../pbkdf2" }
1819
salsa20 = { version = "0.11.0-rc.0", default-features = false }
1920
sha2 = { version = "0.11.0-rc.0", default-features = false }

scrypt/src/block_mix/mod.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#[cfg(any(
2+
test,
3+
not(any(
4+
all(
5+
any(target_arch = "x86", target_arch = "x86_64"),
6+
target_feature = "sse2"
7+
),
8+
all(target_arch = "aarch64", target_feature = "neon"),
9+
all(target_arch = "wasm32", target_feature = "simd128"),
10+
))
11+
))]
12+
mod soft;
13+
14+
cfg_if::cfg_if! {
15+
if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
16+
mod pivot;
17+
mod neon;
18+
pub(crate) use neon::{scrypt_block_mix, shuffle_in, shuffle_out};
19+
} else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
20+
mod pivot;
21+
mod simd128;
22+
pub(crate) use simd128::{scrypt_block_mix, shuffle_in, shuffle_out};
23+
} else if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] {
24+
mod pivot;
25+
mod sse2;
26+
pub(crate) use sse2::{scrypt_block_mix, shuffle_in, shuffle_out};
27+
} else {
28+
pub(crate) use soft::scrypt_block_mix;
29+
30+
pub(crate) fn shuffle_in(_input: &mut [u8]) {}
31+
pub(crate) fn shuffle_out(_input: &mut [u8]) {}
32+
}
33+
}
34+
35+
#[cfg(test)]
36+
mod tests {
37+
use super::*;
38+
39+
#[test]
40+
fn test_scrypt_block_mix_abcd_against_soft() {
41+
let mut input: [u8; 128] = core::array::from_fn(|i| i as u8);
42+
for _round in 0..10 {
43+
let mut output = [0u8; 128];
44+
45+
let mut expected0 = [0u8; 128];
46+
let mut expected1 = [0u8; 128]; // check shuffle_out is a correct inverse of shuffle_in
47+
soft::scrypt_block_mix(&input, &mut expected0);
48+
shuffle_in(&mut input);
49+
scrypt_block_mix(&input, &mut output);
50+
shuffle_out(&mut input);
51+
soft::scrypt_block_mix(&input, &mut expected1);
52+
shuffle_out(&mut output);
53+
assert_eq!(
54+
expected0, expected1,
55+
"expected0 != expected1, shuffle_out is not a correct inverse of shuffle_in?"
56+
);
57+
assert_eq!(
58+
output, expected0,
59+
"output != expected0, scrypt_block_mix is not correct?"
60+
);
61+
62+
input
63+
.iter_mut()
64+
.zip(output.iter())
65+
.for_each(|(a, b)| *a = a.wrapping_add(*b));
66+
}
67+
}
68+
}

scrypt/src/block_mix/neon.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
2+
3+
pub(crate) fn shuffle_in(b: &mut [u8]) {
4+
for chunk in b.chunks_exact_mut(64) {
5+
let mut t = [0u32; 16];
6+
for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
7+
*b = u32::from_ne_bytes(c.try_into().unwrap());
8+
}
9+
chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
10+
b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
11+
});
12+
}
13+
}
14+
15+
pub(crate) fn shuffle_out(b: &mut [u8]) {
16+
for chunk in b.chunks_exact_mut(64) {
17+
let mut t = [0u32; 16];
18+
for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
19+
*b = u32::from_ne_bytes(c.try_into().unwrap());
20+
}
21+
chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
22+
b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
23+
});
24+
}
25+
}
26+
27+
pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
28+
use core::arch::aarch64::*;
29+
30+
macro_rules! vrol_u32 {
31+
($w:expr, $amt:literal) => {{
32+
let w = $w;
33+
vsraq_n_u32(vshlq_n_u32(w, $amt), w, 32 - $amt)
34+
}};
35+
}
36+
37+
let last_block = &input[input.len() - 64..];
38+
39+
let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) };
40+
let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) };
41+
let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) };
42+
let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) };
43+
44+
for (i, chunk) in input.chunks(64).enumerate() {
45+
let pos = if i % 2 == 0 {
46+
(i / 2) * 64
47+
} else {
48+
(i / 2) * 64 + input.len() / 2
49+
};
50+
51+
unsafe {
52+
let chunk_a = vld1q_u32(chunk.as_ptr().cast());
53+
let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast());
54+
let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast());
55+
let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast());
56+
57+
a = veorq_u32(a, chunk_a);
58+
b = veorq_u32(b, chunk_b);
59+
c = veorq_u32(c, chunk_c);
60+
d = veorq_u32(d, chunk_d);
61+
62+
let saves = [a, b, c, d];
63+
64+
for _ in 0..8 {
65+
b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7));
66+
c = veorq_u32(c, vrol_u32!(vaddq_u32(b, a), 9));
67+
d = veorq_u32(d, vrol_u32!(vaddq_u32(c, b), 13));
68+
a = veorq_u32(a, vrol_u32!(vaddq_u32(d, c), 18));
69+
70+
d = vextq_u32(d, d, 1);
71+
c = vextq_u32(c, c, 2);
72+
b = vextq_u32(b, b, 3);
73+
74+
(b, d) = (d, b);
75+
}
76+
77+
a = vaddq_u32(a, saves[0]);
78+
b = vaddq_u32(b, saves[1]);
79+
c = vaddq_u32(c, saves[2]);
80+
d = vaddq_u32(d, saves[3]);
81+
82+
vst1q_u32(output.as_mut_ptr().add(pos).cast(), a);
83+
vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b);
84+
vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c);
85+
vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d);
86+
}
87+
}
88+
}

scrypt/src/block_mix/pivot.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/// Permute Salsa20 block to column major order
2+
pub(crate) const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
3+
4+
/// Inverse of PIVOT_ABCD
5+
pub(crate) const INVERSE_PIVOT_ABCD: [usize; 16] = const {
6+
let mut index = [0; 16];
7+
let mut i = 0;
8+
while i < 16 {
9+
let mut inverse = 0;
10+
while inverse < 16 {
11+
if PIVOT_ABCD[inverse] == i {
12+
index[i] = inverse;
13+
break;
14+
}
15+
inverse += 1;
16+
}
17+
i += 1;
18+
}
19+
index
20+
};

scrypt/src/block_mix/simd128.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
2+
3+
pub(crate) fn shuffle_in(b: &mut [u8]) {
4+
for chunk in b.chunks_exact_mut(64) {
5+
let mut t = [0u32; 16];
6+
for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
7+
*b = u32::from_ne_bytes(c.try_into().unwrap());
8+
}
9+
chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
10+
b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
11+
});
12+
}
13+
}
14+
15+
pub(crate) fn shuffle_out(b: &mut [u8]) {
16+
for chunk in b.chunks_exact_mut(64) {
17+
let mut t = [0u32; 16];
18+
for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
19+
*b = u32::from_ne_bytes(c.try_into().unwrap());
20+
}
21+
chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
22+
b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
23+
});
24+
}
25+
}
26+
27+
pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
28+
use core::arch::wasm32::*;
29+
30+
macro_rules! u32x4_rol {
31+
($w:expr, $amt:literal) => {{
32+
let w = $w;
33+
v128_or(u32x4_shl(w, $amt), u32x4_shr(w, 32 - $amt))
34+
}};
35+
}
36+
37+
let last_block = &input[input.len() - 64..];
38+
39+
let mut a = unsafe { v128_load(last_block.as_ptr().cast()) };
40+
let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) };
41+
let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) };
42+
let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) };
43+
44+
for (i, chunk) in input.chunks(64).enumerate() {
45+
let pos = if i % 2 == 0 {
46+
(i / 2) * 64
47+
} else {
48+
(i / 2) * 64 + input.len() / 2
49+
};
50+
51+
unsafe {
52+
let chunk_a = v128_load(chunk.as_ptr().cast());
53+
let chunk_b = v128_load(chunk.as_ptr().add(16).cast());
54+
let chunk_c = v128_load(chunk.as_ptr().add(32).cast());
55+
let chunk_d = v128_load(chunk.as_ptr().add(48).cast());
56+
57+
a = v128_xor(a, chunk_a);
58+
b = v128_xor(b, chunk_b);
59+
c = v128_xor(c, chunk_c);
60+
d = v128_xor(d, chunk_d);
61+
62+
let saves = [a, b, c, d];
63+
64+
for _ in 0..8 {
65+
b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
66+
c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
67+
d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
68+
a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));
69+
70+
d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
71+
c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
72+
b = i32x4_shuffle::<3, 0, 1, 2>(b, b);
73+
74+
(b, d) = (d, b);
75+
}
76+
77+
a = u32x4_add(a, saves[0]);
78+
b = u32x4_add(b, saves[1]);
79+
c = u32x4_add(c, saves[2]);
80+
d = u32x4_add(d, saves[3]);
81+
82+
v128_store(output.as_mut_ptr().add(pos).cast(), a);
83+
v128_store(output.as_mut_ptr().add(pos + 16).cast(), b);
84+
v128_store(output.as_mut_ptr().add(pos + 32).cast(), c);
85+
v128_store(output.as_mut_ptr().add(pos + 48).cast(), d);
86+
}
87+
}
88+
}

scrypt/src/block_mix/soft.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/// Execute the BlockMix operation
2+
/// input - the input vector. The length must be a multiple of 128.
3+
/// output - the output vector. Must be the same length as input.
4+
pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
5+
use salsa20::{
6+
SalsaCore,
7+
cipher::{StreamCipherCore, typenum::U4},
8+
};
9+
10+
type Salsa20_8 = SalsaCore<U4>;
11+
12+
let mut x = [0u8; 64];
13+
x.copy_from_slice(&input[input.len() - 64..]);
14+
15+
let mut t = [0u8; 64];
16+
17+
for (i, chunk) in input.chunks(64).enumerate() {
18+
xor(&x, chunk, &mut t);
19+
20+
let mut t2 = [0u32; 16];
21+
22+
for (c, b) in t.chunks_exact(4).zip(t2.iter_mut()) {
23+
*b = u32::from_le_bytes(c.try_into().unwrap());
24+
}
25+
26+
Salsa20_8::from_raw_state(t2).write_keystream_block((&mut x).into());
27+
28+
let pos = if i % 2 == 0 {
29+
(i / 2) * 64
30+
} else {
31+
(i / 2) * 64 + input.len() / 2
32+
};
33+
34+
output[pos..pos + 64].copy_from_slice(&x);
35+
}
36+
}
37+
38+
fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
39+
for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
40+
*out = x_i ^ y_i;
41+
}
42+
}

0 commit comments

Comments
 (0)