RustCrypto
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎scrypt/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎scrypt/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎scrypt/src/block_mix/mod.rs‎
Lines changed: 68 additions & 0 deletions b/‎scrypt/src/block_mix/mod.rs‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎scrypt/src/block_mix/neon.rs‎
Lines changed: 88 additions & 0 deletions b/‎scrypt/src/block_mix/neon.rs‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎scrypt/src/block_mix/pivot.rs‎
Lines changed: 20 additions & 0 deletions b/‎scrypt/src/block_mix/pivot.rs‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎scrypt/src/block_mix/simd128.rs‎
Lines changed: 88 additions & 0 deletions b/‎scrypt/src/block_mix/simd128.rs‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎scrypt/src/block_mix/soft.rs‎
Lines changed: 42 additions & 0 deletions b/‎scrypt/src/block_mix/soft.rs‎
Lines changed: 42 additions & 0 deletions
@@ -14,6 +14,7 @@ edition = "2024"
 rust-version = "1.85"
 
 [dependencies]
+cfg-if = "1.0"
 pbkdf2 = { version = "0.13.0-rc.0", path = "../pbkdf2" }
 salsa20 = { version = "0.11.0-rc.0", default-features = false }
 sha2 = { version = "0.11.0-rc.0", default-features = false }
 
@@ -0,0 +1,68 @@
+#[cfg(any(
+    test,
+    not(any(
+        all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse2"
+        ),
+        all(target_arch = "aarch64", target_feature = "neon"),
+        all(target_arch = "wasm32", target_feature = "simd128"),
+    ))
+))]
+mod soft;
+
+cfg_if::cfg_if! {
+    if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+        mod pivot;
+        mod neon;
+        pub(crate) use neon::{scrypt_block_mix, shuffle_in, shuffle_out};
+    } else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
+        mod pivot;
+        mod simd128;
+        pub(crate) use simd128::{scrypt_block_mix, shuffle_in, shuffle_out};
+    } else if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] {
+        mod pivot;
+        mod sse2;
+        pub(crate) use sse2::{scrypt_block_mix, shuffle_in, shuffle_out};
+    } else {
+        pub(crate) use soft::scrypt_block_mix;
+
+        pub(crate) fn shuffle_in(_input: &mut [u8]) {}
+        pub(crate) fn shuffle_out(_input: &mut [u8]) {}
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_scrypt_block_mix_abcd_against_soft() {
+        let mut input: [u8; 128] = core::array::from_fn(|i| i as u8);
+        for _round in 0..10 {
+            let mut output = [0u8; 128];
+
+            let mut expected0 = [0u8; 128];
+            let mut expected1 = [0u8; 128]; // check shuffle_out is a correct inverse of shuffle_in
+            soft::scrypt_block_mix(&input, &mut expected0);
+            shuffle_in(&mut input);
+            scrypt_block_mix(&input, &mut output);
+            shuffle_out(&mut input);
+            soft::scrypt_block_mix(&input, &mut expected1);
+            shuffle_out(&mut output);
+            assert_eq!(
+                expected0, expected1,
+                "expected0 != expected1, shuffle_out is not a correct inverse of shuffle_in?"
+            );
+            assert_eq!(
+                output, expected0,
+                "output != expected0, scrypt_block_mix is not correct?"
+            );
+
+            input
+                .iter_mut()
+                .zip(output.iter())
+                .for_each(|(a, b)| *a = a.wrapping_add(*b));
+        }
+    }
+}
@@ -0,0 +1,88 @@
+use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
+
+pub(crate) fn shuffle_in(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn shuffle_out(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    use core::arch::aarch64::*;
+
+    macro_rules! vrol_u32 {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            vsraq_n_u32(vshlq_n_u32(w, $amt), w, 32 - $amt)
+        }};
+    }
+
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) };
+    let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            let chunk_a = vld1q_u32(chunk.as_ptr().cast());
+            let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast());
+            let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast());
+            let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast());
+
+            a = veorq_u32(a, chunk_a);
+            b = veorq_u32(b, chunk_b);
+            c = veorq_u32(c, chunk_c);
+            d = veorq_u32(d, chunk_d);
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7));
+                c = veorq_u32(c, vrol_u32!(vaddq_u32(b, a), 9));
+                d = veorq_u32(d, vrol_u32!(vaddq_u32(c, b), 13));
+                a = veorq_u32(a, vrol_u32!(vaddq_u32(d, c), 18));
+
+                d = vextq_u32(d, d, 1);
+                c = vextq_u32(c, c, 2);
+                b = vextq_u32(b, b, 3);
+
+                (b, d) = (d, b);
+            }
+
+            a = vaddq_u32(a, saves[0]);
+            b = vaddq_u32(b, saves[1]);
+            c = vaddq_u32(c, saves[2]);
+            d = vaddq_u32(d, saves[3]);
+
+            vst1q_u32(output.as_mut_ptr().add(pos).cast(), a);
+            vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b);
+            vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c);
+            vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
@@ -0,0 +1,20 @@
+/// Permute Salsa20 block to column major order
+pub(crate) const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
+
+/// Inverse of PIVOT_ABCD
+pub(crate) const INVERSE_PIVOT_ABCD: [usize; 16] = const {
+    let mut index = [0; 16];
+    let mut i = 0;
+    while i < 16 {
+        let mut inverse = 0;
+        while inverse < 16 {
+            if PIVOT_ABCD[inverse] == i {
+                index[i] = inverse;
+                break;
+            }
+            inverse += 1;
+        }
+        i += 1;
+    }
+    index
+};
@@ -0,0 +1,88 @@
+use crate::block_mix::pivot::{INVERSE_PIVOT_ABCD, PIVOT_ABCD};
+
+pub(crate) fn shuffle_in(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn shuffle_out(b: &mut [u8]) {
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+}
+
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    use core::arch::wasm32::*;
+
+    macro_rules! u32x4_rol {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            v128_or(u32x4_shl(w, $amt), u32x4_shr(w, 32 - $amt))
+        }};
+    }
+
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { v128_load(last_block.as_ptr().cast()) };
+    let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            let chunk_a = v128_load(chunk.as_ptr().cast());
+            let chunk_b = v128_load(chunk.as_ptr().add(16).cast());
+            let chunk_c = v128_load(chunk.as_ptr().add(32).cast());
+            let chunk_d = v128_load(chunk.as_ptr().add(48).cast());
+
+            a = v128_xor(a, chunk_a);
+            b = v128_xor(b, chunk_b);
+            c = v128_xor(c, chunk_c);
+            d = v128_xor(d, chunk_d);
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
+                c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
+                d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
+                a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));
+
+                d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
+                c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
+                b = i32x4_shuffle::<3, 0, 1, 2>(b, b);
+
+                (b, d) = (d, b);
+            }
+
+            a = u32x4_add(a, saves[0]);
+            b = u32x4_add(b, saves[1]);
+            c = u32x4_add(c, saves[2]);
+            d = u32x4_add(d, saves[3]);
+
+            v128_store(output.as_mut_ptr().add(pos).cast(), a);
+            v128_store(output.as_mut_ptr().add(pos + 16).cast(), b);
+            v128_store(output.as_mut_ptr().add(pos + 32).cast(), c);
+            v128_store(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
@@ -0,0 +1,42 @@
+/// Execute the BlockMix operation
+/// input - the input vector. The length must be a multiple of 128.
+/// output - the output vector. Must be the same length as input.
+pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    use salsa20::{
+        SalsaCore,
+        cipher::{StreamCipherCore, typenum::U4},
+    };
+
+    type Salsa20_8 = SalsaCore<U4>;
+
+    let mut x = [0u8; 64];
+    x.copy_from_slice(&input[input.len() - 64..]);
+
+    let mut t = [0u8; 64];
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        xor(&x, chunk, &mut t);
+
+        let mut t2 = [0u32; 16];
+
+        for (c, b) in t.chunks_exact(4).zip(t2.iter_mut()) {
+            *b = u32::from_le_bytes(c.try_into().unwrap());
+        }
+
+        Salsa20_8::from_raw_state(t2).write_keystream_block((&mut x).into());
+
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        output[pos..pos + 64].copy_from_slice(&x);
+    }
+}
+
+fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
+    for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
+        *out = x_i ^ y_i;
+    }
+}