Skip to content

Commit c05a6d4

Browse files
neon: use multiple-register load/store
Signed-off-by: eternal-flame-AD <[email protected]>
1 parent 94bef2f commit c05a6d4

File tree

1 file changed

+15 -21 lines changed

1 file changed

+15 -21 lines changed

scrypt/src/block_mix/neon.rs

Lines changed: 15 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,7 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
3636

3737
let last_block = &input[input.len() - 64..];
3838

39-
let mut a = unsafe { vld1q_u32(last_block.as_ptr().cast()) };
40-
let mut b = unsafe { vld1q_u32(last_block.as_ptr().add(16).cast()) };
41-
let mut c = unsafe { vld1q_u32(last_block.as_ptr().add(32).cast()) };
42-
let mut d = unsafe { vld1q_u32(last_block.as_ptr().add(48).cast()) };
39+
let mut x = unsafe { vld1q_u32_x4(last_block.as_ptr().cast()) };
4340

4441
for (i, chunk) in input.chunks(64).enumerate() {
4542
let pos = if i % 2 == 0 {
@@ -49,17 +46,17 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
4946
};
5047

5148
unsafe {
52-
let chunk_a = vld1q_u32(chunk.as_ptr().cast());
53-
let chunk_b = vld1q_u32(chunk.as_ptr().add(16).cast());
54-
let chunk_c = vld1q_u32(chunk.as_ptr().add(32).cast());
55-
let chunk_d = vld1q_u32(chunk.as_ptr().add(48).cast());
49+
let chunk = vld1q_u32_x4(chunk.as_ptr().cast());
5650

57-
a = veorq_u32(a, chunk_a);
58-
b = veorq_u32(b, chunk_b);
59-
c = veorq_u32(c, chunk_c);
60-
d = veorq_u32(d, chunk_d);
51+
x.0 = veorq_u32(x.0, chunk.0);
52+
x.1 = veorq_u32(x.1, chunk.1);
53+
x.2 = veorq_u32(x.2, chunk.2);
54+
x.3 = veorq_u32(x.3, chunk.3);
6155

62-
let saves = [a, b, c, d];
56+
let mut a = x.0;
57+
let mut b = x.1;
58+
let mut c = x.2;
59+
let mut d = x.3;
6360

6461
for _ in 0..8 {
6562
b = veorq_u32(b, vrol_u32!(vaddq_u32(a, d), 7));
@@ -74,15 +71,12 @@ pub(crate) fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
7471
(b, d) = (d, b);
7572
}
7673

77-
a = vaddq_u32(a, saves[0]);
78-
b = vaddq_u32(b, saves[1]);
79-
c = vaddq_u32(c, saves[2]);
80-
d = vaddq_u32(d, saves[3]);
74+
x.0 = vaddq_u32(x.0, a);
75+
x.1 = vaddq_u32(x.1, b);
76+
x.2 = vaddq_u32(x.2, c);
77+
x.3 = vaddq_u32(x.3, d);
8178

82-
vst1q_u32(output.as_mut_ptr().add(pos).cast(), a);
83-
vst1q_u32(output.as_mut_ptr().add(pos + 16).cast(), b);
84-
vst1q_u32(output.as_mut_ptr().add(pos + 32).cast(), c);
85-
vst1q_u32(output.as_mut_ptr().add(pos + 48).cast(), d);
79+
vst1q_u32_x4(output.as_mut_ptr().add(pos).cast(), x);
8680
}
8781
}
8882
}

0 commit comments

Comments (0)