|
10 | 10 |
|
11 | 11 | extern uint32x4x2_t to_bit_interleaving_4x(uint32x4_t, uint32x4_t); |
12 | 12 | extern uint32x4x2_t from_bit_interleaving_4x(uint32x4_t, uint32x4_t); |
13 | | -uint32x4x2_t KeccakF1600x4_LoadBytesInLane(uint32x4_t data_ptrs, uint32_t length, uint32_t offset); |
14 | | -uint32x4_t KeccakF1600x4_StateXORBytes_aligned(uint32_t nvecs, uint8_t* state, uint32x4_t data_ptrs); |
| 13 | +uint32x4x2_t KeccakF1600x4_LoadBytesInLane(uint32x4_t data_ptrs, |
| 14 | + uint32_t length, uint32_t offset); |
| 15 | +uint32x4_t KeccakF1600x4_StateXORBytes_aligned(uint32_t nvecs, uint8_t *state, |
| 16 | + uint32x4_t data_ptrs); |
15 | 17 |
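These helpers convert between natural byte order and the bit-interleaved representation the rest of the file works on: each 64-bit Keccak lane is held as two 32-bit words, even-numbered bits in the first 400 bytes of the state buffer, odd-numbered bits in the 400 bytes after it. Since the four parallel states' even words for a given lane sit next to each other, a lane at byte offset o within one logical state lives at byte offset 2*o of the even region, which is what the `lane_offset / 2 * 4` (i.e. `lane_offset * 2`) addressing below computes. A scalar reference of the bit split, as an illustrative sketch only (the helper name is an assumption, not part of this commit):

#include <stdint.h>

/* Sketch of the even/odd bit split performed four lanes at a time by
 * to_bit_interleaving_4x, assuming the usual Keccak bit-interleaving
 * convention; from_bit_interleaving_4x is its inverse. */
static void bit_interleave_ref(uint64_t lane, uint32_t *even, uint32_t *odd)
{
    uint32_t e = 0, o = 0;
    for (unsigned i = 0; i < 32; i++)
    {
        e |= (uint32_t)((lane >> (2 * i)) & 1) << i;     /* bits 0,2,4,... */
        o |= (uint32_t)((lane >> (2 * i + 1)) & 1) << i; /* bits 1,3,5,... */
    }
    *even = e;
    *odd = o;
}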
|
16 | | -void KeccakF1600x4_StateXORBytes(void* state, |
17 | | - const uint8_t *data0, const uint8_t *data1, |
18 | | - const uint8_t *data2, const uint8_t *data3, |
19 | | - uint32_t offset, uint32_t length ) |
| 18 | +void KeccakF1600x4_StateXORBytes(void *state, const uint8_t *data0, |
| 19 | + const uint8_t *data1, const uint8_t *data2, |
| 20 | + const uint8_t *data3, uint32_t offset, |
| 21 | + uint32_t length) |
20 | 22 | { |
21 | | - uintptr_t offset_in_lane = offset & 7; |
22 | | - uintptr_t lane_offset = offset & ~7; |
23 | | - uint32x4_t data_ptrs; |
24 | | - __asm__ volatile ( |
25 | | - "vmov %q[o][2], %q[o][0], %[i0], %[i2]\n" |
26 | | - "vmov %q[o][3], %q[o][1], %[i1], %[i3]\n" |
27 | | - : [o] "=&w" (data_ptrs) |
28 | | - : [i0] "r" (data0), [i1] "r" (data1), [i2] "r" (data2), [i3] "r" (data3) |
29 | | - :); |
30 | | - if (offset_in_lane) { |
31 | | - uint32x4x2_t l; |
32 | | - size_t nBytes = length < 8-offset_in_lane ? length : 8-offset_in_lane; |
| 23 | + uintptr_t offset_in_lane = offset & 7; |
| 24 | + uintptr_t lane_offset = offset & ~7; |
| 25 | + uint32x4_t data_ptrs; |
| 26 | + __asm__ volatile( |
| 27 | + "vmov %q[o][2], %q[o][0], %[i0], %[i2]\n" |
| 28 | + "vmov %q[o][3], %q[o][1], %[i1], %[i3]\n" |
| 29 | + : [o] "=&w"(data_ptrs) |
| 30 | + : [i0] "r"(data0), [i1] "r"(data1), [i2] "r"(data2), [i3] "r"(data3) |
| 31 | + :); |
| 32 | + if (offset_in_lane) |
| 33 | + { |
| 34 | + uint32x4x2_t l; |
| 35 | + size_t nBytes = length < 8 - offset_in_lane ? length : 8 - offset_in_lane; |
33 | 36 |
|
34 | | - l = KeccakF1600x4_LoadBytesInLane(data_ptrs, nBytes, offset_in_lane); |
| 37 | + l = KeccakF1600x4_LoadBytesInLane(data_ptrs, nBytes, offset_in_lane); |
35 | 38 |
|
36 | | - // Now convert to bit interleaving |
37 | | - uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]); |
38 | | - uint32x4_t s0 = vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4)); |
39 | | - uint32x4_t s1 = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4)); |
40 | | - s0 = veorq_u32(s0, bint.val[0]); |
41 | | - s1 = veorq_u32(s1, bint.val[1]); |
42 | | - vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4), s0); |
43 | | - vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4), s1); |
44 | | - length -= nBytes; |
45 | | - lane_offset += 8; |
46 | | - data_ptrs = vaddq_n_u32(data_ptrs, nBytes); |
47 | | - } |
48 | | - if(length >= 8) { |
49 | | - uint8_t *sp0 = (uint8_t *)((uintptr_t)state + lane_offset/2 * 4 - 16); |
50 | | - uint32_t bytes_left_in_frame = 25*8 - lane_offset; |
51 | | - uint32_t nlanes = (bytes_left_in_frame < length ? bytes_left_in_frame : length)/8; |
52 | | - |
53 | | - data_ptrs = KeccakF1600x4_StateXORBytes_aligned(nlanes, sp0, data_ptrs); |
54 | | - length -= nlanes * 8; |
55 | | - lane_offset += nlanes * 8; |
56 | | - } |
57 | | - if (length) { |
58 | | - uint32x4x2_t l; |
59 | | - l = KeccakF1600x4_LoadBytesInLane(data_ptrs, length, 0); |
| 39 | + // Now convert to bit interleaving |
| 40 | + uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]); |
| 41 | + uint32x4_t s0 = |
| 42 | + vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4)); |
| 43 | + uint32x4_t s1 = |
| 44 | + vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4)); |
| 45 | + s0 = veorq_u32(s0, bint.val[0]); |
| 46 | + s1 = veorq_u32(s1, bint.val[1]); |
| 47 | + vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4), s0); |
| 48 | + vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4), s1); |
| 49 | + length -= nBytes; |
| 50 | + lane_offset += 8; |
| 51 | + data_ptrs = vaddq_n_u32(data_ptrs, nBytes); |
| 52 | + } |
| 53 | + if (length >= 8) |
| 54 | + { |
| 55 | + uint8_t *sp0 = (uint8_t *)((uintptr_t)state + lane_offset / 2 * 4 - 16); |
| 56 | + uint32_t bytes_left_in_frame = 25 * 8 - lane_offset; |
| 57 | + uint32_t nlanes = |
| 58 | + (bytes_left_in_frame < length ? bytes_left_in_frame : length) / 8; |
60 | 59 |
|
61 | | - uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]); |
62 | | - uint32x4_t s0 = vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4)); |
63 | | - uint32x4_t s1 = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4)); |
64 | | - s0 = veorq_u32(s0, bint.val[0]); |
65 | | - s1 = veorq_u32(s1, bint.val[1]); |
66 | | - vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4), s0); |
67 | | - vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4), s1); |
68 | | - } |
| 60 | + data_ptrs = KeccakF1600x4_StateXORBytes_aligned(nlanes, sp0, data_ptrs); |
| 61 | + length -= nlanes * 8; |
| 62 | + lane_offset += nlanes * 8; |
| 63 | + } |
| 64 | + if (length) |
| 65 | + { |
| 66 | + uint32x4x2_t l; |
| 67 | + l = KeccakF1600x4_LoadBytesInLane(data_ptrs, length, 0); |
| 68 | + |
| 69 | + uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]); |
| 70 | + uint32x4_t s0 = |
| 71 | + vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4)); |
| 72 | + uint32x4_t s1 = |
| 73 | + vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4)); |
| 74 | + s0 = veorq_u32(s0, bint.val[0]); |
| 75 | + s1 = veorq_u32(s1, bint.val[1]); |
| 76 | + vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4), s0); |
| 77 | + vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4), s1); |
| 78 | + } |
69 | 79 | } |
70 | 80 |
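The two functions build the same four-pointer vector differently: the XOR path above packs it with two vmov instructions in inline assembly, while the extract path below uses vcreateq_u32. For context, a minimal usage sketch of the XOR path (not part of this commit): absorb one SHAKE128-rate block into all four states, then permute. The 800-byte buffer (400 bytes of even words followed by 400 bytes of odd words) matches the addressing above; `KeccakF1600x4_StatePermute` is an assumed name for the permutation elsewhere in the codebase.

#include <stdint.h>

extern void KeccakF1600x4_StatePermute(void *state); /* assumed API */

void absorb_block_4x(const uint8_t *in0, const uint8_t *in1,
                     const uint8_t *in2, const uint8_t *in3)
{
    /* zero-initialized; even words at offset 0, odd words at offset 400 */
    static uint8_t state[800] __attribute__((aligned(16)));
    KeccakF1600x4_StateXORBytes(state, in0, in1, in2, in3, 0, 168);
    KeccakF1600x4_StatePermute(state);
}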
|
71 | | -static inline |
72 | | -uint32_t extract_bytes_in_lane(void *state, unsigned char *data0, |
73 | | - unsigned char *data1, unsigned char *data2, |
74 | | - unsigned char *data3, unsigned offset, |
75 | | - unsigned length ) { |
76 | | - // For load, need full-lane offset |
77 | | - uint32_t lane_offset = offset & ~7; |
78 | | - // Load the first vector. |
79 | | - uint32x4_t evens = vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset*2)); |
80 | | - uint32x4_t odds = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset*2)); |
81 | | - // Deinterleave |
82 | | - uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds); |
83 | | - // Transpose the two vectors into four half-vectors |
84 | | - uint32x4_t out[4]; |
85 | | - for (size_t i = 0; i < 4; i++) |
86 | | - { |
87 | | - uint32_t l = vgetq_lane_u32(dint.val[0], i); |
88 | | - uint32_t h = vgetq_lane_u32(dint.val[1], i); |
89 | | - out[i] = vcreateq_u32(l | ((uint64_t) h << 32), 0); |
90 | | - } |
91 | | - // Use predication to write the partial vector |
92 | | - // Make the predicate |
93 | | - uint32_t offset_in_lane = offset & 7; |
94 | | - uint32_t nBytes = 8-offset_in_lane < length? 8-offset_in_lane : length; |
95 | | - mve_pred16_t wr_pred = ((1 << nBytes)-1) << offset_in_lane; |
96 | | - uint8x16_t ov = vidupq_n_u8(0, 1); |
97 | | - vstrbq_scatter_offset_p_u8(data0 - offset_in_lane, ov, (uint8x16_t)out[0], wr_pred); |
98 | | - vstrbq_scatter_offset_p_u8(data1 - offset_in_lane, ov, (uint8x16_t)out[1], wr_pred); |
99 | | - vstrbq_scatter_offset_p_u8(data2 - offset_in_lane, ov, (uint8x16_t)out[2], wr_pred); |
100 | | - vstrbq_scatter_offset_p_u8(data3 - offset_in_lane, ov, (uint8x16_t)out[3], wr_pred); |
101 | | - return nBytes; |
| 81 | +static inline uint32_t extract_bytes_in_lane(void *state, unsigned char *data0, |
| 82 | + unsigned char *data1, |
| 83 | + unsigned char *data2, |
| 84 | + unsigned char *data3, |
| 85 | + unsigned offset, unsigned length) |
| 86 | +{ |
| 87 | + // For load, need full-lane offset |
| 88 | + uint32_t lane_offset = offset & ~7; |
| 89 | + // Load the first vector. |
| 90 | + uint32x4_t evens = |
| 91 | + vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset * 2)); |
| 92 | + uint32x4_t odds = |
| 93 | + vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset * 2)); |
| 94 | + // Deinterleave |
| 95 | + uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds); |
| 96 | + // Transpose the two vectors into four half-vectors |
| 97 | + uint32x4_t out[4]; |
| 98 | + for (size_t i = 0; i < 4; i++) |
| 99 | + { |
| 100 | + uint32_t l = vgetq_lane_u32(dint.val[0], i); |
| 101 | + uint32_t h = vgetq_lane_u32(dint.val[1], i); |
| 102 | + out[i] = vcreateq_u32(l | ((uint64_t)h << 32), 0); |
| 103 | + } |
| 104 | + // Use predication to write the partial vector |
| 105 | + // Make the predicate |
| 106 | + uint32_t offset_in_lane = offset & 7; |
| 107 | + uint32_t nBytes = 8 - offset_in_lane < length ? 8 - offset_in_lane : length; |
| 108 | + mve_pred16_t wr_pred = ((1 << nBytes) - 1) << offset_in_lane; |
| 109 | + uint8x16_t ov = vidupq_n_u8(0, 1); |
| 110 | + vstrbq_scatter_offset_p_u8(data0 - offset_in_lane, ov, (uint8x16_t)out[0], |
| 111 | + wr_pred); |
| 112 | + vstrbq_scatter_offset_p_u8(data1 - offset_in_lane, ov, (uint8x16_t)out[1], |
| 113 | + wr_pred); |
| 114 | + vstrbq_scatter_offset_p_u8(data2 - offset_in_lane, ov, (uint8x16_t)out[2], |
| 115 | + wr_pred); |
| 116 | + vstrbq_scatter_offset_p_u8(data3 - offset_in_lane, ov, (uint8x16_t)out[3], |
| 117 | + wr_pred); |
| 118 | + return nBytes; |
102 | 119 | } |
103 | 120 |
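The partial-lane write relies on MVE byte predication: wr_pred carries one bit per byte lane of the Q register, so `((1 << nBytes) - 1) << offset_in_lane` enables exactly the register bytes holding the requested span, and the `data - offset_in_lane` base together with the identity offsets from `vidupq_n_u8(0, 1)` lands them at `data[0] .. data[nBytes - 1]` without touching neighbouring output bytes. A worked value, purely illustrative:

#include <arm_mve.h>

/* With offset_in_lane = 3 and nBytes = 2, register bytes 3 and 4 are
 * enabled, and the scatter store writes them to (data - 3) + 3 = data[0]
 * and (data - 3) + 4 = data[1]. */
static mve_pred16_t example_pred(void)
{
    return ((1u << 2) - 1u) << 3; /* == 0x18 */
}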
|
104 | 121 | void KeccakF1600x4_StateExtract_bytes(void *state, unsigned char *data0, |
105 | | - unsigned char *data1, unsigned char *data2, |
106 | | - unsigned char *data3, unsigned offset, |
107 | | - unsigned length) |
| 122 | + unsigned char *data1, |
| 123 | + unsigned char *data2, |
| 124 | + unsigned char *data3, unsigned offset, |
| 125 | + unsigned length) |
108 | 126 | { |
109 | | - // Make a data pointer vector |
110 | | - uint32x4_t data_addrs = vcreateq_u32( |
111 | | - (uint64_t)(uintptr_t)data0 | ((uint64_t)(uintptr_t)data1 << 32), |
112 | | - (uint64_t)(uintptr_t)data2 | ((uint64_t)(uintptr_t)data3 << 32) |
113 | | - ); |
114 | | - // Only load full 64-bit values from state |
115 | | - if (offset & 7){ |
116 | | - |
117 | | - uint32_t nBytes = extract_bytes_in_lane(state, data0, data1, data2, data3, offset, length); |
118 | | - data_addrs = vaddq_n_u32(data_addrs, nBytes); |
119 | | - length -= nBytes; |
120 | | - offset += nBytes; |
121 | | - } |
122 | | - // For each full vector |
123 | | - if (length >=8 ) { |
124 | | - data_addrs = vsubq_n_u32(data_addrs,4); |
125 | | - for (; length >= 8; length -=8) { |
126 | | - // Load the vector & increment read pointer |
127 | | - uint32x4_t evens = vldrwq_u32((uint32_t *)((uintptr_t)state + offset * 2)); |
128 | | - uint32x4_t odds = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + offset * 2)); |
129 | | - offset += 8; |
130 | | - // Deinterleave |
131 | | - uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds); |
132 | | - // Write out & increment the write pointer |
133 | | - __asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!" : [a]"+w"(data_addrs) : [d]"w"(dint.val[0]) : "memory"); |
134 | | - __asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!" : [a]"+w"(data_addrs) : [d]"w"(dint.val[1]) : "memory"); |
135 | | - } |
136 | | - data_addrs = vaddq_n_u32(data_addrs, 4); |
137 | | - } |
138 | | - if (length) { |
139 | | - data0 = (uint8_t *) vgetq_lane_u32(data_addrs, 0); |
140 | | - data1 = (uint8_t *) vgetq_lane_u32(data_addrs, 1); |
141 | | - data2 = (uint8_t *) vgetq_lane_u32(data_addrs, 2); |
142 | | - data3 = (uint8_t *) vgetq_lane_u32(data_addrs, 3); |
143 | | - // printf("Remaining length: %u; current offset: %u\r\n", length, offset); |
144 | | - extract_bytes_in_lane(state, data0, data1, data2, data3, offset, length); |
| 127 | + // Make a data pointer vector |
| 128 | + uint32x4_t data_addrs = vcreateq_u32( |
| 129 | + (uint64_t)(uintptr_t)data0 | ((uint64_t)(uintptr_t)data1 << 32), |
| 130 | + (uint64_t)(uintptr_t)data2 | ((uint64_t)(uintptr_t)data3 << 32)); |
| 131 | + // Only load full 64-bit values from state |
| 132 | + if (offset & 7) |
| 133 | + { |
| 134 | + uint32_t nBytes = extract_bytes_in_lane(state, data0, data1, data2, data3, |
| 135 | + offset, length); |
| 136 | + data_addrs = vaddq_n_u32(data_addrs, nBytes); |
| 137 | + length -= nBytes; |
| 138 | + offset += nBytes; |
| 139 | + } |
| 140 | + // For each full vector |
| 141 | + if (length >= 8) |
| 142 | + { |
| 143 | + data_addrs = vsubq_n_u32(data_addrs, 4); |
| 144 | + for (; length >= 8; length -= 8) |
| 145 | + { |
| 146 | + // Load the vector & increment read pointer |
| 147 | + uint32x4_t evens = |
| 148 | + vldrwq_u32((uint32_t *)((uintptr_t)state + offset * 2)); |
| 149 | + uint32x4_t odds = |
| 150 | + vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + offset * 2)); |
| 151 | + offset += 8; |
| 152 | + // Deinterleave |
| 153 | + uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds); |
| 154 | + // Write out & increment the write pointer |
| 155 | + __asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!" |
| 156 | + : [a] "+w"(data_addrs) |
| 157 | + : [d] "w"(dint.val[0]) |
| 158 | + : "memory"); |
| 159 | + __asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!" |
| 160 | + : [a] "+w"(data_addrs) |
| 161 | + : [d] "w"(dint.val[1]) |
| 162 | + : "memory"); |
145 | 163 | } |
| 164 | + data_addrs = vaddq_n_u32(data_addrs, 4); |
| 165 | + } |
| 166 | + if (length) |
| 167 | + { |
| 168 | + data0 = (uint8_t *)vgetq_lane_u32(data_addrs, 0); |
| 169 | + data1 = (uint8_t *)vgetq_lane_u32(data_addrs, 1); |
| 170 | + data2 = (uint8_t *)vgetq_lane_u32(data_addrs, 2); |
| 171 | + data3 = (uint8_t *)vgetq_lane_u32(data_addrs, 3); |
| 172 | + // printf("Remaining length: %u; current offset: %u\r\n", length, offset); |
| 173 | + extract_bytes_in_lane(state, data0, data1, data2, data3, offset, length); |
| 174 | + } |
146 | 175 | } |
147 | | - |
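The bulk copy keeps all four output pointers in the lanes of data_addrs and uses inline assembly for the pre-incrementing scatter store `vstrw.u32 Qd, [Qm, #4]!`, with the vsubq_n_u32 / vaddq_n_u32 pair compensating for the pre-increment before and after the loop. ACLE also exposes this addressing mode as an intrinsic; an equivalent formulation of the two stores, assuming the compiler provides `vstrwq_scatter_base_wb_u32` (a sketch, not what the commit does):

/* Each call stores one deinterleaved 32-bit word per output stream at
 * data_addrs[i] + 4 and writes the incremented addresses back, so the
 * pair advances every pointer by 8 bytes per iteration, like the asm. */
vstrwq_scatter_base_wb_u32(&data_addrs, 4, dint.val[0]);
vstrwq_scatter_base_wb_u32(&data_addrs, 4, dint.val[1]);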
|