|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
| 2 | + |
| 3 | +#include <linux/linkage.h> |
| 4 | +#include <asm/cache.h> |
| 5 | +#include <asm/assembler.h> |
| 6 | + |
| 7 | + .text |
| 8 | + |
| 9 | +#define state0 v0 |
| 10 | +#define state1 v1 |
| 11 | +#define state2 v2 |
| 12 | +#define state3 v3 |
| 13 | +#define copy0 v4 |
| 14 | +#define copy0_q q4 |
| 15 | +#define copy1 v5 |
| 16 | +#define copy2 v6 |
| 17 | +#define copy3 v7 |
| 18 | +#define copy3_d d7 |
| 19 | +#define one_d d16 |
| 20 | +#define one_q q16 |
| 21 | +#define one_v v16 |
| 22 | +#define tmp v17 |
| 23 | +#define rot8 v18 |
| 24 | + |
| 25 | +/* |
| 26 | + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive |
| 27 | + * number of blocks of output with nonce 0, taking an input key and 8-byte |
| 28 | + * counter. Importantly does not spill to the stack. |
| 29 | + * |
| 30 | + * This implementation avoids d8-d15 because they are callee-save in user |
| 31 | + * space. |
| 32 | + * |
| 33 | + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, |
| 34 | + * const uint8_t *key, |
| 35 | + * uint32_t *counter, |
| 36 | + * size_t nblocks) |
| 37 | + * |
| 38 | + * x0: output bytes |
| 39 | + * x1: 32-byte key input |
| 40 | + * x2: 8-byte counter input/output |
| 41 | + * x3: number of 64-byte blocks to write to output |
| 42 | + */ |
| 43 | +SYM_FUNC_START(__arch_chacha20_blocks_nostack) |
| 44 | + |
| 45 | + /* copy0 = "expand 32-byte k" (the ChaCha20 constant words) */ |
| 46 | + mov_q x8, 0x3320646e61707865 |
| 47 | + mov_q x9, 0x6b20657479622d32 |
| 48 | + mov copy0.d[0], x8 |
| 49 | + mov copy0.d[1], x9 |
| 50 | + |
| 51 | + /* copy1,copy2 = key */ |
| 52 | + ld1 { copy1.4s, copy2.4s }, [x1] |
| 53 | + /* copy3 = counter || zero nonce (64-bit load zeroes the upper half) */ |
| 54 | + ld1 { copy3.2s }, [x2] |
| 55 | + |
| 56 | + movi one_v.2s, #1 |
| 57 | + uzp1 one_v.4s, one_v.4s, one_v.4s /* one = { 1, 0, 1, 0 }, so one_d = 1 as a 64-bit scalar */ |
| 58 | + |
| 59 | +.Lblock: |
| 60 | + /* state = copy; the copies are kept intact for the final add after the permute. */ |
| 61 | + mov state0.16b, copy0.16b |
| 62 | + mov state1.16b, copy1.16b |
| 63 | + mov state2.16b, copy2.16b |
| 64 | + mov state3.16b, copy3.16b |
| 65 | + |
| 66 | + mov w4, 20 /* 20 rounds = 10 doubleround iterations */ |
| 67 | +.Lpermute: |
| 68 | + /* |
| 69 | + * Permute one 64-byte block where the state matrix is stored in the four NEON |
| 70 | + * registers state0-state3. It performs matrix operations on four words in parallel, |
| 71 | + * but requires shuffling to rearrange the words after each round. |
| 72 | + */ |
| 73 | + |
| 74 | +.Ldoubleround: |
| 75 | + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ |
| 76 | + add state0.4s, state0.4s, state1.4s |
| 77 | + eor state3.16b, state3.16b, state0.16b |
| 78 | + rev32 state3.8h, state3.8h |
| 79 | + |
| 80 | + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ |
| 81 | + add state2.4s, state2.4s, state3.4s |
| 82 | + eor tmp.16b, state1.16b, state2.16b |
| 83 | + shl state1.4s, tmp.4s, #12 |
| 84 | + sri state1.4s, tmp.4s, #20 |
| 85 | + |
| 86 | + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ |
| 87 | + add state0.4s, state0.4s, state1.4s |
| 88 | + eor tmp.16b, state3.16b, state0.16b |
| 89 | + shl state3.4s, tmp.4s, #8 |
| 90 | + sri state3.4s, tmp.4s, #24 |
| 91 | + |
| 92 | + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ |
| 93 | + add state2.4s, state2.4s, state3.4s |
| 94 | + eor tmp.16b, state1.16b, state2.16b |
| 95 | + shl state1.4s, tmp.4s, #7 |
| 96 | + sri state1.4s, tmp.4s, #25 |
| 97 | + |
| 98 | + /* state1[0,1,2,3] = state1[1,2,3,0] */ |
| 99 | + ext state1.16b, state1.16b, state1.16b, #4 |
| 100 | + /* state2[0,1,2,3] = state2[2,3,0,1] */ |
| 101 | + ext state2.16b, state2.16b, state2.16b, #8 |
| 102 | + /* state3[0,1,2,3] = state3[1,2,3,0] */ |
| 103 | + ext state3.16b, state3.16b, state3.16b, #12 |
| 104 | + |
| 105 | + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ |
| 106 | + add state0.4s, state0.4s, state1.4s |
| 107 | + eor state3.16b, state3.16b, state0.16b |
| 108 | + rev32 state3.8h, state3.8h |
| 109 | + |
| 110 | + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ |
| 111 | + add state2.4s, state2.4s, state3.4s |
| 112 | + eor tmp.16b, state1.16b, state2.16b |
| 113 | + shl state1.4s, tmp.4s, #12 |
| 114 | + sri state1.4s, tmp.4s, #20 |
| 115 | + |
| 116 | + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ |
| 117 | + add state0.4s, state0.4s, state1.4s |
| 118 | + eor tmp.16b, state3.16b, state0.16b |
| 119 | + shl state3.4s, tmp.4s, #8 |
| 120 | + sri state3.4s, tmp.4s, #24 |
| 121 | + |
| 122 | + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ |
| 123 | + add state2.4s, state2.4s, state3.4s |
| 124 | + eor tmp.16b, state1.16b, state2.16b |
| 125 | + shl state1.4s, tmp.4s, #7 |
| 126 | + sri state1.4s, tmp.4s, #25 |
| 127 | + |
| 128 | + /* state1[0,1,2,3] = state1[3,0,1,2] */ |
| 129 | + ext state1.16b, state1.16b, state1.16b, #12 |
| 130 | + /* state2[0,1,2,3] = state2[2,3,0,1] */ |
| 131 | + ext state2.16b, state2.16b, state2.16b, #8 |
| 132 | + /* state3[0,1,2,3] = state3[1,2,3,0] */ |
| 133 | + ext state3.16b, state3.16b, state3.16b, #4 |
| 134 | + |
| 135 | + subs w4, w4, #2 |
| 136 | + b.ne .Ldoubleround |
| 137 | + |
| 138 | + /* output0 = state0 + copy0 */ |
| 139 | + add state0.4s, state0.4s, copy0.4s |
| 140 | + /* output1 = state1 + copy1 */ |
| 141 | + add state1.4s, state1.4s, copy1.4s |
| 142 | + /* output2 = state2 + copy2 */ |
| 143 | + add state2.4s, state2.4s, copy2.4s |
| 144 | + /* output3 = state3 + copy3 */ |
| 145 | + add state3.4s, state3.4s, copy3.4s |
| 146 | + st1 { state0.16b - state3.16b }, [x0] |
| 147 | + |
| 148 | + /* |
| 149 | + * ++copy3.counter, the 'add' clears the upper half of the SIMD register |
| 150 | + * which is the expected behaviour here. |
| 151 | + */ |
| 152 | + add copy3_d, copy3_d, one_d |
| 153 | + |
| 154 | + /* output += 64, --nblocks */ |
| 155 | + add x0, x0, 64 |
| 156 | + subs x3, x3, #1 |
| 157 | + b.ne .Lblock |
| 158 | + |
| 159 | + /* counter = copy3.counter */ |
| 160 | + st1 { copy3.2s }, [x2] |
| 161 | + |
| 162 | + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ |
| 163 | + movi state0.16b, #0 |
| 164 | + movi state1.16b, #0 |
| 165 | + movi state2.16b, #0 |
| 166 | + movi state3.16b, #0 |
| 167 | + movi copy1.16b, #0 |
| 168 | + movi copy2.16b, #0 |
| 169 | + ret |
| 170 | +SYM_FUNC_END(__arch_chacha20_blocks_nostack) |
| 171 | + |
| 172 | +emit_aarch64_feature_1_and |
0 commit comments