|
| 1 | +/* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | +/* |
| 3 | + * Copyright (C) 2025 Xi Ruoyao <[email protected]>. All Rights Reserved. |
| 4 | + * |
| 5 | + * Based on arch/loongarch/vdso/vgetrandom-chacha.S. |
| 6 | + */ |
| 7 | + |
| 8 | +#include <asm/asm.h> |
| 9 | +#include <linux/linkage.h> |
| 10 | + |
| 11 | +.text |
| 12 | + |
/*
 * ROTRI dst, src, amount
 *
 * Rotate the low 32 bits of \src right by \amount bits and put the result
 * in \dst.  The rotate is assembled from a left shift by (32 - \amount),
 * a right shift by \amount, and an OR of the two halves.
 *
 * Clobbers t0, which holds the left-shifted half.  \dst may be the same
 * register as \src: \src is only overwritten by the second shift, after
 * the first shift has already captured it in t0.
 */
.macro	ROTRI	dst src amount
	slliw	t0, \src, 32 - \amount
	srliw	\dst, \src, \amount
	or	\dst, t0, \dst
.endm
| 18 | + |
/*
 * OP_4REG op, dst0..dst3, src0..src3
 *
 * Apply the three-operand form "\op dstN, dstN, srcN" to four
 * destination/source pairs in order, N = 0..3.  \op may be an instruction
 * mnemonic or another macro taking (dst, src1, src2) such as ROTRI, in
 * which case the "src" arguments are whatever that macro expects as its
 * third operand.
 */
.macro	OP_4REG	op dst0 dst1 dst2 dst3 src0 src1 src2 src3
	\op	\dst0, \dst0, \src0
	\op	\dst1, \dst1, \src1
	\op	\dst2, \dst2, \src2
	\op	\dst3, \dst3, \src3
.endm
| 25 | + |
/*
 * Generate ChaCha20 blocks (10 double rounds each) from a 32-byte key and a
 * 64-bit block counter, with state words 14 and 15 fixed to zero.
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The counter is read once on entry, incremented by one per block, and
 * written back once after the last block.
 *
 * The block loop tests nblocks only after decrementing it, so the caller
 * must pass a non-zero block count.
 *
 * The counter is accessed with ld/sd and split with a 32-bit shift, so this
 * code is for 64-bit registers only.
 *
 * Clobbers a0, a3-a7 and t0-t6.  s0-s11 are used for the working state but
 * are saved on entry and restored on exit; nothing else is written to the
 * stack.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		s10
#define state11		s11
#define state12		a5
#define state13		a6
#define state14		a7
#define state15		t1
#define cnt		t2
#define copy0		t3
#define copy1		t4
#define copy2		t5
#define copy3		t6

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/* The same rows rotated by 1, 2 and 3 words, for the diagonal round */
#define line1_perm	state5, state6, state7, state4
#define line2_perm	state10, state11, state8, state9
#define line3_perm	state15, state12, state13, state14

#define copy		copy0, copy1, copy2, copy3

/* Rotate amounts (right rotates) replicated for OP_4REG ROTRI */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s11 saved.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	addi	sp, sp, -12*SZREG
	REG_S	s0,         (sp)
	REG_S	s1,    SZREG(sp)
	REG_S	s2,  2*SZREG(sp)
	REG_S	s3,  3*SZREG(sp)
	REG_S	s4,  4*SZREG(sp)
	REG_S	s5,  5*SZREG(sp)
	REG_S	s6,  6*SZREG(sp)
	REG_S	s7,  7*SZREG(sp)
	REG_S	s8,  8*SZREG(sp)
	REG_S	s9,  9*SZREG(sp)
	REG_S	s10, 10*SZREG(sp)
	REG_S	s11, 11*SZREG(sp)

	/* Keep the whole 64-bit counter in one register across all blocks */
	ld	cnt, (counter)

	/* copy[0,1,2,3] = "expand 32-byte k", kept constant for every block */
	li	copy0, 0x61707865
	li	copy1, 0x3320646e
	li	copy2, 0x79622d32
	li	copy3, 0x6b206574

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	mv	state0, copy0
	mv	state1, copy1
	mv	state2, copy2
	mv	state3, copy3

	/* state[4,5,..,11] = key */
	lw	state4,    (key)
	lw	state5,   4(key)
	lw	state6,   8(key)
	lw	state7,  12(key)
	lw	state8,  16(key)
	lw	state9,  20(key)
	lw	state10, 24(key)
	lw	state11, 28(key)

	/*
	 * state[12,13] = counter
	 * state12 keeps the full 64-bit value; only its low 32 bits matter
	 * because every later operation on it is a 32-bit (*w) one or a
	 * 32-bit store.
	 */
	mv	state12, cnt
	srli	state13, cnt, 32

	/* state[14,15] = 0 */
	mv	state14, zero
	mv	state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li	i, 10
.Lpermute:
	/* odd round */
	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _16

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _20

	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _24

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _25

	/* even round */
	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _16

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _20

	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _24

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _25

	addi	i, i, -1
	bnez	i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	addw	line0, copy
	sw	state0,   (output)
	sw	state1,  4(output)
	sw	state2,  8(output)
	sw	state3, 12(output)

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = lo(key) */
	lw	state0,   (key)
	lw	state1,  4(key)
	lw	state2,  8(key)
	lw	state3, 12(key)

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	addw	line1, line0
	sw	state4, 16(output)
	sw	state5, 20(output)
	sw	state6, 24(output)
	sw	state7, 28(output)

	/* state[0,1,2,3] = hi(key) */
	lw	state0, 16(key)
	lw	state1, 20(key)
	lw	state2, 24(key)
	lw	state3, 28(key)

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	addw	line2, line0
	sw	state8,  32(output)
	sw	state9,  36(output)
	sw	state10, 40(output)
	sw	state11, 44(output)

	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	addw	state12, state12, cnt
	srli	state0, cnt, 32
	addw	state13, state13, state0
	sw	state12, 48(output)
	sw	state13, 52(output)
	sw	state14, 56(output)
	sw	state15, 60(output)

	/* ++counter */
	addi	cnt, cnt, 1

	/* output += 64 */
	addi	output, output, 64
	/* --nblocks */
	addi	nblocks, nblocks, -1
	bnez	nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	sd	cnt, (counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. At this point copy[0,1,2,3] just contain "expand 32-byte k"
	 * and state[0,...,11] are s0-s11, which are restored in the epilogue,
	 * so that leaves state[12,...,15] and t0.  t0 is the scratch register
	 * of ROTRI and still holds the left-shifted half of the last state
	 * word that was rotated, so it has to be cleared as well.
	 */
	mv	state12, zero
	mv	state13, zero
	mv	state14, zero
	mv	state15, zero
	mv	t0, zero

	REG_L	s0,         (sp)
	REG_L	s1,    SZREG(sp)
	REG_L	s2,  2*SZREG(sp)
	REG_L	s3,  3*SZREG(sp)
	REG_L	s4,  4*SZREG(sp)
	REG_L	s5,  5*SZREG(sp)
	REG_L	s6,  6*SZREG(sp)
	REG_L	s7,  7*SZREG(sp)
	REG_L	s8,  8*SZREG(sp)
	REG_L	s9,  9*SZREG(sp)
	REG_L	s10, 10*SZREG(sp)
	REG_L	s11, 11*SZREG(sp)
	addi	sp, sp, 12*SZREG

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
0 commit comments