|
/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/fpu-insn.h>

| 7 | +#define STATE0 %v0 |
| 8 | +#define STATE1 %v1 |
| 9 | +#define STATE2 %v2 |
| 10 | +#define STATE3 %v3 |
| 11 | +#define COPY0 %v4 |
| 12 | +#define COPY1 %v5 |
| 13 | +#define COPY2 %v6 |
| 14 | +#define COPY3 %v7 |
| 15 | +#define PERM4 %v16 |
| 16 | +#define PERM8 %v17 |
| 17 | +#define PERM12 %v18 |
| 18 | +#define BEPERM %v19 |
| 19 | +#define TMP0 %v20 |
| 20 | +#define TMP1 %v21 |
| 21 | +#define TMP2 %v22 |
| 22 | +#define TMP3 %v23 |
| 23 | + |
| 24 | + .section .rodata |
| 25 | + |
| 26 | + .balign 128 |
| 27 | +.Lconstants: |
| 28 | + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral |
| 29 | + .long 0x04050607,0x08090a0b,0x0c0d0e0f,0x00010203 # rotl 4 bytes |
| 30 | + .long 0x08090a0b,0x0c0d0e0f,0x00010203,0x04050607 # rotl 8 bytes |
| 31 | + .long 0x0c0d0e0f,0x00010203,0x04050607,0x08090a0b # rotl 12 bytes |
| 32 | + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap |
| 33 | + |
| 34 | + .text |
| 35 | +/* |
| 36 | + * s390 ChaCha20 implementation meant for vDSO. Produces a given positive |
| 37 | + * number of blocks of output with nonce 0, taking an input key and 8-bytes |
| 38 | + * counter. Does not spill to the stack. |
| 39 | + * |
| 40 | + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, |
| 41 | + * const uint8_t *key, |
| 42 | + * uint32_t *counter, |
| 43 | + * size_t nblocks) |
| 44 | + */ |
| 45 | +SYM_FUNC_START(__arch_chacha20_blocks_nostack) |
| 46 | + larl %r1,.Lconstants |
| 47 | + |
| 48 | + /* COPY0 = "expand 32-byte k" */ |
| 49 | + VL COPY0,0,,%r1 |
| 50 | + |
| 51 | + /* PERM4-PERM12,BEPERM = byte selectors for VPERM */ |
| 52 | + VLM PERM4,BEPERM,16,%r1 |
| 53 | + |
| 54 | + /* COPY1,COPY2 = key */ |
| 55 | + VLM COPY1,COPY2,0,%r3 |
| 56 | + |
| 57 | + /* COPY3 = counter || zero nonce */ |
| 58 | + lg %r3,0(%r4) |
| 59 | + VZERO COPY3 |
| 60 | + VLVGG COPY3,%r3,0 |
| 61 | + |
| 62 | + lghi %r1,0 |
| 63 | +.Lblock: |
| 64 | + VLR STATE0,COPY0 |
| 65 | + VLR STATE1,COPY1 |
| 66 | + VLR STATE2,COPY2 |
| 67 | + VLR STATE3,COPY3 |
| 68 | + |
| 69 | + lghi %r0,10 |
| 70 | +.Ldoubleround: |
| 71 | + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ |
| 72 | + VAF STATE0,STATE0,STATE1 |
| 73 | + VX STATE3,STATE3,STATE0 |
| 74 | + VERLLF STATE3,STATE3,16 |
| 75 | + |
| 76 | + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ |
| 77 | + VAF STATE2,STATE2,STATE3 |
| 78 | + VX STATE1,STATE1,STATE2 |
| 79 | + VERLLF STATE1,STATE1,12 |
| 80 | + |
| 81 | + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ |
| 82 | + VAF STATE0,STATE0,STATE1 |
| 83 | + VX STATE3,STATE3,STATE0 |
| 84 | + VERLLF STATE3,STATE3,8 |
| 85 | + |
| 86 | + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ |
| 87 | + VAF STATE2,STATE2,STATE3 |
| 88 | + VX STATE1,STATE1,STATE2 |
| 89 | + VERLLF STATE1,STATE1,7 |
| 90 | + |
| 91 | + /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */ |
| 92 | + VPERM STATE1,STATE1,STATE1,PERM4 |
| 93 | + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ |
| 94 | + VPERM STATE2,STATE2,STATE2,PERM8 |
| 95 | + /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */ |
| 96 | + VPERM STATE3,STATE3,STATE3,PERM12 |
| 97 | + |
| 98 | + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ |
| 99 | + VAF STATE0,STATE0,STATE1 |
| 100 | + VX STATE3,STATE3,STATE0 |
| 101 | + VERLLF STATE3,STATE3,16 |
| 102 | + |
| 103 | + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ |
| 104 | + VAF STATE2,STATE2,STATE3 |
| 105 | + VX STATE1,STATE1,STATE2 |
| 106 | + VERLLF STATE1,STATE1,12 |
| 107 | + |
| 108 | + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ |
| 109 | + VAF STATE0,STATE0,STATE1 |
| 110 | + VX STATE3,STATE3,STATE0 |
| 111 | + VERLLF STATE3,STATE3,8 |
| 112 | + |
| 113 | + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ |
| 114 | + VAF STATE2,STATE2,STATE3 |
| 115 | + VX STATE1,STATE1,STATE2 |
| 116 | + VERLLF STATE1,STATE1,7 |
| 117 | + |
| 118 | + /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */ |
| 119 | + VPERM STATE1,STATE1,STATE1,PERM12 |
| 120 | + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ |
| 121 | + VPERM STATE2,STATE2,STATE2,PERM8 |
| 122 | + /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */ |
| 123 | + VPERM STATE3,STATE3,STATE3,PERM4 |
| 124 | + brctg %r0,.Ldoubleround |
| 125 | + |
| 126 | + /* OUTPUT0 = STATE0 + STATE0 */ |
| 127 | + VAF STATE0,STATE0,COPY0 |
| 128 | + /* OUTPUT1 = STATE1 + STATE1 */ |
| 129 | + VAF STATE1,STATE1,COPY1 |
| 130 | + /* OUTPUT2 = STATE2 + STATE2 */ |
| 131 | + VAF STATE2,STATE2,COPY2 |
| 132 | + /* OUTPUT2 = STATE3 + STATE3 */ |
| 133 | + VAF STATE3,STATE3,COPY3 |
| 134 | + |
| 135 | + /* |
| 136 | + * 32 bit wise little endian store to OUTPUT. If the vector |
| 137 | + * enhancement facility 2 is not installed use the slow path. |
| 138 | + */ |
| 139 | + ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148) |
| 140 | + VSTBRF STATE0,0,,%r2 |
| 141 | + VSTBRF STATE1,16,,%r2 |
| 142 | + VSTBRF STATE2,32,,%r2 |
| 143 | + VSTBRF STATE3,48,,%r2 |
| 144 | +.Lstoredone: |
| 145 | + |
| 146 | + /* ++COPY3.COUNTER */ |
| 147 | + /* alsih %r3,1 */ |
| 148 | + .insn rilu,0xcc0a00000000,%r3,1 |
| 149 | + alcr %r3,%r1 |
| 150 | + VLVGG COPY3,%r3,0 |
| 151 | + |
| 152 | + /* OUTPUT += 64, --NBLOCKS */ |
| 153 | + aghi %r2,64 |
| 154 | + brctg %r5,.Lblock |
| 155 | + |
| 156 | + /* COUNTER = COPY3.COUNTER */ |
| 157 | + stg %r3,0(%r4) |
| 158 | + |
| 159 | + /* Zero out potentially sensitive regs */ |
| 160 | + VZERO STATE0 |
| 161 | + VZERO STATE1 |
| 162 | + VZERO STATE2 |
| 163 | + VZERO STATE3 |
| 164 | + VZERO COPY1 |
| 165 | + VZERO COPY2 |
| 166 | + |
| 167 | + /* Early exit if TMP0-TMP3 have not been used */ |
| 168 | + ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148) |
| 169 | + |
| 170 | + VZERO TMP0 |
| 171 | + VZERO TMP1 |
| 172 | + VZERO TMP2 |
| 173 | + VZERO TMP3 |
| 174 | + |
| 175 | + br %r14 |
| 176 | + |
| 177 | +.Lstoreslow: |
| 178 | + /* Convert STATE to little endian format and store to OUTPUT */ |
| 179 | + VPERM TMP0,STATE0,STATE0,BEPERM |
| 180 | + VPERM TMP1,STATE1,STATE1,BEPERM |
| 181 | + VPERM TMP2,STATE2,STATE2,BEPERM |
| 182 | + VPERM TMP3,STATE3,STATE3,BEPERM |
| 183 | + VSTM TMP0,TMP3,0,%r2 |
| 184 | + j .Lstoredone |
| 185 | +SYM_FUNC_END(__arch_chacha20_blocks_nostack) |
0 commit comments