#include "textflag.h"

// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs the len bytes starting at b, in place, with the 4-byte key
// repeated end to end (the low byte of key is applied to the first byte).
// It returns the key rotated right by 8*(len%4) bits, so that the low byte of
// the result is the key byte that would be applied to the byte following the
// processed region.
//
// Frame layout ($0-28: no locals, 28 bytes of arguments and result):
//	 0(FP)  b    *byte
//	 8(FP)  len  int
//	16(FP)  key  uint32
//	24(FP)  ret  uint32
//
// NOTE(review): the symbolic names b_ptr/b_len are kept exactly as they were;
// go vet's asmdecl check compares them against the Go declaration, which is
// not visible from this file - confirm they match.
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	// Register roles:
	//   R0       = cursor into the buffer (advanced by the post-index stores)
	//   R1       = number of bytes left to process
	//   R2       = uint64(key)<<32 | uint64(key)
	//   R3       = key (uint32); rotated in the sub-4-byte tail and returned
	//   V0       = R2 duplicated into both 64-bit lanes
	//   R11, R12 = scratch
	MOVD	b_ptr+0(FP), R0
	MOVD	b_len+8(FP), R1
	MOVWU	key+16(FP), R3
	MOVD	R3, R2
	ORR	R2<<32, R2, R2
	VDUP	R2, V0.D2
	CMP	$64, R1
	BLT	less_than_64

	// TODO: optimize the unaligned case.
loop_64:
	// Mask 64 bytes per iteration using four 16-byte vector registers.
	VLD1	(R0), [V1.B16, V2.B16, V3.B16, V4.B16]
	VEOR	V1.B16, V0.B16, V1.B16
	VEOR	V2.B16, V0.B16, V2.B16
	VEOR	V3.B16, V0.B16, V3.B16
	VEOR	V4.B16, V0.B16, V4.B16
	VST1.P	[V1.B16, V2.B16, V3.B16, V4.B16], 64(R0)
	// Plain SUB: the CMP below is what sets the flags for BGE.
	SUB	$64, R1
	CMP	$64, R1
	BGE	loop_64

less_than_64:
	// Fewer than 64 bytes remain, so bits 5..0 of R1 say which chunk sizes
	// (32, 16, 8, 4, 2, 1) are still pending. R1 is not updated from here on;
	// each step only tests its own bit.
	CBZ	R1, end
	TBZ	$5, R1, less_than_32
	VLD1	(R0), [V1.B16, V2.B16]
	VEOR	V1.B16, V0.B16, V1.B16
	VEOR	V2.B16, V0.B16, V2.B16
	VST1.P	[V1.B16, V2.B16], 32(R0)

less_than_32:
	TBZ	$4, R1, less_than_16
	LDP	(R0), (R11, R12)
	EOR	R2, R11, R11
	EOR	R2, R12, R12
	STP.P	(R11, R12), 16(R0)

less_than_16:
	TBZ	$3, R1, less_than_8
	MOVD	(R0), R11
	EOR	R2, R11, R11
	MOVD.P	R11, 8(R0)

less_than_8:
	TBZ	$2, R1, less_than_4
	MOVWU	(R0), R11
	EORW	R2, R11, R11
	MOVWU.P	R11, 4(R0)

less_than_4:
	// The remaining steps consume less than a whole key, so R3 is rotated
	// after each one to bring the next unused key byte into the low position.
	TBZ	$1, R1, less_than_2
	MOVHU	(R0), R11
	EORW	R3, R11, R11
	MOVHU.P	R11, 2(R0)
	RORW	$16, R3

less_than_2:
	TBZ	$0, R1, end
	MOVBU	(R0), R11
	EORW	R3, R11, R11
	MOVBU.P	R11, 1(R0)
	RORW	$8, R3

end:
	// Return the (possibly rotated) key.
	MOVWU	R3, ret+24(FP)
	RET