@@ -29,97 +29,97 @@ SOFTWARE.
2929.macro round
// round: one permutation round over the 25 state registers v0-v24 (a[i] is
// held in v<i>), executed as theta, rho+pi, chi, iota.
// Every instruction operates on whole 128-bit registers, i.e. on two 64-bit
// elements per register at once (.2d / .16b arrangements).
// Clobbers: v25-v31 (scratch). x1 is read and post-incremented by 8 (see iota).
// NOTE(review): the step names match Keccak-f[1600]; x1 presumably walks the
// round-constant table set up by the caller -- confirm outside this hunk.
// In this diff the `-` lines put the arrangement on the mnemonic
// (eor3.16b v25, ...) and the `+` lines put it on each operand
// (eor3 v25.16b, ...); registers and immediates are the same on both sides.
3030 // Theta, part 1: compute the column parities and the d[] values only; the state is first modified in the rho/pi step below.
3131 // Parities: p[i] = a[i] ^ a[5+i] ^ a[10+i] ^ a[15+i] ^ a[20+i], with p[0..4] in v25..v29 (two eor3 three-way xors per column).
// Then d[i] = p[i-1] ^ rotl(p[i+1], 1) via rax1 (rax1 d, n, m = n ^ rotl(m, 1)).
// The rax1 order matters: each parity register is overwritten only by the
// instruction that reads it for the last time. Result: d[0]=v30, d[1]=v27,
// d[2]=v28, d[3]=v29, d[4]=v25.
32- eor3.16b v25 , v0 , v5 , v10
33- eor3.16b v26 , v1 , v6 , v11
34- eor3.16b v27 , v2 , v7 , v12
35- eor3.16b v28 , v3 , v8 , v13
36- eor3.16b v29 , v4 , v9 , v14
37-
38- eor3.16b v25 , v25 , v15 , v20
39- eor3.16b v26 , v26 , v16 , v21
40- eor3.16b v27 , v27 , v17 , v22
41- eor3.16b v28 , v28 , v18 , v23
42- eor3.16b v29 , v29 , v19 , v24
43-
44- rax1.2d v30 , v29 , v26 // d [ 0 ] = rotl(p [ 1 ], 1 ) ^ p [ 4 ]
45- rax1.2d v29 , v27 , v29 // d [ 3 ] = rotl(p [ 4 ], 1 ) ^ p [ 2 ]
46- rax1.2d v27 , v25 , v27 // d [ 1 ] = rotl(p [ 2 ], 1 ) ^ p [ 0 ]
47- rax1.2d v25 , v28 , v25 // d [ 4 ] = rotl(p [ 0 ], 1 ) ^ p [ 3 ]
48- rax1.2d v28 , v26 , v28 // d [ 2 ] = rotl(p [ 3 ], 1 ) ^ p [ 1 ]
32+ eor3 v25.16b , v0.16b , v5.16b , v10.16b
33+ eor3 v26.16b , v1.16b , v6.16b , v11.16b
34+ eor3 v27.16b , v2.16b , v7.16b , v12.16b
35+ eor3 v28.16b , v3.16b , v8.16b , v13.16b
36+ eor3 v29.16b , v4.16b , v9.16b , v14.16b
37+
38+ eor3 v25.16b , v25.16b , v15.16b , v20.16b
39+ eor3 v26.16b , v26.16b , v16.16b , v21.16b
40+ eor3 v27.16b , v27.16b , v17.16b , v22.16b
41+ eor3 v28.16b , v28.16b , v18.16b , v23.16b
42+ eor3 v29.16b , v29.16b , v19.16b , v24.16b
43+
44+ rax1 v30.2d , v29.2d , v26.2d // v30 = d[0] = rotl(p[1], 1) ^ p[4]
45+ rax1 v29.2d , v27.2d , v29.2d // v29 = d[3] = rotl(p[4], 1) ^ p[2] (last read of p[4])
46+ rax1 v27.2d , v25.2d , v27.2d // v27 = d[1] = rotl(p[2], 1) ^ p[0] (last read of p[2])
47+ rax1 v25.2d , v28.2d , v25.2d // v25 = d[4] = rotl(p[0], 1) ^ p[3] (last read of p[0])
48+ rax1 v28.2d , v26.2d , v28.2d // v28 = d[2] = rotl(p[3], 1) ^ p[1] (last read of p[3] and p[1])
4949
5050 // Theta, part 2, fused with rho and pi: each xar xors d[src % 5] into the
5151 // source lane, rotates it, and writes it to its destination lane.
// xar d, n, m, #r = ror(n ^ m, r), so every immediate is 64 minus the
// left-rotation amount of that lane.
// The chain is ordered so that each destination register is the source of the
// next instruction; only a[1] (v1) must be saved first, in v31.
52- eor.16b v0 , v0 , v30
53- mov .16b v31, v1
54- xar.2d v1 , v6 , v27, 20
55- xar.2d v6 , v9 , v25, 44
56- xar.2d v9 , v22, v28 , 3
57- xar.2d v22 , v14 , v25 , 25
58- xar.2d v14 , v20 , v30 , 46
59- xar.2d v20 , v2 , v28, 2
60- xar.2d v2 , v12, v28 , 21
61- xar.2d v12 , v13 , v29 , 39
62- xar.2d v13 , v19 , v25 , 56
63- xar.2d v19 , v23 , v29 , 8
64- xar.2d v23 , v15 , v30 , 23
65- xar.2d v15 , v4 , v25, 37
66- xar.2d v4 , v24, v25 , 50
67- xar.2d v24 , v21 , v27 , 62
68- xar.2d v21 , v8 , v29, 9
69- xar.2d v8 , v16, v27 , 19
70- xar.2d v16 , v5 , v30, 28
71- xar.2d v5 , v3 , v29, 36
72- xar.2d v3 , v18, v29 , 43
73- xar.2d v18 , v17 , v28 , 49
74- xar.2d v17 , v11 , v27 , 54
75- xar.2d v11 , v7 , v28, 58
76- xar.2d v7 , v10, v30 , 61
77- xar.2d v10 , v31 , v27 , 63
52+ eor v0.16b , v0.16b , v30.16b // a[0] ^= d[0]; lane 0 is neither rotated nor moved
53+ mov v31.16b , v1.16b // save a[1]: v1 is overwritten next but read by the last xar
54+ xar v1.2d , v6.2d , v27.2d , 20
55+ xar v6.2d , v9.2d , v25.2d , 44
56+ xar v9.2d , v22.2d , v28.2d , 3
57+ xar v22.2d , v14.2d , v25.2d , 25
58+ xar v14.2d , v20.2d , v30.2d , 46
59+ xar v20.2d , v2.2d , v28.2d , 2
60+ xar v2.2d , v12.2d , v28.2d , 21
61+ xar v12.2d , v13.2d , v29.2d , 39
62+ xar v13.2d , v19.2d , v25.2d , 56
63+ xar v19.2d , v23.2d , v29.2d , 8
64+ xar v23.2d , v15.2d , v30.2d , 23
65+ xar v15.2d , v4.2d , v25.2d , 37
66+ xar v4.2d , v24.2d , v25.2d , 50
67+ xar v24.2d , v21.2d , v27.2d , 62
68+ xar v21.2d , v8.2d , v29.2d , 9
69+ xar v8.2d , v16.2d , v27.2d , 19
70+ xar v16.2d , v5.2d , v30.2d , 28
71+ xar v5.2d , v3.2d , v29.2d , 36
72+ xar v3.2d , v18.2d , v29.2d , 43
73+ xar v18.2d , v17.2d , v28.2d , 49
74+ xar v17.2d , v11.2d , v27.2d , 54
75+ xar v11.2d , v7.2d , v28.2d , 58
76+ xar v7.2d , v10.2d , v30.2d , 61
77+ xar v10.2d , v31.2d , v27.2d , 63 // consumes the a[1] copy saved in v31
7878
7979 // Chi: within each row of five lanes, a[i] ^= ~a[i+1] & a[i+2] (indices mod 5).
// bcax d, n, m, a = n ^ (m & ~a). The new values of the first two lanes of a
// row go to v25/v26 and are moved into place only after the row is finished,
// because the last two bcax of the row still read the old first two lanes.
80- bcax.16b v25 , v0 , v2 , v1
81- bcax.16b v26 , v1 , v3 , v2
82- bcax.16b v2 , v2 , v4 , v3
83- bcax.16b v3 , v3 , v0 , v4
84- bcax.16b v4 , v4 , v1 , v0
85- mov .16b v0, v25
86- mov .16b v1, v26
87-
88- bcax.16b v25 , v5 , v7 , v6
89- bcax.16b v26 , v6 , v8 , v7
90- bcax.16b v7 , v7 , v9 , v8
91- bcax.16b v8 , v8 , v5 , v9
92- bcax.16b v9 , v9 , v6 , v5
93- mov .16b v5, v25
94- mov .16b v6, v26
95-
96- bcax.16b v25 , v10 , v12, v11
97- bcax.16b v26 , v11 , v13, v12
98- bcax.16b v12 , v12 , v14, v13
99- bcax.16b v13 , v13 , v10, v14
100- bcax.16b v14 , v14 , v11, v10
101- mov .16b v10, v25
102- mov .16b v11, v26
103-
104- bcax.16b v25 , v15 , v17, v16
105- bcax.16b v26 , v16 , v18, v17
106- bcax.16b v17 , v17 , v19, v18
107- bcax.16b v18 , v18 , v15, v19
108- bcax.16b v19 , v19 , v16, v15
109- mov .16b v15, v25
110- mov .16b v16, v26
111-
112- bcax.16b v25 , v20 , v22, v21
113- bcax.16b v26 , v21 , v23, v22
114- bcax.16b v22 , v22 , v24, v23
115- bcax.16b v23 , v23 , v20, v24
116- bcax.16b v24 , v24 , v21, v20
117- mov .16b v20, v25
118- mov .16b v21, v26
80+ bcax v25.16b , v0.16b , v2.16b , v1.16b
81+ bcax v26.16b , v1.16b , v3.16b , v2.16b
82+ bcax v2.16b , v2.16b , v4.16b , v3.16b
83+ bcax v3.16b , v3.16b , v0.16b , v4.16b
84+ bcax v4.16b , v4.16b , v1.16b , v0.16b
85+ mov v0.16b , v25.16b // commit the deferred results for the row's first two lanes
86+ mov v1.16b , v26.16b
87+
88+ bcax v25.16b , v5.16b , v7.16b , v6.16b
89+ bcax v26.16b , v6.16b , v8.16b , v7.16b
90+ bcax v7.16b , v7.16b , v9.16b , v8.16b
91+ bcax v8.16b , v8.16b , v5.16b , v9.16b
92+ bcax v9.16b , v9.16b , v6.16b , v5.16b
93+ mov v5.16b , v25.16b
94+ mov v6.16b , v26.16b
95+
96+ bcax v25.16b , v10.16b , v12.16b , v11.16b
97+ bcax v26.16b , v11.16b , v13.16b , v12.16b
98+ bcax v12.16b , v12.16b , v14.16b , v13.16b
99+ bcax v13.16b , v13.16b , v10.16b , v14.16b
100+ bcax v14.16b , v14.16b , v11.16b , v10.16b
101+ mov v10.16b , v25.16b
102+ mov v11.16b , v26.16b
103+
104+ bcax v25.16b , v15.16b , v17.16b , v16.16b
105+ bcax v26.16b , v16.16b , v18.16b , v17.16b
106+ bcax v17.16b , v17.16b , v19.16b , v18.16b
107+ bcax v18.16b , v18.16b , v15.16b , v19.16b
108+ bcax v19.16b , v19.16b , v16.16b , v15.16b
109+ mov v15.16b , v25.16b
110+ mov v16.16b , v26.16b
111+
112+ bcax v25.16b , v20.16b , v22.16b , v21.16b
113+ bcax v26.16b , v21.16b , v23.16b , v22.16b
114+ bcax v22.16b , v22.16b , v24.16b , v23.16b
115+ bcax v23.16b , v23.16b , v20.16b , v24.16b
116+ bcax v24.16b , v24.16b , v21.16b , v20.16b
117+ mov v20.16b , v25.16b
118+ mov v21.16b , v26.16b
119119
120120 // Iota: xor one 64-bit word loaded from [x1] into a[0].
// ld1r replicates that word into both 64-bit elements of v25 and advances x1
// by 8, so the next expansion of the macro reads the following word.
121121 ld1r {v25.2d} , [ x1 ], # 8
122- eor.16b v0 , v0 , v25
122+ eor v0.16b , v0.16b , v25.16b
123123.endm
124124
125125. align 4
@@ -135,13 +135,13 @@ _f1600x2:
135135 mov x2 , x0
136136 mov x3 , # 24
137137
138- ld1.2d {v0 , v1 , v2 , v3 }, [ x0 ], # 64
139- ld1.2d {v4 , v5 , v6 , v7 }, [ x0 ], # 64
140- ld1.2d {v8 , v9 , v10, v11} , [ x0 ], # 64
141- ld1.2d {v12 , v13 , v14 , v15} , [ x0 ], # 64
142- ld1.2d {v16 , v17 , v18 , v19} , [ x0 ], # 64
143- ld1.2d {v20 , v21 , v22 , v23} , [ x0 ], # 64
144- ld1.2d {v24} , [ x0 ]
138+ ld1 {v0.2d , v1.2d , v2.2d , v3.2d }, [ x0 ], # 64
139+ ld1 {v4.2d , v5.2d , v6.2d , v7.2d }, [ x0 ], # 64
140+ ld1 {v8.2d , v9.2d , v10.2d , v11.2d } , [ x0 ], # 64
141+ ld1 {v12.2d , v13.2d , v14.2d , v15.2d } , [ x0 ], # 64
142+ ld1 {v16.2d , v17.2d , v18.2d , v19.2d } , [ x0 ], # 64
143+ ld1 {v20.2d , v21.2d , v22.2d , v23.2d } , [ x0 ], # 64
144+ ld1 {v24.2d } , [ x0 ]
145145
146146loop :
147147 round
@@ -150,13 +150,13 @@ loop:
150150 cbnz x3 , loop
151151
152152 mov x0 , x2
153- st1.2d {v0 , v1 , v2 , v3 }, [ x0 ], # 64
154- st1.2d {v4 , v5 , v6 , v7 }, [ x0 ], # 64
155- st1.2d {v8 , v9 , v10, v11} , [ x0 ], # 64
156- st1.2d {v12 , v13 , v14 , v15} , [ x0 ], # 64
157- st1.2d {v16 , v17 , v18 , v19} , [ x0 ], # 64
158- st1.2d {v20 , v21 , v22 , v23} , [ x0 ], # 64
159- st1.2d {v24} , [ x0 ]
153+ st1 {v0.2d , v1.2d , v2.2d , v3.2d }, [ x0 ], # 64
154+ st1 {v4.2d , v5.2d , v6.2d , v7.2d }, [ x0 ], # 64
155+ st1 {v8.2d , v9.2d , v10.2d , v11.2d } , [ x0 ], # 64
156+ st1 {v12.2d , v13.2d , v14.2d , v15.2d } , [ x0 ], # 64
157+ st1 {v16.2d , v17.2d , v18.2d , v19.2d } , [ x0 ], # 64
158+ st1 {v20.2d , v21.2d , v22.2d , v23.2d } , [ x0 ], # 64
159+ st1 {v24.2d } , [ x0 ]
160160
161161 ldp d14 , d15 , [ sp ], # 16
162162 ldp d12 , d13 , [ sp ], # 16
0 commit comments