Skip to content

Commit 1144807

Browse files
authored
Use 'older fashioned' assembly syntax (#78)
1 parent 50b6e6e commit 1144807

File tree

2 files changed

+101
-97
lines changed

2 files changed

+101
-97
lines changed

pqcrypto-internals/build.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ fn main() {
1717
cfiledir.join("sp800-185.c"),
1818
];
1919

20+
println!("cargo:rerun-if-changed=cfiles/");
21+
println!("cargo:rerun-if-changed=build.rs");
22+
println!("cargo:rerun-if-changed=src/");
23+
2024
let mut build = cc::Build::new();
2125

2226
let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap();

pqcrypto-internals/cfiles/keccak2x/feat.S

Lines changed: 97 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -29,97 +29,97 @@ SOFTWARE.
2929
.macro round
3030
// Execute theta, but without xoring into the state yet.
3131
// Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i].
32-
eor3.16b v25, v0, v5, v10
33-
eor3.16b v26, v1, v6, v11
34-
eor3.16b v27, v2, v7, v12
35-
eor3.16b v28, v3, v8, v13
36-
eor3.16b v29, v4, v9, v14
37-
38-
eor3.16b v25, v25, v15, v20
39-
eor3.16b v26, v26, v16, v21
40-
eor3.16b v27, v27, v17, v22
41-
eor3.16b v28, v28, v18, v23
42-
eor3.16b v29, v29, v19, v24
43-
44-
rax1.2d v30, v29, v26 // d[0] = rotl(p[1], 1) ^ p[4]
45-
rax1.2d v29, v27, v29 // d[3] = rotl(p[4], 1) ^ p[2]
46-
rax1.2d v27, v25, v27 // d[1] = rotl(p[2], 1) ^ p[0]
47-
rax1.2d v25, v28, v25 // d[4] = rotl(p[0], 1) ^ p[3]
48-
rax1.2d v28, v26, v28 // d[2] = rotl(p[3], 1) ^ p[1]
32+
eor3 v25.16b, v0.16b, v5.16b, v10.16b
33+
eor3 v26.16b, v1.16b, v6.16b, v11.16b
34+
eor3 v27.16b, v2.16b, v7.16b, v12.16b
35+
eor3 v28.16b, v3.16b, v8.16b, v13.16b
36+
eor3 v29.16b, v4.16b, v9.16b, v14.16b
37+
38+
eor3 v25.16b, v25.16b, v15.16b, v20.16b
39+
eor3 v26.16b, v26.16b, v16.16b, v21.16b
40+
eor3 v27.16b, v27.16b, v17.16b, v22.16b
41+
eor3 v28.16b, v28.16b, v18.16b, v23.16b
42+
eor3 v29.16b, v29.16b, v19.16b, v24.16b
43+
44+
rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
45+
rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
46+
rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
47+
rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
48+
rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
4949

5050
// Xor parities from step theta into the state at the same time
5151
// as executing rho and pi.
52-
eor.16b v0, v0, v30
53-
mov.16b v31, v1
54-
xar.2d v1, v6, v27, 20
55-
xar.2d v6, v9, v25, 44
56-
xar.2d v9, v22, v28, 3
57-
xar.2d v22, v14, v25, 25
58-
xar.2d v14, v20, v30, 46
59-
xar.2d v20, v2, v28, 2
60-
xar.2d v2, v12, v28, 21
61-
xar.2d v12, v13, v29, 39
62-
xar.2d v13, v19, v25, 56
63-
xar.2d v19, v23, v29, 8
64-
xar.2d v23, v15, v30, 23
65-
xar.2d v15, v4, v25, 37
66-
xar.2d v4, v24, v25, 50
67-
xar.2d v24, v21, v27, 62
68-
xar.2d v21, v8, v29, 9
69-
xar.2d v8, v16, v27, 19
70-
xar.2d v16, v5, v30, 28
71-
xar.2d v5, v3, v29, 36
72-
xar.2d v3, v18, v29, 43
73-
xar.2d v18, v17, v28, 49
74-
xar.2d v17, v11, v27, 54
75-
xar.2d v11, v7, v28, 58
76-
xar.2d v7, v10, v30, 61
77-
xar.2d v10, v31, v27, 63
52+
eor v0.16b, v0.16b, v30.16b
53+
mov v31.16b, v1.16b
54+
xar v1.2d, v6.2d, v27.2d, 20
55+
xar v6.2d, v9.2d, v25.2d, 44
56+
xar v9.2d, v22.2d, v28.2d, 3
57+
xar v22.2d, v14.2d, v25.2d, 25
58+
xar v14.2d, v20.2d, v30.2d, 46
59+
xar v20.2d, v2.2d, v28.2d, 2
60+
xar v2.2d, v12.2d, v28.2d, 21
61+
xar v12.2d, v13.2d, v29.2d, 39
62+
xar v13.2d, v19.2d, v25.2d, 56
63+
xar v19.2d, v23.2d, v29.2d, 8
64+
xar v23.2d, v15.2d, v30.2d, 23
65+
xar v15.2d, v4.2d, v25.2d, 37
66+
xar v4.2d, v24.2d, v25.2d, 50
67+
xar v24.2d, v21.2d, v27.2d, 62
68+
xar v21.2d, v8.2d, v29.2d, 9
69+
xar v8.2d, v16.2d, v27.2d, 19
70+
xar v16.2d, v5.2d, v30.2d, 28
71+
xar v5.2d, v3.2d, v29.2d, 36
72+
xar v3.2d, v18.2d, v29.2d, 43
73+
xar v18.2d, v17.2d, v28.2d, 49
74+
xar v17.2d, v11.2d, v27.2d, 54
75+
xar v11.2d, v7.2d, v28.2d, 58
76+
xar v7.2d, v10.2d, v30.2d, 61
77+
xar v10.2d, v31.2d, v27.2d, 63
7878

7979
// Chi
80-
bcax.16b v25, v0, v2, v1
81-
bcax.16b v26, v1, v3, v2
82-
bcax.16b v2, v2, v4, v3
83-
bcax.16b v3, v3, v0, v4
84-
bcax.16b v4, v4, v1, v0
85-
mov.16b v0, v25
86-
mov.16b v1, v26
87-
88-
bcax.16b v25, v5, v7, v6
89-
bcax.16b v26, v6, v8, v7
90-
bcax.16b v7, v7, v9, v8
91-
bcax.16b v8, v8, v5, v9
92-
bcax.16b v9, v9, v6, v5
93-
mov.16b v5, v25
94-
mov.16b v6, v26
95-
96-
bcax.16b v25, v10, v12, v11
97-
bcax.16b v26, v11, v13, v12
98-
bcax.16b v12, v12, v14, v13
99-
bcax.16b v13, v13, v10, v14
100-
bcax.16b v14, v14, v11, v10
101-
mov.16b v10, v25
102-
mov.16b v11, v26
103-
104-
bcax.16b v25, v15, v17, v16
105-
bcax.16b v26, v16, v18, v17
106-
bcax.16b v17, v17, v19, v18
107-
bcax.16b v18, v18, v15, v19
108-
bcax.16b v19, v19, v16, v15
109-
mov.16b v15, v25
110-
mov.16b v16, v26
111-
112-
bcax.16b v25, v20, v22, v21
113-
bcax.16b v26, v21, v23, v22
114-
bcax.16b v22, v22, v24, v23
115-
bcax.16b v23, v23, v20, v24
116-
bcax.16b v24, v24, v21, v20
117-
mov.16b v20, v25
118-
mov.16b v21, v26
80+
bcax v25.16b, v0.16b, v2.16b, v1.16b
81+
bcax v26.16b, v1.16b, v3.16b, v2.16b
82+
bcax v2.16b, v2.16b, v4.16b, v3.16b
83+
bcax v3.16b, v3.16b, v0.16b, v4.16b
84+
bcax v4.16b, v4.16b, v1.16b, v0.16b
85+
mov v0.16b, v25.16b
86+
mov v1.16b, v26.16b
87+
88+
bcax v25.16b, v5.16b, v7.16b, v6.16b
89+
bcax v26.16b, v6.16b, v8.16b, v7.16b
90+
bcax v7.16b, v7.16b, v9.16b, v8.16b
91+
bcax v8.16b, v8.16b, v5.16b, v9.16b
92+
bcax v9.16b, v9.16b, v6.16b, v5.16b
93+
mov v5.16b, v25.16b
94+
mov v6.16b, v26.16b
95+
96+
bcax v25.16b, v10.16b, v12.16b, v11.16b
97+
bcax v26.16b, v11.16b, v13.16b, v12.16b
98+
bcax v12.16b, v12.16b, v14.16b, v13.16b
99+
bcax v13.16b, v13.16b, v10.16b, v14.16b
100+
bcax v14.16b, v14.16b, v11.16b, v10.16b
101+
mov v10.16b, v25.16b
102+
mov v11.16b, v26.16b
103+
104+
bcax v25.16b, v15.16b, v17.16b, v16.16b
105+
bcax v26.16b, v16.16b, v18.16b, v17.16b
106+
bcax v17.16b, v17.16b, v19.16b, v18.16b
107+
bcax v18.16b, v18.16b, v15.16b, v19.16b
108+
bcax v19.16b, v19.16b, v16.16b, v15.16b
109+
mov v15.16b, v25.16b
110+
mov v16.16b, v26.16b
111+
112+
bcax v25.16b, v20.16b, v22.16b, v21.16b
113+
bcax v26.16b, v21.16b, v23.16b, v22.16b
114+
bcax v22.16b, v22.16b, v24.16b, v23.16b
115+
bcax v23.16b, v23.16b, v20.16b, v24.16b
116+
bcax v24.16b, v24.16b, v21.16b, v20.16b
117+
mov v20.16b, v25.16b
118+
mov v21.16b, v26.16b
119119

120120
// iota
121121
ld1r {v25.2d}, [x1], #8
122-
eor.16b v0, v0, v25
122+
eor v0.16b, v0.16b, v25.16b
123123
.endm
124124

125125
.align 4
@@ -135,13 +135,13 @@ _f1600x2:
135135
mov x2, x0
136136
mov x3, #24
137137

138-
ld1.2d {v0, v1, v2, v3}, [x0], #64
139-
ld1.2d {v4, v5, v6, v7}, [x0], #64
140-
ld1.2d {v8, v9, v10, v11}, [x0], #64
141-
ld1.2d {v12, v13, v14, v15}, [x0], #64
142-
ld1.2d {v16, v17, v18, v19}, [x0], #64
143-
ld1.2d {v20, v21, v22, v23}, [x0], #64
144-
ld1.2d {v24}, [x0]
138+
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
139+
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64
140+
ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64
141+
ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
142+
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
143+
ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
144+
ld1 {v24.2d}, [x0]
145145

146146
loop:
147147
round
@@ -150,13 +150,13 @@ loop:
150150
cbnz x3, loop
151151

152152
mov x0, x2
153-
st1.2d {v0, v1, v2, v3}, [x0], #64
154-
st1.2d {v4, v5, v6, v7}, [x0], #64
155-
st1.2d {v8, v9, v10, v11}, [x0], #64
156-
st1.2d {v12, v13, v14, v15}, [x0], #64
157-
st1.2d {v16, v17, v18, v19}, [x0], #64
158-
st1.2d {v20, v21, v22, v23}, [x0], #64
159-
st1.2d {v24}, [x0]
153+
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
154+
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64
155+
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64
156+
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
157+
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
158+
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
159+
st1 {v24.2d}, [x0]
160160

161161
ldp d14, d15, [sp], #16
162162
ldp d12, d13, [sp], #16

0 commit comments

Comments
 (0)