Skip to content

Commit 61f5ef7

Browse files
committed
Use AT&T version of asmb
Signed-off-by: Jake Massimo <[email protected]>
1 parent 6651189 commit 61f5ef7

File tree

1 file changed

+163
-164
lines changed

1 file changed

+163
-164
lines changed

proofs/hol_light/x86_64/mldsa/mldsa_ntt.S

Lines changed: 163 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,10 @@
2626
// Microsoft x64 ABI: RCX = a, RDX = zetas
2727
// ----------------------------------------------------------------------------
2828

29-
.intel_syntax noprefix
3029
.text
3130

32-
#define a rdi
33-
#define zetas rsi
31+
#define a %rdi
32+
#define zetas %rsi
3433

3534
#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0
3635
#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV 8
@@ -40,196 +39,196 @@
4039
#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328
4140

4241
.macro shuffle8 r0,r1,r2,r3
43-
vperm2i128 ymm\r2,ymm\r0,ymm\r1,0x20
44-
vperm2i128 ymm\r3,ymm\r0,ymm\r1,0x31
42+
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
43+
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
4544
.endm
4645

4746
.macro shuffle4 r0,r1,r2,r3
48-
vpunpcklqdq ymm\r2,ymm\r0,ymm\r1
49-
vpunpckhqdq ymm\r3,ymm\r0,ymm\r1
47+
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
48+
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
5049
.endm
5150

5251
.macro shuffle2 r0,r1,r2,r3
53-
vmovsldup ymm\r2,ymm\r1
54-
vpblendd ymm\r2,ymm\r0,ymm\r2,0xAA
55-
vpsrlq ymm\r0,ymm\r0,32
56-
vpblendd ymm\r3,ymm\r0,ymm\r1,0xAA
52+
vmovsldup %ymm\r1,%ymm\r2
53+
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
54+
vpsrlq $32,%ymm\r0,%ymm\r0
55+
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
5756
.endm
5857

5958
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
60-
vpmuldq ymm13,ymm\h,ymm\zl0
61-
vmovshdup ymm12,ymm\h
62-
vpmuldq ymm14,ymm12,ymm\zl1
59+
vpmuldq %ymm\zl0,%ymm\h,%ymm13
60+
vmovshdup %ymm\h,%ymm12
61+
vpmuldq %ymm\zl1,%ymm12,%ymm14
6362

64-
vpmuldq ymm\h,ymm\h,ymm\zh0
65-
vpmuldq ymm12,ymm12,ymm\zh1
63+
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
64+
vpmuldq %ymm\zh1,%ymm12,%ymm12
6665

67-
vpmuldq ymm13,ymm13,ymm0
68-
vpmuldq ymm14,ymm14,ymm0
66+
vpmuldq %ymm0,%ymm13,%ymm13
67+
vpmuldq %ymm0,%ymm14,%ymm14
6968

70-
vmovshdup ymm\h,ymm\h
71-
vpblendd ymm\h,ymm\h,ymm12,0xAA
69+
vmovshdup %ymm\h,%ymm\h
70+
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h
7271

73-
vpsubd ymm12,ymm\l,ymm\h
74-
vpaddd ymm\l,ymm\l,ymm\h
72+
vpsubd %ymm\h,%ymm\l,%ymm12
73+
vpaddd %ymm\h,%ymm\l,%ymm\l
7574

76-
vmovshdup ymm13,ymm13
77-
vpblendd ymm13,ymm13,ymm14,0xAA
75+
vmovshdup %ymm13,%ymm13
76+
vpblendd $0xAA,%ymm14,%ymm13,%ymm13
7877

79-
vpaddd ymm\h,ymm12,ymm13
80-
vpsubd ymm\l,ymm\l,ymm13
78+
vpaddd %ymm13,%ymm12,%ymm\h
79+
vpsubd %ymm13,%ymm\l,%ymm\l
8180
.endm
8281

8382
.macro levels0t1 off
8483
/* level 0 */
85-
vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4]
86-
vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4]
87-
88-
vmovdqa ymm4,[rdi+0+32*\off]
89-
vmovdqa ymm5,[rdi+128+32*\off]
90-
vmovdqa ymm6,[rdi+256+32*\off]
91-
vmovdqa ymm7,[rdi+384+32*\off]
92-
vmovdqa ymm8,[rdi+512+32*\off]
93-
vmovdqa ymm9,[rdi+640+32*\off]
94-
vmovdqa ymm10,[rdi+768+32*\off]
95-
vmovdqa ymm11,[rdi+896+32*\off]
96-
97-
butterfly 4,8
98-
butterfly 5,9
99-
butterfly 6,10
100-
butterfly 7,11
84+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4(%rsi),%ymm1
85+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4(%rsi),%ymm2
86+
87+
vmovdqa 0+32*\off(%rdi),%ymm4
88+
vmovdqa 128+32*\off(%rdi),%ymm5
89+
vmovdqa 256+32*\off(%rdi),%ymm6
90+
vmovdqa 384+32*\off(%rdi),%ymm7
91+
vmovdqa 512+32*\off(%rdi),%ymm8
92+
vmovdqa 640+32*\off(%rdi),%ymm9
93+
vmovdqa 768+32*\off(%rdi),%ymm10
94+
vmovdqa 896+32*\off(%rdi),%ymm11
95+
96+
butterfly 4,8
97+
butterfly 5,9
98+
butterfly 6,10
99+
butterfly 7,11
101100

102101
/* level 1 */
103-
vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4]
104-
vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4]
105-
butterfly 4,6
106-
butterfly 5,7
107-
108-
vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4]
109-
vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4]
110-
butterfly 8,10
111-
butterfly 9,11
112-
113-
vmovdqa [rdi+0+32*\off],ymm4
114-
vmovdqa [rdi+128+32*\off],ymm5
115-
vmovdqa [rdi+256+32*\off],ymm6
116-
vmovdqa [rdi+384+32*\off],ymm7
117-
vmovdqa [rdi+512+32*\off],ymm8
118-
vmovdqa [rdi+640+32*\off],ymm9
119-
vmovdqa [rdi+768+32*\off],ymm10
120-
vmovdqa [rdi+896+32*\off],ymm11
102+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
103+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
104+
butterfly 4,6
105+
butterfly 5,7
106+
107+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1
108+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
109+
butterfly 8,10
110+
butterfly 9,11
111+
112+
vmovdqa %ymm4,0+32*\off(%rdi)
113+
vmovdqa %ymm5,128+32*\off(%rdi)
114+
vmovdqa %ymm6,256+32*\off(%rdi)
115+
vmovdqa %ymm7,384+32*\off(%rdi)
116+
vmovdqa %ymm8,512+32*\off(%rdi)
117+
vmovdqa %ymm9,640+32*\off(%rdi)
118+
vmovdqa %ymm10,768+32*\off(%rdi)
119+
vmovdqa %ymm11,896+32*\off(%rdi)
121120
.endm
122121

123122
.macro levels2t7 off
124123
/* level 2 */
125-
vmovdqa ymm4,[rdi+0+256*\off]
126-
vmovdqa ymm5,[rdi+32+256*\off]
127-
vmovdqa ymm6,[rdi+64+256*\off]
128-
vmovdqa ymm7,[rdi+96+256*\off]
129-
vmovdqa ymm8,[rdi+128+256*\off]
130-
vmovdqa ymm9,[rdi+160+256*\off]
131-
vmovdqa ymm10,[rdi+192+256*\off]
132-
vmovdqa ymm11,[rdi+224+256*\off]
133-
vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4]
134-
vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4]
135-
136-
butterfly 4,8
137-
butterfly 5,9
138-
butterfly 6,10
139-
butterfly 7,11
140-
141-
shuffle8 4,8,3,8
142-
shuffle8 5,9,4,9
143-
shuffle8 6,10,5,10
144-
shuffle8 7,11,6,11
124+
vmovdqa 256*\off+0(%rdi),%ymm4
125+
vmovdqa 256*\off+32(%rdi),%ymm5
126+
vmovdqa 256*\off+64(%rdi),%ymm6
127+
vmovdqa 256*\off+96(%rdi),%ymm7
128+
vmovdqa 256*\off+128(%rdi),%ymm8
129+
vmovdqa 256*\off+160(%rdi),%ymm9
130+
vmovdqa 256*\off+192(%rdi),%ymm10
131+
vmovdqa 256*\off+224(%rdi),%ymm11
132+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
133+
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4(%rsi),%ymm2
134+
135+
butterfly 4,8
136+
butterfly 5,9
137+
butterfly 6,10
138+
butterfly 7,11
139+
140+
shuffle8 4,8,3,8
141+
shuffle8 5,9,4,9
142+
shuffle8 6,10,5,10
143+
shuffle8 7,11,6,11
145144

146145
/* level 3 */
147-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4]
148-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4]
146+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
147+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2
149148

150-
butterfly 3,5
151-
butterfly 8,10
152-
butterfly 4,6
153-
butterfly 9,11
149+
butterfly 3,5
150+
butterfly 8,10
151+
butterfly 4,6
152+
butterfly 9,11
154153

155-
shuffle4 3,5,7,5
156-
shuffle4 8,10,3,10
157-
shuffle4 4,6,8,6
158-
shuffle4 9,11,4,11
154+
shuffle4 3,5,7,5
155+
shuffle4 8,10,3,10
156+
shuffle4 4,6,8,6
157+
shuffle4 9,11,4,11
159158

160159
/* level 4 */
161-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4]
162-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4]
160+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
161+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2
163162

164-
butterfly 7,8
165-
butterfly 5,6
166-
butterfly 3,4
167-
butterfly 10,11
163+
butterfly 7,8
164+
butterfly 5,6
165+
butterfly 3,4
166+
butterfly 10,11
168167

169-
shuffle2 7,8,9,8
170-
shuffle2 5,6,7,6
171-
shuffle2 3,4,5,4
172-
shuffle2 10,11,3,11
168+
shuffle2 7,8,9,8
169+
shuffle2 5,6,7,6
170+
shuffle2 3,4,5,4
171+
shuffle2 10,11,3,11
173172

174173
/* level 5 */
175-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4]
176-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4]
177-
vpsrlq ymm10,ymm1,32
178-
vmovshdup ymm15,ymm2
174+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
175+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2
176+
vpsrlq $32,%ymm1,%ymm10
177+
vmovshdup %ymm2,%ymm15
179178

180-
butterfly 9,5,1,10,2,15
181-
butterfly 8,4,1,10,2,15
182-
butterfly 7,3,1,10,2,15
183-
butterfly 6,11,1,10,2,15
179+
butterfly 9,5,1,10,2,15
180+
butterfly 8,4,1,10,2,15
181+
butterfly 7,3,1,10,2,15
182+
butterfly 6,11,1,10,2,15
184183

185184
/* level 6 */
186-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4]
187-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4]
188-
vpsrlq ymm10,ymm1,32
189-
vmovshdup ymm15,ymm2
190-
butterfly 9,7,1,10,2,15
191-
butterfly 8,6,1,10,2,15
192-
193-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4]
194-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4]
195-
vpsrlq ymm10,ymm1,32
196-
vmovshdup ymm15,ymm2
197-
butterfly 5,3,1,10,2,15
198-
butterfly 4,11,1,10,2,15
185+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
186+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2
187+
vpsrlq $32,%ymm1,%ymm10
188+
vmovshdup %ymm2,%ymm15
189+
butterfly 9,7,1,10,2,15
190+
butterfly 8,6,1,10,2,15
191+
192+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
193+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
194+
vpsrlq $32,%ymm1,%ymm10
195+
vmovshdup %ymm2,%ymm15
196+
butterfly 5,3,1,10,2,15
197+
butterfly 4,11,1,10,2,15
199198

200199
/* level 7 */
201-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4]
202-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4]
203-
vpsrlq ymm10,ymm1,32
204-
vmovshdup ymm15,ymm2
205-
butterfly 9,8,1,10,2,15
206-
207-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4]
208-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4]
209-
vpsrlq ymm10,ymm1,32
210-
vmovshdup ymm15,ymm2
211-
butterfly 7,6,1,10,2,15
212-
213-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4]
214-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4]
215-
vpsrlq ymm10,ymm1,32
216-
vmovshdup ymm15,ymm2
217-
butterfly 5,4,1,10,2,15
218-
219-
vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4]
220-
vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4]
221-
vpsrlq ymm10,ymm1,32
222-
vmovshdup ymm15,ymm2
223-
butterfly 3,11,1,10,2,15
224-
225-
vmovdqa [rdi+0+256*\off],ymm9
226-
vmovdqa [rdi+32+256*\off],ymm8
227-
vmovdqa [rdi+64+256*\off],ymm7
228-
vmovdqa [rdi+96+256*\off],ymm6
229-
vmovdqa [rdi+128+256*\off],ymm5
230-
vmovdqa [rdi+160+256*\off],ymm4
231-
vmovdqa [rdi+192+256*\off],ymm3
232-
vmovdqa [rdi+224+256*\off],ymm11
200+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
201+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2
202+
vpsrlq $32,%ymm1,%ymm10
203+
vmovshdup %ymm2,%ymm15
204+
butterfly 9,8,1,10,2,15
205+
206+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
207+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
208+
vpsrlq $32,%ymm1,%ymm10
209+
vmovshdup %ymm2,%ymm15
210+
butterfly 7,6,1,10,2,15
211+
212+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
213+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
214+
vpsrlq $32,%ymm1,%ymm10
215+
vmovshdup %ymm2,%ymm15
216+
butterfly 5,4,1,10,2,15
217+
218+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
219+
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
220+
vpsrlq $32,%ymm1,%ymm10
221+
vmovshdup %ymm2,%ymm15
222+
butterfly 3,11,1,10,2,15
223+
224+
vmovdqa %ymm9,256*\off+0(%rdi)
225+
vmovdqa %ymm8,256*\off+32(%rdi)
226+
vmovdqa %ymm7,256*\off+64(%rdi)
227+
vmovdqa %ymm6,256*\off+96(%rdi)
228+
vmovdqa %ymm5,256*\off+128(%rdi)
229+
vmovdqa %ymm4,256*\off+160(%rdi)
230+
vmovdqa %ymm3,256*\off+192(%rdi)
231+
vmovdqa %ymm11,256*\off+224(%rdi)
233232
.endm
234233

235234
#ifdef __APPLE__
@@ -240,21 +239,21 @@ _mldsa_ntt:
240239
mldsa_ntt:
241240
#endif
242241
.cfi_startproc
243-
endbr64
242+
endbr64
244243

245-
vmovdqa ymm0,[rsi+MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4]
244+
vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi), %ymm0
246245

247-
levels0t1 0
248-
levels0t1 1
249-
levels0t1 2
250-
levels0t1 3
246+
levels0t1 0
247+
levels0t1 1
248+
levels0t1 2
249+
levels0t1 3
251250

252-
levels2t7 0
253-
levels2t7 1
254-
levels2t7 2
255-
levels2t7 3
251+
levels2t7 0
252+
levels2t7 1
253+
levels2t7 2
254+
levels2t7 3
256255

257-
ret
256+
ret
258257
.cfi_endproc
259258

260259
#if defined(__linux__) && defined(__ELF__)

0 commit comments

Comments (0)