2626// Microsoft x64 ABI: RCX = a, RDX = zetas
2727// ----------------------------------------------------------------------------
2828
29- .intel_syntax noprefix
3029.text
3130
32- #define a rdi
33- #define zetas rsi
31+ #define a % rdi
32+ #define zetas % rsi
3433
3534#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0
3635#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV 8
4039#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328
4140
4241.macro shuffle8 r0,r1,r2,r3
43- vperm2i128 ymm\r2, ymm\r0,ymm\r1, 0x20
44- vperm2i128 ymm\r3, ymm\r0,ymm\r1, 0x31
42+ vperm2i128 $0x20 ,% ymm\r1,% ymm\r0,% ymm\r2
43+ vperm2i128 $0x31 ,% ymm\r1,% ymm\r0,% ymm\r3
4544.endm
4645
4746.macro shuffle4 r0,r1,r2,r3
48- vpunpcklqdq ymm\r2, ymm\r0,ymm\r1
49- vpunpckhqdq ymm\r3, ymm\r0,ymm\r1
47+ vpunpcklqdq % ymm\r1,% ymm\r0,% ymm\r2
48+ vpunpckhqdq % ymm\r1,% ymm\r0,% ymm\r3
5049.endm
5150
5251.macro shuffle2 r0,r1,r2,r3
53- vmovsldup ymm\r2, ymm\r1
54- vpblendd ymm\r2,ymm\r0,ymm\r2, 0xAA
55- vpsrlq ymm\r0,ymm\r0, 32
56- vpblendd ymm\r3, ymm\r0,ymm\r1, 0xAA
52+ vmovsldup % ymm\r1,% ymm\r2
53+ vpblendd $0xAA ,% ymm\r2,% ymm\r0,% ymm\r2
54+ vpsrlq $32 ,% ymm\r0,% ymm\r0
55+ vpblendd $0xAA ,% ymm\r1,% ymm\r0,% ymm\r3
5756.endm
5857
5958.macro butterfly l,h,zl0 =1 ,zl1 =1 ,zh0 =2 ,zh1 =2
60- vpmuldq ymm13, ymm\h, ymm\zl0
61- vmovshdup ymm12, ymm\h
62- vpmuldq ymm14,ymm12, ymm\zl1
59+ vpmuldq % ymm\zl0,% ymm\h, %ymm13
60+ vmovshdup % ymm\h, %ymm12
61+ vpmuldq % ymm\zl1, %ymm12 , %ymm14
6362
64- vpmuldq ymm\h, ymm\h,ymm\zh0
65- vpmuldq ymm12,ymm12, ymm\zh1
63+ vpmuldq % ymm\zh0,% ymm\h,% ymm\h
64+ vpmuldq % ymm\zh1, %ymm12 , %ymm12
6665
67- vpmuldq ymm13, ymm13,ymm0
68- vpmuldq ymm14, ymm14,ymm0
66+ vpmuldq %ymm0 , % ymm13 ,%ymm13
67+ vpmuldq %ymm0 , % ymm14 ,%ymm14
6968
70- vmovshdup ymm\h,ymm\h
71- vpblendd ymm\h,ymm\h,ymm12, 0xAA
69+ vmovshdup % ymm\h,% ymm\h
70+ vpblendd $0xAA , %ymm12 ,% ymm\h,% ymm\h
7271
73- vpsubd ymm12, ymm\l, ymm\h
74- vpaddd ymm\l, ymm\l,ymm\h
72+ vpsubd % ymm\h,% ymm\l, %ymm12
73+ vpaddd % ymm\h,% ymm\l,% ymm\l
7574
76- vmovshdup ymm13,ymm13
77- vpblendd ymm13,ymm13, ymm14,0xAA
75+ vmovshdup % ymm13 ,% ymm13
76+ vpblendd $0xAA , % ymm14 ,%ymm13 , %ymm13
7877
79- vpaddd ymm\h, ymm12,ymm13
80- vpsubd ymm\l,ymm\l,ymm13
78+ vpaddd %ymm13 , % ymm12 ,%ymm\h
79+ vpsubd %ymm13 ,% ymm\l,% ymm\l
8180.endm
8281
8382.macro levels0t1 off
8483/* level 0 */
85- vpbroadcastd ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1 )*4]
86- vpbroadcastd ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1 )*4]
87-
88- vmovdqa ymm4,[rdi+ 0 +32*\off]
89- vmovdqa ymm5,[rdi+ 128 +32*\off]
90- vmovdqa ymm6,[rdi+ 256 +32*\off]
91- vmovdqa ymm7,[rdi+ 384 +32*\off]
92- vmovdqa ymm8,[rdi+ 512 +32*\off]
93- vmovdqa ymm9,[rdi+ 640 +32*\off]
94- vmovdqa ymm10,[rdi+ 768 +32*\off]
95- vmovdqa ymm11,[rdi+ 896 +32*\off]
96-
97- butterfly 4 ,8
98- butterfly 5 ,9
99- butterfly 6 ,10
100- butterfly 7 ,11
84+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1 )*4 ( %rsi ), %ymm1
85+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1 )*4 ( %rsi ), %ymm2
86+
87+ vmovdqa 0 +32*\off( %rdi ), %ymm4
88+ vmovdqa 128 +32*\off( %rdi ), %ymm5
89+ vmovdqa 256 +32*\off( %rdi ), %ymm6
90+ vmovdqa 384 +32*\off( %rdi ), %ymm7
91+ vmovdqa 512 +32*\off( %rdi ), %ymm8
92+ vmovdqa 640 +32*\off( %rdi ), %ymm9
93+ vmovdqa 768 +32*\off( %rdi ), %ymm10
94+ vmovdqa 896 +32*\off( %rdi ), %ymm11
95+
96+ butterfly 4 ,8
97+ butterfly 5 ,9
98+ butterfly 6 ,10
99+ butterfly 7 ,11
101100
102101/* level 1 */
103- vpbroadcastd ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2 )*4]
104- vpbroadcastd ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2 )*4]
105- butterfly 4 ,6
106- butterfly 5 ,7
107-
108- vpbroadcastd ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3 )*4]
109- vpbroadcastd ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3 )*4]
110- butterfly 8 ,10
111- butterfly 9 ,11
112-
113- vmovdqa [rdi+ 0 +32*\off],ymm4
114- vmovdqa [rdi+ 128 +32*\off],ymm5
115- vmovdqa [rdi+ 256 +32*\off],ymm6
116- vmovdqa [rdi+ 384 +32*\off],ymm7
117- vmovdqa [rdi+ 512 +32*\off],ymm8
118- vmovdqa [rdi+ 640 +32*\off],ymm9
119- vmovdqa [rdi+ 768 +32*\off],ymm10
120- vmovdqa [rdi+ 896 +32*\off],ymm11
102+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2 )*4 ( %rsi ), %ymm1
103+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2 )*4 ( %rsi ), %ymm2
104+ butterfly 4 ,6
105+ butterfly 5 ,7
106+
107+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3 )*4 ( %rsi ), %ymm1
108+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3 )*4 ( %rsi ), %ymm2
109+ butterfly 8 ,10
110+ butterfly 9 ,11
111+
112+ vmovdqa %ymm4 , 0 +32*\off( %rdi )
113+ vmovdqa %ymm5 , 128 +32*\off( %rdi )
114+ vmovdqa %ymm6 , 256 +32*\off( %rdi )
115+ vmovdqa %ymm7 , 384 +32*\off( %rdi )
116+ vmovdqa %ymm8 , 512 +32*\off( %rdi )
117+ vmovdqa %ymm9 , 640 +32*\off( %rdi )
118+ vmovdqa %ymm10 , 768 +32*\off( %rdi )
119+ vmovdqa %ymm11 , 896 +32*\off( %rdi )
121120.endm
122121
123122.macro levels2t7 off
124123/* level 2 */
125- vmovdqa ymm4,[rdi+ 0 + 256*\off]
126- vmovdqa ymm5,[rdi+ 32 + 256*\off]
127- vmovdqa ymm6,[rdi+ 64 + 256*\off]
128- vmovdqa ymm7,[rdi+ 96 + 256*\off]
129- vmovdqa ymm8,[rdi+ 128 + 256*\off]
130- vmovdqa ymm9,[rdi+ 160 + 256*\off]
131- vmovdqa ymm10,[rdi+ 192 + 256*\off]
132- vmovdqa ymm11,[rdi+ 224 + 256*\off]
133- vpbroadcastd ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4 +\off)*4]
134- vpbroadcastd ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4 +\off)*4]
135-
136- butterfly 4 ,8
137- butterfly 5 ,9
138- butterfly 6 ,10
139- butterfly 7 ,11
140-
141- shuffle8 4 ,8 ,3 ,8
142- shuffle8 5 ,9 ,4 ,9
143- shuffle8 6 ,10 ,5 ,10
144- shuffle8 7 ,11 ,6 ,11
124+ vmovdqa 256*\off+ 0 ( %rdi ), %ymm4
125+ vmovdqa 256*\off+ 32 ( %rdi ), %ymm5
126+ vmovdqa 256*\off+ 64 ( %rdi ), %ymm6
127+ vmovdqa 256*\off+ 96 ( %rdi ), %ymm7
128+ vmovdqa 256*\off+ 128 ( %rdi ), %ymm8
129+ vmovdqa 256*\off+ 160 ( %rdi ), %ymm9
130+ vmovdqa 256*\off+ 192 ( %rdi ), %ymm10
131+ vmovdqa 256*\off+ 224 ( %rdi ), %ymm11
132+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4 +\off)*4 ( %rsi ), %ymm1
133+ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4 +\off)*4 ( %rsi ), %ymm2
134+
135+ butterfly 4 ,8
136+ butterfly 5 ,9
137+ butterfly 6 ,10
138+ butterfly 7 ,11
139+
140+ shuffle8 4 ,8 ,3 ,8
141+ shuffle8 5 ,9 ,4 ,9
142+ shuffle8 6 ,10 ,5 ,10
143+ shuffle8 7 ,11 ,6 ,11
145144
146145/* level 3 */
147- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8 +8*\off)*4]
148- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8 +8*\off)*4]
146+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8 +8*\off)*4 ( %rsi ), %ymm1
147+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8 +8*\off)*4 ( %rsi ), %ymm2
149148
150- butterfly 3 ,5
151- butterfly 8 ,10
152- butterfly 4 ,6
153- butterfly 9 ,11
149+ butterfly 3 ,5
150+ butterfly 8 ,10
151+ butterfly 4 ,6
152+ butterfly 9 ,11
154153
155- shuffle4 3 ,5 ,7 ,5
156- shuffle4 8 ,10 ,3 ,10
157- shuffle4 4 ,6 ,8 ,6
158- shuffle4 9 ,11 ,4 ,11
154+ shuffle4 3 ,5 ,7 ,5
155+ shuffle4 8 ,10 ,3 ,10
156+ shuffle4 4 ,6 ,8 ,6
157+ shuffle4 9 ,11 ,4 ,11
159158
160159/* level 4 */
161- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40 +8*\off)*4]
162- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40 +8*\off)*4]
160+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40 +8*\off)*4 ( %rsi ), %ymm1
161+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40 +8*\off)*4 ( %rsi ), %ymm2
163162
164- butterfly 7 ,8
165- butterfly 5 ,6
166- butterfly 3 ,4
167- butterfly 10 ,11
163+ butterfly 7 ,8
164+ butterfly 5 ,6
165+ butterfly 3 ,4
166+ butterfly 10 ,11
168167
169- shuffle2 7 ,8 ,9 ,8
170- shuffle2 5 ,6 ,7 ,6
171- shuffle2 3 ,4 ,5 ,4
172- shuffle2 10 ,11 ,3 ,11
168+ shuffle2 7 ,8 ,9 ,8
169+ shuffle2 5 ,6 ,7 ,6
170+ shuffle2 3 ,4 ,5 ,4
171+ shuffle2 10 ,11 ,3 ,11
173172
174173/* level 5 */
175- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72 +8*\off)*4]
176- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72 +8*\off)*4]
177- vpsrlq ymm10, ymm1,32
178- vmovshdup ymm15, ymm2
174+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72 +8*\off)*4 ( %rsi ), %ymm1
175+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72 +8*\off)*4 ( %rsi ), %ymm2
176+ vpsrlq $32 , % ymm1 ,%ymm10
177+ vmovshdup % ymm2, %ymm15
179178
180- butterfly 9 ,5 ,1 ,10 ,2 ,15
181- butterfly 8 ,4 ,1 ,10 ,2 ,15
182- butterfly 7 ,3 ,1 ,10 ,2 ,15
183- butterfly 6 ,11 ,1 ,10 ,2 ,15
179+ butterfly 9 ,5 ,1 ,10 ,2 ,15
180+ butterfly 8 ,4 ,1 ,10 ,2 ,15
181+ butterfly 7 ,3 ,1 ,10 ,2 ,15
182+ butterfly 6 ,11 ,1 ,10 ,2 ,15
184183
185184/* level 6 */
186- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 +8*\off)*4]
187- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 +8*\off)*4]
188- vpsrlq ymm10, ymm1,32
189- vmovshdup ymm15, ymm2
190- butterfly 9 ,7 ,1 ,10 ,2 ,15
191- butterfly 8 ,6 ,1 ,10 ,2 ,15
192-
193- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 +8*\off+32 )*4]
194- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 +8*\off+32 )*4]
195- vpsrlq ymm10, ymm1,32
196- vmovshdup ymm15, ymm2
197- butterfly 5 ,3 ,1 ,10 ,2 ,15
198- butterfly 4 ,11 ,1 ,10 ,2 ,15
185+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 +8*\off)*4 ( %rsi ), %ymm1
186+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 +8*\off)*4 ( %rsi ), %ymm2
187+ vpsrlq $32 , % ymm1 ,%ymm10
188+ vmovshdup % ymm2, %ymm15
189+ butterfly 9 ,7 ,1 ,10 ,2 ,15
190+ butterfly 8 ,6 ,1 ,10 ,2 ,15
191+
192+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 +8*\off+32 )*4 ( %rsi ), %ymm1
193+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 +8*\off+32 )*4 ( %rsi ), %ymm2
194+ vpsrlq $32 , % ymm1 ,%ymm10
195+ vmovshdup % ymm2, %ymm15
196+ butterfly 5 ,3 ,1 ,10 ,2 ,15
197+ butterfly 4 ,11 ,1 ,10 ,2 ,15
199198
200199/* level 7 */
201- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off)*4]
202- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off)*4]
203- vpsrlq ymm10, ymm1,32
204- vmovshdup ymm15, ymm2
205- butterfly 9 ,8 ,1 ,10 ,2 ,15
206-
207- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off+32 )*4]
208- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off+32 )*4]
209- vpsrlq ymm10, ymm1,32
210- vmovshdup ymm15, ymm2
211- butterfly 7 ,6 ,1 ,10 ,2 ,15
212-
213- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off+64 )*4]
214- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off+64 )*4]
215- vpsrlq ymm10, ymm1,32
216- vmovshdup ymm15, ymm2
217- butterfly 5 ,4 ,1 ,10 ,2 ,15
218-
219- vmovdqa ymm1,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off+96 )*4]
220- vmovdqa ymm2,[rsi+ (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off+96 )*4]
221- vpsrlq ymm10, ymm1,32
222- vmovshdup ymm15, ymm2
223- butterfly 3 ,11 ,1 ,10 ,2 ,15
224-
225- vmovdqa [rdi+ 0 + 256*\off],ymm9
226- vmovdqa [rdi+ 32 + 256*\off],ymm8
227- vmovdqa [rdi+ 64 + 256*\off],ymm7
228- vmovdqa [rdi+ 96 + 256*\off],ymm6
229- vmovdqa [rdi+ 128 + 256*\off],ymm5
230- vmovdqa [rdi+ 160 + 256*\off],ymm4
231- vmovdqa [rdi+ 192 + 256*\off],ymm3
232- vmovdqa [rdi+ 224 + 256*\off],ymm11
200+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off)*4 ( %rsi ), %ymm1
201+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off)*4 ( %rsi ), %ymm2
202+ vpsrlq $32 , % ymm1 ,%ymm10
203+ vmovshdup % ymm2, %ymm15
204+ butterfly 9 ,8 ,1 ,10 ,2 ,15
205+
206+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off+32 )*4 ( %rsi ), %ymm1
207+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off+32 )*4 ( %rsi ), %ymm2
208+ vpsrlq $32 , % ymm1 ,%ymm10
209+ vmovshdup % ymm2, %ymm15
210+ butterfly 7 ,6 ,1 ,10 ,2 ,15
211+
212+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off+64 )*4 ( %rsi ), %ymm1
213+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off+64 )*4 ( %rsi ), %ymm2
214+ vpsrlq $32 , % ymm1 ,%ymm10
215+ vmovshdup % ymm2, %ymm15
216+ butterfly 5 ,4 ,1 ,10 ,2 ,15
217+
218+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 +8*\off+96 )*4 ( %rsi ), %ymm1
219+ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 +8*\off+96 )*4 ( %rsi ), %ymm2
220+ vpsrlq $32 , % ymm1 ,%ymm10
221+ vmovshdup % ymm2, %ymm15
222+ butterfly 3 ,11 ,1 ,10 ,2 ,15
223+
224+ vmovdqa %ymm9 , 256*\off+ 0 ( %rdi )
225+ vmovdqa %ymm8 , 256*\off+ 32 ( %rdi )
226+ vmovdqa %ymm7 , 256*\off+ 64 ( %rdi )
227+ vmovdqa %ymm6 , 256*\off+ 96 ( %rdi )
228+ vmovdqa %ymm5 , 256*\off+ 128 ( %rdi )
229+ vmovdqa %ymm4 , 256*\off+ 160 ( %rdi )
230+ vmovdqa %ymm3 , 256*\off+ 192 ( %rdi )
231+ vmovdqa %ymm11 , 256*\off+ 224 ( %rdi )
233232.endm
234233
235234#ifdef __APPLE__
@@ -240,21 +239,21 @@ _mldsa_ntt:
240239mldsa_ntt:
241240#endif
242241.cfi_startproc
243- endbr64
242+ endbr64
244243
245- vmovdqa ymm0,[rsi+ MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4]
244+ vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4 ( %rsi ), %ymm0
246245
247- levels0t1 0
248- levels0t1 1
249- levels0t1 2
250- levels0t1 3
246+ levels0t1 0
247+ levels0t1 1
248+ levels0t1 2
249+ levels0t1 3
251250
252- levels2t7 0
253- levels2t7 1
254- levels2t7 2
255- levels2t7 3
251+ levels2t7 0
252+ levels2t7 1
253+ levels2t7 2
254+ levels2t7 3
256255
257- ret
256+ ret
258257.cfi_endproc
259258
260259#if defined(__linux__) && defined(__ELF__)
0 commit comments