@@ -44,6 +44,16 @@ vpsrlq $32,%ymm\r0,%ymm\r0
 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
 .endm
 
+/*
+ * Compute l + montmul(h, zh) and l - montmul(h, zh), then store the results
+ * back to l and h, respectively.
+ *
+ * Although the general absolute bound for a Montgomery multiplication is
+ * 3q/4, we use the more convenient bound q here.
+ *
+ * Consequently, the magnitude of every coefficient grows by at most q with
+ * each layer.
+ */
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 vpmuldq %ymm\zl0,%ymm\h,%ymm13
 vmovshdup %ymm\h,%ymm12
@@ -56,16 +66,30 @@ vpmuldq %ymm0,%ymm13,%ymm13
 vpmuldq %ymm0,%ymm14,%ymm14
 
 vmovshdup %ymm\h,%ymm\h
-vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h
+vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */
 
-vpsubd %ymm\h,%ymm\l,%ymm12
-vpaddd %ymm\h,%ymm\l,%ymm\l
+/*
+ * To complete the Montgomery multiplication, mulhi(q, mullo(h, zl)) still
+ * has to be subtracted from mulhi(h, zh):
+ *
+ *   montmul(h, zh) = mulhi(h, zh) - mulhi(q, mullo(h, zl)).
+ *
+ * Since mulhi(q, mullo(h, zl)) has not been computed yet at this point, this
+ * subtraction is deferred until after the add/sub below.
+ */
+vpsubd %ymm\h,%ymm\l,%ymm12  /* l - mulhi(h, zh)
+                              * = l - montmul(h, zh)
+                              *   - mulhi(q, mullo(h, zl)) */
+vpaddd %ymm\h,%ymm\l,%ymm\l  /* l + mulhi(h, zh)
+                              * = l + montmul(h, zh)
+                              *   + mulhi(q, mullo(h, zl)) */
 
 vmovshdup %ymm13,%ymm13
-vpblendd $0xAA,%ymm14,%ymm13,%ymm13
+vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q, mullo(h, zl)) */
 
-vpaddd %ymm13,%ymm12,%ymm\h
-vpsubd %ymm13,%ymm\l,%ymm\l
+/* Complete the deferred subtraction mentioned above */
+vpaddd %ymm13,%ymm12,%ymm\h /* l - montmul(h, zh) */
+vpsubd %ymm13,%ymm\l,%ymm\l /* l + montmul(h, zh) */
 .endm
 
 .macro levels0t1 off
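
For reference, here is a minimal C sketch of what one `butterfly` invocation
computes, assuming the usual signed Montgomery arithmetic with R = 2^32 and
the ML-DSA modulus; the names `montmul` and `butterfly_ref` are illustrative,
not part of this repository. The assembly additionally splits each twiddle
factor into a premultiplied low half zl = z*QINV and a high half zh = z to
save multiplications; the sketch folds that back into a single product.

#include <stdint.h>

#define Q 8380417     /* ML-DSA modulus (assumed here) */
#define QINV 58728449 /* Q^-1 mod 2^32 */

/* Signed Montgomery multiplication: returns a*b*2^-32 mod Q, with
 * absolute value < Q. */
static int32_t montmul(int32_t a, int32_t b)
{
  int64_t t = (int64_t)a * b;
  /* mullo: low 32 bits of t * Q^-1 */
  int32_t m = (int32_t)((uint32_t)t * (uint32_t)QINV);
  /* mulhi(a, b) - mulhi(Q, m), fused into one subtraction and shift */
  return (int32_t)((t - (int64_t)m * Q) >> 32);
}

/* One butterfly: l' = l + montmul(h, z), h' = l - montmul(h, z). */
static void butterfly_ref(int32_t *l, int32_t *h, int32_t z)
{
  int32_t t = montmul(*h, z);
  *h = *l - t;
  *l = *l + t;
}

The assembly reaches the same result by forming l +- mulhi(h, zh) first and
folding in the -+ mulhi(q, mullo(h, zl)) correction afterwards, which is
valid because all of these operations are linear.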
@@ -82,11 +106,15 @@ vmovdqa 640+32*\off(%rdi),%ymm9
 vmovdqa 768+32*\off(%rdi),%ymm10
 vmovdqa 896+32*\off(%rdi),%ymm11
 
+/* All: abs bound < q */
+
 butterfly 4,8
 butterfly 5,9
 butterfly 6,10
 butterfly 7,11
 
+/* All: abs bound < 2q */
+
 /* level 1 */
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
@@ -98,6 +126,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
 butterfly 8,10
 butterfly 9,11
 
+/* All: abs bound < 3q */
+
 vmovdqa %ymm4,  0+32*\off(%rdi)
 vmovdqa %ymm5,128+32*\off(%rdi)
 vmovdqa %ymm6,256+32*\off(%rdi)
@@ -132,6 +162,8 @@ shuffle8 5,9,4,9
 shuffle8 6,10,5,10
 shuffle8 7,11,6,11
 
+/* All: abs bound < 4q */
+
 /* level 3 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2
@@ -146,6 +178,8 @@ shuffle4 8,10,3,10
 shuffle4 4,6,8,6
 shuffle4 9,11,4,11
 
+/* All: abs bound < 5q */
+
 /* level 4 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2
@@ -160,6 +194,8 @@ shuffle2 5,6,7,6
 shuffle2 3,4,5,4
 shuffle2 10,11,3,11
 
+/* All: abs bound < 6q */
+
 /* level 5 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2
@@ -171,6 +207,8 @@ butterfly 8,4,1,10,2,15
 butterfly 7,3,1,10,2,15
 butterfly 6,11,1,10,2,15
 
+/* All: abs bound < 7q */
+
 /* level 6 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2
@@ -186,6 +224,8 @@ vmovshdup %ymm2,%ymm15
 butterfly 5,3,1,10,2,15
 butterfly 4,11,1,10,2,15
 
+/* All: abs bound < 8q */
+
 /* level 7 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2
@@ -211,6 +251,8 @@ vpsrlq $32,%ymm1,%ymm10
 vmovshdup %ymm2,%ymm15
 butterfly 3,11,1,10,2,15
 
+/* All: abs bound < 9q */
+
 vmovdqa %ymm9,256*\off+  0(%rdi)
 vmovdqa %ymm8,256*\off+ 32(%rdi)
 vmovdqa %ymm7,256*\off+ 64(%rdi)
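
The running "All: abs bound < kq" annotations follow inductively from the
bound stated on the butterfly macro: if every coefficient entering a layer
satisfies |a| < kq, and |montmul(h, zh)| < q, then

\[
  |l \pm \mathrm{montmul}(h, zh)| \le |l| + |\mathrm{montmul}(h, zh)|
  < kq + q = (k+1)q .
\]

Eight layers starting from |a| < q therefore end at |a| < 9q. Assuming the
ML-DSA modulus q = 8380417, this gives 9q = 75423753 < 2^27, comfortably
inside a signed 32-bit lane.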