@@ -43,6 +43,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
4343vpblendd $0xAA ,%ymm\r1,%ymm\r0,%ymm\r3
4444.endm
4545
46+ /*
47+ * Compute l + h and montmul(h - l, zh), then store the results back to l and
48+ * h, respectively.
49+ *
50+ * The general abs bound of Montgomery multiplication is 3q/4.
51+ */
4652.macro butterfly l,h,zl0 =1 ,zl1 =1 ,zh0 =2 ,zh1 =2
4753vpsubd %ymm\l,%ymm\h,%ymm12
4854vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -74,6 +80,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
7480vmovdqa 256*\off+192 (%rdi ),%ymm10
7581vmovdqa 256*\off+224 (%rdi ),%ymm11
7682
83+ /* All: abs bound < q */
84+
7785/* level 0 */
7886vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296 -8*\off-8 )*4 (%rsi ),%ymm3
7987vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -99,6 +107,19 @@ vmovshdup %ymm3,%ymm1
99107vmovshdup %ymm15 ,%ymm2
100108butterfly 10 ,11 ,1 ,3 ,2 ,15
101109
110+ /* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
111+ /*
112+ * Note that since 2^31 / q > 256, the sum of all 256 coefficients does not
113+ * overflow. This allows us to greatly simplify the range analysis by relaxing
114+ * and unifying the bounds of all coefficients on the same layer. As a concrete
115+ * example, here we relax the bounds on 5, 7, 9, 11 and conclude that
116+ *
117+ * All: abs bound < 2q
118+ *
119+ * In all but the last of the following layers, we do the same relaxation
120+ * without explicit mention.
121+ */
122+
102123/* level 1 */
103124vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 -8*\off-8 )*4 (%rsi ),%ymm3
104125vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -114,6 +135,8 @@ vmovshdup %ymm15,%ymm2
114135butterfly 8 ,10 ,1 ,3 ,2 ,15
115136butterfly 9 ,11 ,1 ,3 ,2 ,15
116137
138+ /* All: abs bound < 4q */
139+
117140/* level 2 */
118141vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 -8*\off-8 )*4 (%rsi ),%ymm3
119142vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -124,6 +147,8 @@ butterfly 5,9,1,3,2,15
124147butterfly 6 ,10 ,1 ,3 ,2 ,15
125148butterfly 7 ,11 ,1 ,3 ,2 ,15
126149
150+ /* All: abs bound < 8q */
151+
127152/* level 3 */
128153shuffle2 4 ,5 ,3 ,5
129154shuffle2 6 ,7 ,4 ,7
@@ -137,6 +162,8 @@ butterfly 4,7
137162butterfly 6 ,9
138163butterfly 8 ,11
139164
165+ /* All: abs bound < 16q */
166+
140167/* level 4 */
141168shuffle4 3 ,4 ,10 ,4
142169shuffle4 6 ,8 ,3 ,8
@@ -150,6 +177,8 @@ butterfly 3,8
150177butterfly 6 ,7
151178butterfly 5 ,11
152179
180+ /* All: abs bound < 32q */
181+
153182/* level 5 */
154183shuffle8 10 ,3 ,9 ,3
155184shuffle8 6 ,5 ,10 ,5
@@ -163,6 +192,8 @@ butterfly 10,5
163192butterfly 6 ,8
164193butterfly 4 ,11
165194
195+ /* All: abs bound < 64q */
196+
166197vmovdqa %ymm9 ,256*\off+ 0 (%rdi )
167198vmovdqa %ymm10 ,256*\off+ 32 (%rdi )
168199vmovdqa %ymm6 ,256*\off+ 64 (%rdi )
@@ -194,6 +225,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
194225butterfly 8 ,10
195226butterfly 9 ,11
196227
228+ /* All: abs bound < 128q */
229+
197230/* level 7 */
198231vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0 )*4 (%rsi ),%ymm1
199232vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0 )*4 (%rsi ),%ymm2
@@ -203,11 +236,27 @@ butterfly 5,9
203236butterfly 6 ,10
204237butterfly 7 ,11
205238
239+ /* 4, 5, 6, 7: abs bound < 256q; 8, 9, 10, 11: abs bound < 3q/4 */
240+
206241vmovdqa %ymm8 ,512 +32*\off(%rdi )
207242vmovdqa %ymm9 ,640 +32*\off(%rdi )
208243vmovdqa %ymm10 ,768 +32*\off(%rdi )
209244vmovdqa %ymm11 ,896 +32*\off(%rdi )
210245
246+ /*
247+ * In order to (a) remove the factor of 256 arising from the 256-point
248+ * inverse-NTT butterflies and (b) transform the output into the Montgomery
249+ * domain, we need to multiply all coefficients by 2^32/256.
250+ *
251+ * For ymm{8,9,10,11}, the scaling has been merged into the last butterfly, so
252+ * only ymm{4,5,6,7} need to be scaled explicitly.
253+ *
254+ * The scaling is achieved by computing montmul(-, MLD_AVX2_DIV), so the output
255+ * will have an abs bound of 3q/4.
256+ *
257+ * 4, 5, 6, 7: abs bound < 256q
258+ */
259+
211260vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4 (%rsi ),%ymm1
212261vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4 (%rsi ),%ymm2
213262vpmuldq %ymm1 ,%ymm4 ,%ymm12
@@ -256,6 +305,8 @@ vmovshdup %ymm7,%ymm7
256305vpblendd $0xAA ,%ymm8 ,%ymm6 ,%ymm6
257306vpblendd $0xAA ,%ymm9 ,%ymm7 ,%ymm7
258307
308+ /* 4, 5, 6, 7: abs bound < 3q/4 */
309+
259310vmovdqa %ymm4 , 0 +32*\off(%rdi )
260311vmovdqa %ymm5 ,128 +32*\off(%rdi )
261312vmovdqa %ymm6 ,256 +32*\off(%rdi )
0 commit comments