@@ -43,6 +43,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
4343vpblendd $0xAA ,%ymm\r1,%ymm\r0,%ymm\r3
4444.endm
4545
46+ /*
47+ * Compute l + h and montmul(h - l, zh), then store the results back to l and
48+ * h, respectively.
49+ *
50+ * The general abs bound of Montgomery multiplication is 3q/4.
51+ */
4652.macro butterfly l,h,zl0 =1 ,zl1 =1 ,zh0 =2 ,zh1 =2
4753vpsubd %ymm\l,%ymm\h,%ymm12
4854vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -74,6 +80,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
7480vmovdqa 256*\off+192 (%rdi ),%ymm10
7581vmovdqa 256*\off+224 (%rdi ),%ymm11
7682
83+ /* All: abs bound < q */
84+
7785/* level 0 */
7886vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296 -8*\off-8 )*4 (%rsi ),%ymm3
7987vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -99,6 +107,18 @@ vmovshdup %ymm3,%ymm1
99107vmovshdup %ymm15 ,%ymm2
100108butterfly 10 ,11 ,1 ,3 ,2 ,15
101109
110+ /* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
111+ /*
112+ * Note that since 2^31 / q > 256, a sum of 256 coefficients, each of abs
113+ * value < q, cannot overflow int32. This lets us simplify the range analysis by
114+ * relaxing and unifying the bounds of all coefficients on the same level. As a
115+ * concrete example, here we relax the bounds on 5, 7, 9, 11 and conclude that
116+ *
117+ * All: abs bound < 2q
118+ *
119+ * In all following levels, we apply the same relaxation without explicit mention.
120+ */
121+
102122/* level 1 */
103123vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 -8*\off-8 )*4 (%rsi ),%ymm3
104124vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -114,6 +134,8 @@ vmovshdup %ymm15,%ymm2
114134butterfly 8 ,10 ,1 ,3 ,2 ,15
115135butterfly 9 ,11 ,1 ,3 ,2 ,15
116136
137+ /* All: abs bound < 4q */
138+
117139/* level 2 */
118140vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 -8*\off-8 )*4 (%rsi ),%ymm3
119141vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -124,6 +146,8 @@ butterfly 5,9,1,3,2,15
124146butterfly 6 ,10 ,1 ,3 ,2 ,15
125147butterfly 7 ,11 ,1 ,3 ,2 ,15
126148
149+ /* All: abs bound < 8q */
150+
127151/* level 3 */
128152shuffle2 4 ,5 ,3 ,5
129153shuffle2 6 ,7 ,4 ,7
@@ -137,6 +161,8 @@ butterfly 4,7
137161butterfly 6 ,9
138162butterfly 8 ,11
139163
164+ /* All: abs bound < 16q */
165+
140166/* level 4 */
141167shuffle4 3 ,4 ,10 ,4
142168shuffle4 6 ,8 ,3 ,8
@@ -150,6 +176,8 @@ butterfly 3,8
150176butterfly 6 ,7
151177butterfly 5 ,11
152178
179+ /* All: abs bound < 32q */
180+
153181/* level 5 */
154182shuffle8 10 ,3 ,9 ,3
155183shuffle8 6 ,5 ,10 ,5
@@ -163,6 +191,8 @@ butterfly 10,5
163191butterfly 6 ,8
164192butterfly 4 ,11
165193
194+ /* All: abs bound < 64q */
195+
166196vmovdqa %ymm9 ,256*\off+ 0 (%rdi )
167197vmovdqa %ymm10 ,256*\off+ 32 (%rdi )
168198vmovdqa %ymm6 ,256*\off+ 64 (%rdi )
@@ -194,6 +224,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
194224butterfly 8 ,10
195225butterfly 9 ,11
196226
227+ /* All: abs bound < 128q */
228+
197229/* level 7 */
198230vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0 )*4 (%rsi ),%ymm1
199231vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0 )*4 (%rsi ),%ymm2
@@ -203,6 +235,8 @@ butterfly 5,9
203235butterfly 6 ,10
204236butterfly 7 ,11
205237
238+ /* All: abs bound < 256q */
239+
206240vmovdqa %ymm8 ,512 +32*\off(%rdi )
207241vmovdqa %ymm9 ,640 +32*\off(%rdi )
208242vmovdqa %ymm10 ,768 +32*\off(%rdi )
@@ -256,6 +290,13 @@ vmovshdup %ymm7,%ymm7
256290vpblendd $0xAA ,%ymm8 ,%ymm6 ,%ymm6
257291vpblendd $0xAA ,%ymm9 ,%ymm7 ,%ymm7
258292
293+ /*
294+ * All coefficients are Montgomery-multiplied with the same constant. This
295+ * reduces the magnitudes of all coefficients and results in the bound
296+ *
297+ * All: abs bound < 3q/4
298+ */
299+
259300vmovdqa %ymm4 , 0 +32*\off(%rdi )
260301vmovdqa %ymm5 ,128 +32*\off(%rdi )
261302vmovdqa %ymm6 ,256 +32*\off(%rdi )
0 commit comments