@@ -42,6 +42,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
4242vpblendd $0xAA ,%ymm\r1,%ymm\r0,%ymm\r3
4343.endm
4444
45+ /*
46+ * Compute l + h and montmul(h - l, zh), then store the results back to l and
47+ * h, respectively.
48+ *
49+ * In general, the output of a Montgomery multiplication has abs bound < 3q/4.
50+ */
4551.macro butterfly l,h,zl0 =1 ,zl1 =1 ,zh0 =2 ,zh1 =2
4652vpsubd %ymm\l,%ymm\h,%ymm12
4753vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -73,6 +79,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
7379vmovdqa 256*\off+192 (%rdi ),%ymm10
7480vmovdqa 256*\off+224 (%rdi ),%ymm11
7581
82+ /* All: abs bound < q */
83+
7684/* level 0 */
7785vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296 -8*\off-8 )*4 (%rsi ),%ymm3
7886vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -98,6 +106,18 @@ vmovshdup %ymm3,%ymm1
98106vmovshdup %ymm15 ,%ymm2
99107butterfly 10 ,11 ,1 ,3 ,2 ,15
100108
109+ /* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
110+ /*
111+ * Note that since 2^31 / q > 256, the sum of all 256 coefficients cannot
112+ * overflow a signed 32-bit lane. This allows us to greatly simplify the range
113+ * analysis by relaxing and unifying the bounds of all coefficients on the same
114+ * layer. For example, here we relax the bounds on 5, 7, 9, 11 and conclude that
115+ *
116+ * All: abs bound < 2q
117+ *
118+ * In all following layers, we do the same relaxation without explicit mention.
119+ */
120+
101121/* level 1 */
102122vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168 -8*\off-8 )*4 (%rsi ),%ymm3
103123vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -113,6 +133,8 @@ vmovshdup %ymm15,%ymm2
113133butterfly 8 ,10 ,1 ,3 ,2 ,15
114134butterfly 9 ,11 ,1 ,3 ,2 ,15
115135
136+ /* All: abs bound < 4q */
137+
116138/* level 2 */
117139vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104 -8*\off-8 )*4 (%rsi ),%ymm3
118140vpermq $0x1B ,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104 -8*\off-8 )*4 (%rsi ),%ymm15
@@ -123,6 +145,8 @@ butterfly 5,9,1,3,2,15
123145butterfly 6 ,10 ,1 ,3 ,2 ,15
124146butterfly 7 ,11 ,1 ,3 ,2 ,15
125147
148+ /* All: abs bound < 8q */
149+
126150/* level 3 */
127151shuffle2 4 ,5 ,3 ,5
128152shuffle2 6 ,7 ,4 ,7
@@ -136,6 +160,8 @@ butterfly 4,7
136160butterfly 6 ,9
137161butterfly 8 ,11
138162
163+ /* All: abs bound < 16q */
164+
139165/* level 4 */
140166shuffle4 3 ,4 ,10 ,4
141167shuffle4 6 ,8 ,3 ,8
@@ -149,6 +175,8 @@ butterfly 3,8
149175butterfly 6 ,7
150176butterfly 5 ,11
151177
178+ /* All: abs bound < 32q */
179+
152180/* level 5 */
153181shuffle8 10 ,3 ,9 ,3
154182shuffle8 6 ,5 ,10 ,5
@@ -162,6 +190,8 @@ butterfly 10,5
162190butterfly 6 ,8
163191butterfly 4 ,11
164192
193+ /* All: abs bound < 64q */
194+
165195vmovdqa %ymm9 ,256*\off+ 0 (%rdi )
166196vmovdqa %ymm10 ,256*\off+ 32 (%rdi )
167197vmovdqa %ymm6 ,256*\off+ 64 (%rdi )
@@ -193,6 +223,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
193223butterfly 8 ,10
194224butterfly 9 ,11
195225
226+ /* All: abs bound < 128q */
227+
196228/* level 7 */
197229vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0 )*4 (%rsi ),%ymm1
198230vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0 )*4 (%rsi ),%ymm2
@@ -202,6 +234,8 @@ butterfly 5,9
202234butterfly 6 ,10
203235butterfly 7 ,11
204236
237+ /* All: abs bound < 256q */
238+
205239vmovdqa %ymm8 ,512 +32*\off(%rdi )
206240vmovdqa %ymm9 ,640 +32*\off(%rdi )
207241vmovdqa %ymm10 ,768 +32*\off(%rdi )
@@ -255,6 +289,13 @@ vmovshdup %ymm7,%ymm7
255289vpblendd $0xAA ,%ymm8 ,%ymm6 ,%ymm6
256290vpblendd $0xAA ,%ymm9 ,%ymm7 ,%ymm7
257291
292+ /*
293+ * All coefficients are Montgomery-multiplied with the same constant. This
294+ * reduces the magnitudes of all coefficients and results in the bound
295+ *
296+ * All: abs bound < 3q/4
297+ */
298+
258299vmovdqa %ymm4 , 0 +32*\off(%rdi )
259300vmovdqa %ymm5 ,128 +32*\off(%rdi )
260301vmovdqa %ymm6 ,256 +32*\off(%rdi )
0 commit comments