1515 * CRYSTALS-Dilithium optimized AVX2 implementation
1616 * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
1717 * https://github.com/pq-crystals/dilithium/tree/master/avx2
18+ *
19+ * - [Survey_Hwang23]
20+ * A Survey of Polynomial Multiplications for Lattice-Based Cryptosystems
21+ * Vincent Hwang
22+ * https://eprint.iacr.org/2023/1962
1823 */
1924
2025#include "../../../common.h"
@@ -47,7 +52,8 @@ vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
4752 * Compute l + h, montmul(h - l, zh) then store the results back to l, h
4853 * respectively.
4954 *
50- * The general abs bound of Montgomery multiplication is 3q/4.
55+ * The abs bound of "Montgomery multiplication with signed canonical constant"
56+ * is ceil(3q/4) (see the end of this file).
5157 */
5258.macro butterfly l,h,zl0 =1 ,zl1 =1 ,zh0 =2 ,zh1 =2
5359vpsubd %ymm\l,%ymm\h,%ymm12
@@ -107,7 +113,7 @@ vmovshdup %ymm3,%ymm1
107113vmovshdup %ymm15 ,%ymm2
108114butterfly 10 ,11 ,1 ,3 ,2 ,15
109115
110- /* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
116+ /* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < ceil(3q/4) */
111117/*
112118 * Note that since 2^31 / q > 256, the sum of all 256 coefficients does not
113119 * overflow. This allows us to greatly simplify the range analysis by relaxing
@@ -236,7 +242,7 @@ butterfly 5,9
236242butterfly 6 ,10
237243butterfly 7 ,11
238244
239- /* 4, 5, 6, 7: abs bound < 256q; 8, 9, 10, 11: abs bound < 3q/4 */
245+ /* 4, 5, 6, 7: abs bound < 256q; 8, 9, 10, 11: abs bound < ceil(3q/4) */
240246
241247vmovdqa %ymm8 ,512 +32*\off(%rdi )
242248vmovdqa %ymm9 ,640 +32*\off(%rdi )
@@ -251,8 +257,7 @@ vmovdqa %ymm11,896+32*\off(%rdi)
251257 * For ymm{8,9,10,11}, the scaling has been merged into the last butterfly, so
252258 * only ymm{4,5,6,7} need to be scaled explicitly.
253259 *
254- * The scaling is achieved by computing montmul(-, MLD_AVX2_DIV), so the output
255- * will have an abs bound of 3q/4.
260+ * The scaling is achieved by computing montmul(-, MLD_AVX2_DIV).
256261 *
257262 * 4, 5, 6, 7: abs bound < 256q
258263 */
@@ -305,7 +310,22 @@ vmovshdup %ymm7,%ymm7
305310vpblendd $0xAA ,%ymm8 ,%ymm6 ,%ymm6
306311vpblendd $0xAA ,%ymm9 ,%ymm7 ,%ymm7
307312
308- /* 4, 5, 6, 7: abs bound < 3q/4 */
313+ /*
314+ * The bound ceil(3q/4) for this scaling, as well as any other "Montgomery
315+ * multiplication with signed canonical constant", is justified as follows.
316+ *
317+ * @[Survey_Hwang23, Section 2.2] shows a bound that holds for any variable
318+ * input a, as long as the constant b is signed canonical (|b| <= q/2):
319+ *
320+ * |montmul(a, b)| <= (|a| (q/2) + (R/2) q) / R = (q/2) (1 + |a|/R).
321+ *
322+ * Therefore, even if we know nothing about a except that it fits inside
323+ * int32_t (thus |a| <= R/2), we still have |montmul(a, b)| <= 3q/4. This can be
324+ * strengthened to |montmul(a, b)| <= floor(3q/4) < ceil(3q/4), since the LHS
325+ * is an integer and 3q/4 is not.
326+ */
327+
328+ /* 4, 5, 6, 7: abs bound < ceil(3q/4) */
309329
310330vmovdqa %ymm4 , 0 +32*\off(%rdi )
311331vmovdqa %ymm5 ,128 +32*\off(%rdi )
0 commit comments