51 changes: 51 additions & 0 deletions dev/x86_64/src/intt.S
@@ -43,6 +43,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

/*
 * Compute l + h and montmul(h - l, zh), then store the results back to l and
 * h, respectively.
*
* The general abs bound of Montgomery multiplication is 3q/4.
Comment on lines +46 to +50
Contributor
As discussed offline: Please include reasoning for the 3q/4 bound --- you actually mean Montgomery multiplication by a constant, not general Montgomery multiplication.

*/
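As a hedged aside (not part of this PR): a scalar C model of this butterfly that also shows where a 3q/4 figure can come from. It assumes R = 2^32, q = 8380417, QINV = q^-1 mod 2^32 = 58728449 (as in the reference implementation), and that the zeta constant is stored centered, |z| <= q/2; the names montmul and invntt_butterfly are illustrative only. Under those assumptions, |montmul(a, z)| <= |a||z|/R + q/2 <= (R/2)(q/2)/R + q/2 = 3q/4.

```c
#include <stdint.h>

#define MLD_Q    8380417
#define MLD_QINV 58728449 /* q^-1 mod 2^32, as in the reference implementation */

/* Signed Montgomery multiplication: a*z*2^-32 mod q, |result| <= |a*z|/2^32 + q/2 */
static int32_t montmul(int32_t a, int32_t z)
{
    int64_t prod = (int64_t)a * z;
    int32_t m = (int32_t)((uint32_t)prod * (uint32_t)MLD_QINV); /* low 32 bits of prod*q^-1 */
    return (int32_t)((prod - (int64_t)m * MLD_Q) >> 32);
}

/* Gentleman-Sande butterfly as in the macro: l' = l + h, h' = montmul(h - l, z) */
static void invntt_butterfly(int32_t *l, int32_t *h, int32_t z)
{
    int32_t t = *h - *l;
    *l = *l + *h;
    *h = montmul(t, z);
}
```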
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
vpsubd %ymm\l,%ymm\h,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -74,6 +80,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
vmovdqa 256*\off+192(%rdi),%ymm10
vmovdqa 256*\off+224(%rdi),%ymm11

/* All: abs bound < q */

/* level 0 */
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
@@ -99,6 +107,19 @@ vmovshdup %ymm3,%ymm1
vmovshdup %ymm15,%ymm2
butterfly 10,11,1,3,2,15

/* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
/*
* Note that since 2^31 / q > 256, the sum of all 256 coefficients does not
* overflow. This allows us to greatly simplify the range analysis by relaxing
* and unifying the bounds of all coefficients on the same layer. As a concrete
* example, here we relax the bounds on 5, 7, 9, 11 and conclude that
*
* All: abs bound < 2q
*
 * In all but the last of the following layers, we do the same relaxation without
* explicit mention.
*/
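A quick worked check of the 2^31 / q > 256 claim, using q = 8380417 = 2^23 - 2^13 + 1:

```latex
\[
  256 \cdot q = 2^{8}\,(2^{23} - 2^{13} + 1) = 2^{31} - 2^{21} + 2^{8} < 2^{31},
  \qquad\text{hence}\qquad \frac{2^{31}}{q} > 256 .
\]
```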

/* level 1 */
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
@@ -114,6 +135,8 @@ vmovshdup %ymm15,%ymm2
butterfly 8,10,1,3,2,15
butterfly 9,11,1,3,2,15

/* All: abs bound < 4q */

/* level 2 */
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
@@ -124,6 +147,8 @@ butterfly 5,9,1,3,2,15
butterfly 6,10,1,3,2,15
butterfly 7,11,1,3,2,15

/* All: abs bound < 8q */

/* level 3 */
shuffle2 4,5,3,5
shuffle2 6,7,4,7
@@ -137,6 +162,8 @@ butterfly 4,7
butterfly 6,9
butterfly 8,11

/* All: abs bound < 16q */

/* level 4 */
shuffle4 3,4,10,4
shuffle4 6,8,3,8
@@ -150,6 +177,8 @@ butterfly 3,8
butterfly 6,7
butterfly 5,11

/* All: abs bound < 32q */

/* level 5 */
shuffle8 10,3,9,3
shuffle8 6,5,10,5
@@ -163,6 +192,8 @@ butterfly 10,5
butterfly 6,8
butterfly 4,11

/* All: abs bound < 64q */

vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm10,256*\off+ 32(%rdi)
vmovdqa %ymm6,256*\off+ 64(%rdi)
@@ -194,6 +225,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* All: abs bound < 128q */

/* level 7 */
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2
@@ -203,11 +236,27 @@ butterfly 5,9
butterfly 6,10
butterfly 7,11

/* 4, 5, 6, 7: abs bound < 256q; 8, 9, 10, 11: abs bound < 3q/4 */

vmovdqa %ymm8,512+32*\off(%rdi)
vmovdqa %ymm9,640+32*\off(%rdi)
vmovdqa %ymm10,768+32*\off(%rdi)
vmovdqa %ymm11,896+32*\off(%rdi)

/*
* In order to (a) remove the factor of 256 arising from the 256-point intt
* butterflies and (b) transform the output into Montgomery domain, we need to
* multiply all coefficients by 2^32/256.
*
* For ymm{8,9,10,11}, the scaling has been merged into the last butterfly, so
* only ymm{4,5,6,7} need to be scaled explicitly.
*
* The scaling is achieved by computing montmul(-, MLD_AVX2_DIV), so the output
* will have an abs bound of 3q/4.
*
* 4, 5, 6, 7: abs bound < 256q
*/
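A short sketch of why a single Montgomery multiplication performs this scaling, assuming (as in the reference implementation) that the 8XDIV constant satisfies MLD_AVX2_DIV ≡ R^2/256 (mod q) with R = 2^32:

```latex
\[
  \mathrm{montmul}(x, \mathrm{DIV}) \;\equiv\; x \cdot \frac{R^{2}}{256} \cdot R^{-1}
  \;\equiv\; x \cdot \frac{R}{256} \pmod{q},
\]
% i.e. the factor 1/256 from the 256-point inverse NTT is removed and the
% result is left in the Montgomery domain, with the usual montmul bound 3q/4.
```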

vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2
vpmuldq %ymm1,%ymm4,%ymm12
@@ -256,6 +305,8 @@ vmovshdup %ymm7,%ymm7
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
vpblendd $0xAA,%ymm9,%ymm7,%ymm7

/* 4, 5, 6, 7: abs bound < 3q/4 */

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
54 changes: 48 additions & 6 deletions dev/x86_64/src/ntt.S
@@ -44,6 +44,16 @@ vpsrlq $32,%ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

/*
 * Compute l + montmul(h, zh) and l - montmul(h, zh), then store the results
 * back to l and h, respectively.
*
* Although the general abs bound of Montgomery multiplication is 3q/4, we use
* the more convenient bound q here.
Comment on lines +51 to +52
Contributor
Rephrase and reference intt.S for the 3q/4.

*
 * Consequently, the magnitudes of all coefficients grow by at most q after
* each layer.
*/
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
vpmuldq %ymm\zl0,%ymm\h,%ymm13
vmovshdup %ymm\h,%ymm12
@@ -56,16 +66,30 @@ vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h * zh) */

vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l
/*
 * Ordinarily, mulhi(q * mullo(h * zl)) would now be subtracted from
 * mulhi(h * zh) in order to complete the computation of
*
* montmul(h, zh) = mulhi(h * zh) - mulhi(q * mullo(h * zl)).
*
* Here, since mulhi(q * mullo(h * zl)) has not been computed yet, this task is
* delayed until after add/sub.
*/
vpsubd %ymm\h,%ymm\l,%ymm12 /* l - mulhi(h * zh)
* = l - montmul(h, zh)
* - mulhi(q * mullo(h * zl)) */
vpaddd %ymm\h,%ymm\l,%ymm\l /* l + mulhi(h * zh)
* = l + montmul(h, zh)
* + mulhi(q * mullo(h * zl)) */

vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q * mullo(h * zl)) */

vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
/* Finish the delayed task mentioned above */
vpaddd %ymm13,%ymm12,%ymm\h /* l - montmul(h, zh) */
vpsubd %ymm13,%ymm\l,%ymm\l /* l + montmul(h, zh) */
.endm
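A hedged scalar model (plain C, not from this PR) of the deferred-correction schedule in the macro above. It assumes R = 2^32, q = 8380417, and that the tables supply both zh = z and zl = z*q^-1 mod 2^32, which is what the ZETAS / ZETAS_QINV offsets suggest; mulhi, mullo and ntt_butterfly are illustrative names.

```c
#include <stdint.h>

#define MLD_Q 8380417

static int32_t mulhi(int64_t prod) { return (int32_t)(prod >> 32); }
static int32_t mullo(int32_t a, int32_t b)
{
    return (int32_t)((uint32_t)a * (uint32_t)b);
}

/*
 * Cooley-Tukey butterfly: l' = l + montmul(h, zh), h' = l - montmul(h, zh),
 * where montmul(h, zh) = mulhi(h * zh) - mulhi(q * mullo(h * zl)).
 * The correction term `cor` is applied only after the add/sub, mirroring the
 * instruction schedule of the macro.
 */
static void ntt_butterfly(int32_t *l, int32_t *h, int32_t zh, int32_t zl)
{
    int32_t hi  = mulhi((int64_t)*h * zh);               /* mulhi(h * zh)            */
    int32_t cor = mulhi((int64_t)MLD_Q * mullo(*h, zl)); /* mulhi(q * mullo(h * zl)) */

    int32_t sub = *l - hi;  /* = l - montmul(h, zh) - cor */
    int32_t add = *l + hi;  /* = l + montmul(h, zh) + cor */

    *h = sub + cor;         /* l - montmul(h, zh) */
    *l = add - cor;         /* l + montmul(h, zh) */
}
```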

.macro levels0t1 off
@@ -82,11 +106,15 @@ vmovdqa 640+32*\off(%rdi),%ymm9
vmovdqa 768+32*\off(%rdi),%ymm10
vmovdqa 896+32*\off(%rdi),%ymm11

/* All: abs bound < q */

butterfly 4,8
butterfly 5,9
butterfly 6,10
butterfly 7,11

/* All: abs bound < 2q */

/* level 1 */
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
@@ -98,6 +126,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
butterfly 8,10
butterfly 9,11

/* All: abs bound < 3q */

vmovdqa %ymm4, 0+32*\off(%rdi)
vmovdqa %ymm5,128+32*\off(%rdi)
vmovdqa %ymm6,256+32*\off(%rdi)
@@ -132,6 +162,8 @@ shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

/* All: abs bound < 4q */

/* level 3 */
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2
@@ -146,6 +178,8 @@ shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

/* All: abs bound < 5q */

/* level 4 */
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2
@@ -160,6 +194,8 @@ shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

/* All: abs bound < 6q */

/* level 5 */
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2
@@ -171,6 +207,8 @@ butterfly 8,4,1,10,2,15
butterfly 7,3,1,10,2,15
butterfly 6,11,1,10,2,15

/* All: abs bound < 7q */

/* level 6 */
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2
@@ -186,6 +224,8 @@ vmovshdup %ymm2,%ymm15
butterfly 5,3,1,10,2,15
butterfly 4,11,1,10,2,15

/* All: abs bound < 8q */

/* level 7 */
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2
@@ -211,6 +251,8 @@ vpsrlq $32,%ymm1,%ymm10
vmovshdup %ymm2,%ymm15
butterfly 3,11,1,10,2,15

/* All: abs bound < 9q */

vmovdqa %ymm9,256*\off+ 0(%rdi)
vmovdqa %ymm8,256*\off+ 32(%rdi)
vmovdqa %ymm7,256*\off+ 64(%rdi)
7 changes: 7 additions & 0 deletions dev/x86_64/src/pointwise.S
@@ -61,6 +61,7 @@ _looptop1:
vpsrlq ymm11, ymm10, 32
vpsrlq ymm13, ymm12, 32
vmovshdup ymm15, ymm14
/* All: abs bound < 9q */

// Multiply
vpmuldq ymm2, ymm2, ymm10
@@ -69,6 +70,7 @@ _looptop1:
vpmuldq ymm5, ymm5, ymm13
vpmuldq ymm6, ymm6, ymm14
vpmuldq ymm7, ymm7, ymm15
/* All: abs bound < 81q^2 < 81*2^46 < 2^53 = 2^21R < qR/2 */

// Reduce
vpmuldq ymm10, ymm0, ymm2
@@ -92,6 +94,11 @@ _looptop1:
vpsrlq ymm2, ymm2, 32
vpsrlq ymm4, ymm4, 32
vmovshdup ymm6, ymm6
/*
* All coefficients are Montgomery-reduced. This results in the bound
*
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
*/
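The bound used here (and again in pointwise_acc_l4.S / pointwise_acc_l5.S) is the standard estimate for signed Montgomery reduction; a worked version with R = 2^32:

```latex
\[
  \mathrm{montred}(x) = \frac{x - m q}{R}, \qquad
  m \equiv x q^{-1} \pmod{R}, \quad |m| \le \tfrac{R}{2},
\]
\[
  |\mathrm{montred}(x)| \le \frac{|x|}{R} + \frac{|m|\,q}{R}
  \le \frac{|x|}{R} + \frac{q}{2}
  < \frac{qR/2}{R} + \frac{q}{2} = q .
\]
```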

// Store
vpblendd ymm2, ymm2, ymm3, 0xAA
14 changes: 14 additions & 0 deletions dev/x86_64/src/pointwise_acc_l4.S
@@ -37,12 +37,17 @@
vpsrlq ymm9, ymm8, 32
vmovshdup ymm11, ymm10
vmovshdup ymm13, ymm12
/*
* 6, 7, 8, 9: from the first input polynomial, abs bound < q
* 10, 11, 12, 13: from the second input polynomial, abs bound < 9q
*/

// Multiply
vpmuldq ymm6, ymm6, ymm10
vpmuldq ymm7, ymm7, ymm11
vpmuldq ymm8, ymm8, ymm12
vpmuldq ymm9, ymm9, ymm13
/* All: abs bound < 9q^2 */
.endm

.macro acc
@@ -80,15 +85,19 @@ _looptop2:
vmovdqa ymm3, ymm7
vmovdqa ymm4, ymm8
vmovdqa ymm5, ymm9
/* All: abs bound < 9q^2 */

pointwise 1024
acc
/* All: abs bound < 18q^2 */

pointwise 2048
acc
/* All: abs bound < 27q^2 */

pointwise 3072
acc
/* All: abs bound < 36q^2 < 36*2^46 < 2^52 = 2^20R < qR/2 */
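A hedged scalar model of this multiply-accumulate-then-reduce pattern (plain C, not from this PR), assuming q = 8380417, QINV = 58728449, one operand bounded by q and the other by 9q, so that the 64-bit accumulator stays below qR/2 and a single Montgomery reduction at the end yields a result of magnitude below q; the function names are illustrative.

```c
#include <stdint.h>

#define MLD_Q    8380417
#define MLD_QINV 58728449 /* q^-1 mod 2^32 */

/* Signed Montgomery reduction: x * 2^-32 mod q, |result| <= |x|/2^32 + q/2 */
static int32_t montgomery_reduce(int64_t x)
{
    int32_t m = (int32_t)((uint32_t)x * (uint32_t)MLD_QINV);
    return (int32_t)((x - (int64_t)m * MLD_Q) >> 32);
}

/* One coefficient of the length-4 pointwise inner product, reduced once at the end */
static int32_t pointwise_acc_l4(const int32_t a[4], const int32_t b[4])
{
    int64_t acc = 0;
    for (int i = 0; i < 4; i++)
    {
        acc += (int64_t)a[i] * b[i]; /* each term < 9q^2, so acc < 36q^2 < qR/2 */
    }
    return montgomery_reduce(acc);   /* abs bound < q */
}
```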

// Reduce
vpmuldq ymm6, ymm0, ymm2
@@ -105,6 +114,11 @@ _looptop2:
vpsubq ymm5, ymm5, ymm9
vpsrlq ymm2, ymm2, 32
vmovshdup ymm4, ymm4
/*
* All coefficients are Montgomery-reduced. This results in the bound
*
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
*/

// Store
vpblendd ymm2, ymm2, ymm3, 0xAA
15 changes: 15 additions & 0 deletions dev/x86_64/src/pointwise_acc_l5.S
@@ -37,12 +37,17 @@
vpsrlq ymm9, ymm8, 32
vmovshdup ymm11, ymm10
vmovshdup ymm13, ymm12
/*
* 6, 7, 8, 9: from the first input polynomial, abs bound < q
* 10, 11, 12, 13: from the second input polynomial, abs bound < 9q
*/

// Multiply
vpmuldq ymm6, ymm6, ymm10
vpmuldq ymm7, ymm7, ymm11
vpmuldq ymm8, ymm8, ymm12
vpmuldq ymm9, ymm9, ymm13
/* All: abs bound < 9q^2 */
.endm

.macro acc
@@ -80,18 +85,23 @@ _looptop2:
vmovdqa ymm3, ymm7
vmovdqa ymm4, ymm8
vmovdqa ymm5, ymm9
/* All: abs bound < 9q^2 */

pointwise 1024
acc
/* All: abs bound < 18q^2 */

pointwise 2048
acc
/* All: abs bound < 27q^2 */

pointwise 3072
acc
/* All: abs bound < 36q^2 */

pointwise 4096
acc
/* All: abs bound < 45q^2 < 45*2^46 < 2^52 = 2^20R < qR/2 */

// Reduce
vpmuldq ymm6, ymm0, ymm2
@@ -108,6 +118,11 @@ _looptop2:
vpsubq ymm5, ymm5, ymm9
vpsrlq ymm2, ymm2, 32
vmovshdup ymm4, ymm4
/*
* All coefficients are Montgomery-reduced. This results in the bound
*
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
*/

// Store
vpblendd ymm2, ymm2, ymm3, 0xAA