
Commit 5f86090

Add bounds reasoning comments to AVX2 ntt/intt

Signed-off-by: jammychiou1 <[email protected]>
1 parent: ee7e1bf

2 files changed: 99 additions & 6 deletions

dev/x86_64/src/intt.S

Lines changed: 51 additions & 0 deletions
@@ -43,6 +43,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
 .endm
 
+/*
+ * Compute l + h and montmul(h - l, zh), then store the results back to l and
+ * h, respectively.
+ *
+ * The general abs bound of Montgomery multiplication is 3q/4.
+ */
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 vpsubd %ymm\l,%ymm\h,%ymm12
 vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -74,6 +80,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
 vmovdqa 256*\off+192(%rdi),%ymm10
 vmovdqa 256*\off+224(%rdi),%ymm11
 
+/* All: abs bound < q */
+
 /* level 0 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
@@ -99,6 +107,19 @@ vmovshdup %ymm3,%ymm1
 vmovshdup %ymm15,%ymm2
 butterfly 10,11,1,3,2,15
 
+/* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
+/*
+ * Note that since 2^31 / q > 256, the sum of all 256 coefficients does not
+ * overflow. This allows us to greatly simplify the range analysis by relaxing
+ * and unifying the bounds of all coefficients on the same layer. As a concrete
+ * example, here we relax the bounds on 5, 7, 9, 11 and conclude that
+ *
+ * All: abs bound < 2q
+ *
+ * In all but the last of the following layers, we do the same relaxation
+ * without explicit mention.
+ */
+
 /* level 1 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
@@ -114,6 +135,8 @@ vmovshdup %ymm15,%ymm2
 butterfly 8,10,1,3,2,15
 butterfly 9,11,1,3,2,15
 
+/* All: abs bound < 4q */
+
 /* level 2 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
@@ -124,6 +147,8 @@ butterfly 5,9,1,3,2,15
 butterfly 6,10,1,3,2,15
 butterfly 7,11,1,3,2,15
 
+/* All: abs bound < 8q */
+
 /* level 3 */
 shuffle2 4,5,3,5
 shuffle2 6,7,4,7
@@ -137,6 +162,8 @@ butterfly 4,7
 butterfly 6,9
 butterfly 8,11
 
+/* All: abs bound < 16q */
+
 /* level 4 */
 shuffle4 3,4,10,4
 shuffle4 6,8,3,8
@@ -150,6 +177,8 @@ butterfly 3,8
 butterfly 6,7
 butterfly 5,11
 
+/* All: abs bound < 32q */
+
 /* level 5 */
 shuffle8 10,3,9,3
 shuffle8 6,5,10,5
@@ -163,6 +192,8 @@ butterfly 10,5
 butterfly 6,8
 butterfly 4,11
 
+/* All: abs bound < 64q */
+
 vmovdqa %ymm9,256*\off+ 0(%rdi)
 vmovdqa %ymm10,256*\off+ 32(%rdi)
 vmovdqa %ymm6,256*\off+ 64(%rdi)
@@ -194,6 +225,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
 butterfly 8,10
 butterfly 9,11
 
+/* All: abs bound < 128q */
+
 /* level 7 */
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2
@@ -203,11 +236,27 @@ butterfly 5,9
 butterfly 6,10
 butterfly 7,11
 
+/* 4, 5, 6, 7: abs bound < 256q; 8, 9, 10, 11: abs bound < 3q/4 */
+
 vmovdqa %ymm8,512+32*\off(%rdi)
 vmovdqa %ymm9,640+32*\off(%rdi)
 vmovdqa %ymm10,768+32*\off(%rdi)
 vmovdqa %ymm11,896+32*\off(%rdi)
 
+/*
+ * In order to (a) remove the factor of 256 arising from the 256-point intt
+ * butterfly and (b) transform the output into the Montgomery domain, we need
+ * to multiply all coefficients by 2^32/256.
+ *
+ * For ymm{8,9,10,11}, the scaling has been merged into the last butterfly, so
+ * only ymm{4,5,6,7} need to be scaled explicitly.
+ *
+ * The scaling is achieved by computing montmul(-, MLD_AVX2_DIV), so the output
+ * will have an abs bound of 3q/4.
+ *
+ * 4, 5, 6, 7: abs bound < 256q
+ */
+
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2
 vpmuldq %ymm1,%ymm4,%ymm12
@@ -256,6 +305,8 @@ vmovshdup %ymm7,%ymm7
 vpblendd $0xAA,%ymm8,%ymm6,%ymm6
 vpblendd $0xAA,%ymm9,%ymm7,%ymm7
 
+/* 4, 5, 6, 7: abs bound < 3q/4 */
+
 vmovdqa %ymm4, 0+32*\off(%rdi)
 vmovdqa %ymm5,128+32*\off(%rdi)
 vmovdqa %ymm6,256+32*\off(%rdi)
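Note: the "general abs bound of 3q/4" for Montgomery multiplication used in the comments above can be made concrete with a scalar C model. This is a minimal sketch, not code from this repository: the constant names Q and QINV, the helpers montgomery_reduce/montmul/butterfly_inv, and the assumption that the zeta table holds signed representatives with |zeta| <= q/2 are all illustrative.

#include <stdint.h>

#define Q 8380417     /* ML-DSA modulus q */
#define QINV 58728449 /* q^(-1) mod 2^32 */

/*
 * Signed Montgomery reduction sketch: returns r == a * 2^(-32) (mod Q).
 * With r = (a - t*Q) / 2^32 and |t| <= 2^31, the t*Q term contributes
 * at most 2^31 * Q / 2^32 = Q/2, so |r| <= |a|/2^32 + Q/2.
 */
static int32_t montgomery_reduce(int64_t a)
{
  int32_t t = (int32_t)((int64_t)(int32_t)a * QINV); /* a * q^(-1) mod 2^32 */
  return (int32_t)((a - (int64_t)t * Q) >> 32);
}

/*
 * If |zeta| <= Q/2 (assumed above), then |a * zeta| / 2^32 <= Q/4 for any
 * int32_t a, so |montmul(a, zeta)| <= Q/4 + Q/2 = 3Q/4: the general bound
 * quoted in the butterfly comment.
 */
static int32_t montmul(int32_t a, int32_t zeta)
{
  return montgomery_reduce((int64_t)a * zeta);
}

/*
 * Scalar model of the inverse-NTT (GS) butterfly above: the sum's bound
 * grows additively from layer to layer, while the montmul output is
 * freshly bounded by 3Q/4 no matter how large its input has grown.
 */
static void butterfly_inv(int32_t *l, int32_t *h, int32_t zeta)
{
  int32_t t = *h - *l;   /* abs bound: |l| + |h| */
  *l = *l + *h;          /* abs bound: |l| + |h| */
  *h = montmul(t, zeta); /* abs bound: <= 3Q/4   */
}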

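A quick arithmetic check of the relaxation argument in the level-0 comment: with q = 8380417, 2^31 / q ≈ 256.25 > 256, so even the fully relaxed bound sequence q, 2q, 4q, ..., 256q stays below 2^31 and no int32_t coefficient can overflow. As a tiny standalone check (plain arithmetic, no project code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  int64_t q = 8380417;
  /* Loosest relaxed bound after the last intt layer: 256q < 2^31 */
  assert(256 * q < ((int64_t)1 << 31)); /* 2145386752 < 2147483648 */
  return 0;
}

On the final scaling: since montmul(a, c) == a * c * 2^(-32) (mod q), multiplying by 2^32/256 amounts to choosing c == 2^64/256 (mod q); presumably that is what the MLD_AVX2_DIV constant encodes, though its value is not shown in this commit.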
dev/x86_64/src/ntt.S

Lines changed: 48 additions & 6 deletions
@@ -44,6 +44,16 @@ vpsrlq $32,%ymm\r0,%ymm\r0
 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
 .endm
 
+/*
+ * Compute l + montmul(h, zh) and l - montmul(h, zh), then store the results
+ * back to l and h, respectively.
+ *
+ * Although the general abs bound of Montgomery multiplication is 3q/4, we use
+ * the more convenient bound q here.
+ *
+ * Consequently, the magnitudes of all coefficients grow by at most q after
+ * each layer.
+ */
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 vpmuldq %ymm\zl0,%ymm\h,%ymm13
 vmovshdup %ymm\h,%ymm12
@@ -56,16 +66,30 @@ vpmuldq %ymm0,%ymm13,%ymm13
 vpmuldq %ymm0,%ymm14,%ymm14
 
 vmovshdup %ymm\h,%ymm\h
-vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h
+vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h * zh) */
 
-vpsubd %ymm\h,%ymm\l,%ymm12
-vpaddd %ymm\h,%ymm\l,%ymm\l
+/*
+ * To complete the computation of
+ *
+ * montmul(h, zh) = mulhi(h * zh) - mulhi(q * mullo(h * zl)),
+ *
+ * mulhi(q * mullo(h * zl)) still has to be subtracted from mulhi(h * zh).
+ * Since it has not been computed yet, this subtraction is delayed until
+ * after the add/sub below.
+ */
+vpsubd %ymm\h,%ymm\l,%ymm12 /* l - mulhi(h * zh)
+                             * = l - montmul(h, zh)
+                             * - mulhi(q * mullo(h * zl)) */
+vpaddd %ymm\h,%ymm\l,%ymm\l /* l + mulhi(h * zh)
+                             * = l + montmul(h, zh)
+                             * + mulhi(q * mullo(h * zl)) */
 
 vmovshdup %ymm13,%ymm13
-vpblendd $0xAA,%ymm14,%ymm13,%ymm13
+vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q * mullo(h * zl)) */
 
-vpaddd %ymm13,%ymm12,%ymm\h
-vpsubd %ymm13,%ymm\l,%ymm\l
+/* Finish the delayed subtraction mentioned above */
+vpaddd %ymm13,%ymm12,%ymm\h /* l - montmul(h, zh) */
+vpsubd %ymm13,%ymm\l,%ymm\l /* l + montmul(h, zh) */
 .endm
 
 .macro levels0t1 off
@@ -82,11 +106,15 @@ vmovdqa 640+32*\off(%rdi),%ymm9
 vmovdqa 768+32*\off(%rdi),%ymm10
 vmovdqa 896+32*\off(%rdi),%ymm11
 
+/* All: abs bound < q */
+
 butterfly 4,8
 butterfly 5,9
 butterfly 6,10
 butterfly 7,11
 
+/* All: abs bound < 2q */
+
 /* level 1 */
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
@@ -98,6 +126,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
 butterfly 8,10
 butterfly 9,11
 
+/* All: abs bound < 3q */
+
 vmovdqa %ymm4, 0+32*\off(%rdi)
 vmovdqa %ymm5,128+32*\off(%rdi)
 vmovdqa %ymm6,256+32*\off(%rdi)
@@ -132,6 +162,8 @@ shuffle8 5,9,4,9
 shuffle8 6,10,5,10
 shuffle8 7,11,6,11
 
+/* All: abs bound < 4q */
+
 /* level 3 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2
@@ -146,6 +178,8 @@ shuffle4 8,10,3,10
 shuffle4 4,6,8,6
 shuffle4 9,11,4,11
 
+/* All: abs bound < 5q */
+
 /* level 4 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2
@@ -160,6 +194,8 @@ shuffle2 5,6,7,6
 shuffle2 3,4,5,4
 shuffle2 10,11,3,11
 
+/* All: abs bound < 6q */
+
 /* level 5 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2
@@ -171,6 +207,8 @@ butterfly 8,4,1,10,2,15
 butterfly 7,3,1,10,2,15
 butterfly 6,11,1,10,2,15
 
+/* All: abs bound < 7q */
+
 /* level 6 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2
@@ -186,6 +224,8 @@ vmovshdup %ymm2,%ymm15
 butterfly 5,3,1,10,2,15
 butterfly 4,11,1,10,2,15
 
+/* All: abs bound < 8q */
+
 /* level 7 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2
@@ -211,6 +251,8 @@ vpsrlq $32,%ymm1,%ymm10
 vmovshdup %ymm2,%ymm15
 butterfly 3,11,1,10,2,15
 
+/* All: abs bound < 9q */
+
 vmovdqa %ymm9,256*\off+ 0(%rdi)
 vmovdqa %ymm8,256*\off+ 32(%rdi)
 vmovdqa %ymm7,256*\off+ 64(%rdi)
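Note: the per-layer growth by q recorded in the bound comments can be modeled with the montmul sketch from the intt.S note above; the AVX2 macro interleaves lanes and delays part of the reduction, but produces the same values. The helper name butterfly_fwd is illustrative, not from this repository.

/*
 * Scalar model of the forward (CT) butterfly above. Relaxing the montmul
 * bound from 3Q/4 to Q, each output's magnitude exceeds its input's by
 * less than Q, so starting from |coeff| < q the eight layers give bounds
 * 2q, 3q, ..., 9q, and 9q = 75423753 is far below 2^31.
 */
static void butterfly_fwd(int32_t *l, int32_t *h, int32_t zeta)
{
  int32_t t = montmul(*h, zeta); /* abs bound: <= 3Q/4, relaxed to Q */
  *h = *l - t;                   /* abs bound: < |l| + Q             */
  *l = *l + t;                   /* abs bound: < |l| + Q             */
}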

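The mulhi/mullo bookkeeping in the rewritten butterfly comments can likewise be checked in scalar form. A sketch under the assumption (not shown in this diff) that the ZETAS_QINV table stores zl = zeta * q^(-1) mod 2^32 alongside zh = zeta, reusing Q from the earlier sketch:

/*
 * montmul(h, zh) = mulhi(h * zh) - mulhi(Q * mullo(h * zl)), as stated in
 * the comment: because mullo(h * zl) * Q and h * zh agree in their low 32
 * bits, subtracting the two high halves yields exactly
 * (h * zh - mullo(h * zl) * Q) / 2^32, i.e. the Montgomery product.
 */
static int32_t montmul_split(int32_t h, int32_t zh, int32_t zl)
{
  int32_t lo   = (int32_t)((int64_t)h * zl);         /* mullo(h * zl)    */
  int32_t hi   = (int32_t)(((int64_t)h * zh) >> 32); /* mulhi(h * zh)    */
  int32_t corr = (int32_t)(((int64_t)Q * lo) >> 32); /* mulhi(Q * mullo) */
  return hi - corr;
}

The AVX2 code reorders this: the add/sub with l happens between computing mulhi(h * zh) and subtracting the correction term, which is why the in-flight values carry the extra mulhi(q * mullo(h * zl)) term noted in the comments.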