Skip to content

Commit 191f6a7

Browse files
committed
Add bounds reasoning comments to AVX2 ntt/intt
Signed-off-by: jammychiou1 <[email protected]>
1 parent ee7e1bf commit 191f6a7

File tree

2 files changed

+69
-0
lines changed

2 files changed

+69
-0
lines changed

dev/x86_64/src/intt.S

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
4343
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
4444
.endm
4545

46+
/*
47+
* Compute l + h and montmul(h - l, zh), then store the results back to l and h,
48+
* respectively.
49+
*
50+
* In general, the result of a Montgomery multiplication has abs bound < 3q/4.
51+
*/
4652
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
4753
vpsubd %ymm\l,%ymm\h,%ymm12
4854
vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -74,6 +80,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
7480
vmovdqa 256*\off+192(%rdi),%ymm10
7581
vmovdqa 256*\off+224(%rdi),%ymm11
7682

83+
/* All: abs bound < q */
84+
7785
/* level 0 */
7886
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
7987
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
@@ -99,6 +107,18 @@ vmovshdup %ymm3,%ymm1
99107
vmovshdup %ymm15,%ymm2
100108
butterfly 10,11,1,3,2,15
101109

110+
/* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
111+
/*
112+
* Note that since 2^31 / q > 256 and every input coefficient has abs bound < q, the sum of all 256 coefficients does not
113+
* overflow. This allows us to greatly simplify the range analysis by relaxing
114+
* and unifying the bounds of all coefficients on the same layer. As a concrete
115+
* example, here we relax the bounds on 5, 7, 9, 11 and conclude that
116+
*
117+
* All: abs bound < 2q
118+
*
119+
* In all following layers, we do the same relaxation without explicit mention.
120+
*/
121+
102122
/* level 1 */
103123
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
104124
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
@@ -114,6 +134,8 @@ vmovshdup %ymm15,%ymm2
114134
butterfly 8,10,1,3,2,15
115135
butterfly 9,11,1,3,2,15
116136

137+
/* All: abs bound < 4q */
138+
117139
/* level 2 */
118140
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
119141
vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15
@@ -124,6 +146,8 @@ butterfly 5,9,1,3,2,15
124146
butterfly 6,10,1,3,2,15
125147
butterfly 7,11,1,3,2,15
126148

149+
/* All: abs bound < 8q */
150+
127151
/* level 3 */
128152
shuffle2 4,5,3,5
129153
shuffle2 6,7,4,7
@@ -137,6 +161,8 @@ butterfly 4,7
137161
butterfly 6,9
138162
butterfly 8,11
139163

164+
/* All: abs bound < 16q */
165+
140166
/* level 4 */
141167
shuffle4 3,4,10,4
142168
shuffle4 6,8,3,8
@@ -150,6 +176,8 @@ butterfly 3,8
150176
butterfly 6,7
151177
butterfly 5,11
152178

179+
/* All: abs bound < 32q */
180+
153181
/* level 5 */
154182
shuffle8 10,3,9,3
155183
shuffle8 6,5,10,5
@@ -163,6 +191,8 @@ butterfly 10,5
163191
butterfly 6,8
164192
butterfly 4,11
165193

194+
/* All: abs bound < 64q */
195+
166196
vmovdqa %ymm9,256*\off+ 0(%rdi)
167197
vmovdqa %ymm10,256*\off+ 32(%rdi)
168198
vmovdqa %ymm6,256*\off+ 64(%rdi)
@@ -194,6 +224,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
194224
butterfly 8,10
195225
butterfly 9,11
196226

227+
/* All: abs bound < 128q */
228+
197229
/* level 7 */
198230
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1
199231
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2
@@ -203,6 +235,8 @@ butterfly 5,9
203235
butterfly 6,10
204236
butterfly 7,11
205237

238+
/* All: abs bound < 256q */
239+
206240
vmovdqa %ymm8,512+32*\off(%rdi)
207241
vmovdqa %ymm9,640+32*\off(%rdi)
208242
vmovdqa %ymm10,768+32*\off(%rdi)
@@ -256,6 +290,13 @@ vmovshdup %ymm7,%ymm7
256290
vpblendd $0xAA,%ymm8,%ymm6,%ymm6
257291
vpblendd $0xAA,%ymm9,%ymm7,%ymm7
258292

293+
/*
294+
* All coefficients are Montgomery-multiplied by the same constant. This
295+
* reduces the magnitudes of all coefficients and results in the bound
296+
*
297+
* All: abs bound < 3q/4
298+
*/
299+
259300
vmovdqa %ymm4, 0+32*\off(%rdi)
260301
vmovdqa %ymm5,128+32*\off(%rdi)
261302
vmovdqa %ymm6,256+32*\off(%rdi)

dev/x86_64/src/ntt.S

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,16 @@ vpsrlq $32,%ymm\r0,%ymm\r0
4444
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
4545
.endm
4646

47+
/*
48+
* Compute l + montmul(h, zh) and l - montmul(h, zh), then store the results back to
49+
* l, h respectively.
50+
*
51+
* Although the general abs bound of Montgomery multiplication is 3q/4, we use
52+
* the more convenient bound q here.
53+
*
54+
* In conclusion, the magnitudes of all coefficients grow by at most q after
55+
* each layer.
56+
*/
4757
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
4858
vpmuldq %ymm\zl0,%ymm\h,%ymm13
4959
vmovshdup %ymm\h,%ymm12
@@ -82,11 +92,15 @@ vmovdqa 640+32*\off(%rdi),%ymm9
8292
vmovdqa 768+32*\off(%rdi),%ymm10
8393
vmovdqa 896+32*\off(%rdi),%ymm11
8494

95+
/* All: abs bound < q */
96+
8597
butterfly 4,8
8698
butterfly 5,9
8799
butterfly 6,10
88100
butterfly 7,11
89101

102+
/* All: abs bound < 2q */
103+
90104
/* level 1 */
91105
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
92106
vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
@@ -98,6 +112,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
98112
butterfly 8,10
99113
butterfly 9,11
100114

115+
/* All: abs bound < 3q */
116+
101117
vmovdqa %ymm4, 0+32*\off(%rdi)
102118
vmovdqa %ymm5,128+32*\off(%rdi)
103119
vmovdqa %ymm6,256+32*\off(%rdi)
@@ -132,6 +148,8 @@ shuffle8 5,9,4,9
132148
shuffle8 6,10,5,10
133149
shuffle8 7,11,6,11
134150

151+
/* All: abs bound < 4q */
152+
135153
/* level 3 */
136154
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
137155
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2
@@ -146,6 +164,8 @@ shuffle4 8,10,3,10
146164
shuffle4 4,6,8,6
147165
shuffle4 9,11,4,11
148166

167+
/* All: abs bound < 5q */
168+
149169
/* level 4 */
150170
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
151171
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2
@@ -160,6 +180,8 @@ shuffle2 5,6,7,6
160180
shuffle2 3,4,5,4
161181
shuffle2 10,11,3,11
162182

183+
/* All: abs bound < 6q */
184+
163185
/* level 5 */
164186
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
165187
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2
@@ -171,6 +193,8 @@ butterfly 8,4,1,10,2,15
171193
butterfly 7,3,1,10,2,15
172194
butterfly 6,11,1,10,2,15
173195

196+
/* All: abs bound < 7q */
197+
174198
/* level 6 */
175199
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
176200
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2
@@ -186,6 +210,8 @@ vmovshdup %ymm2,%ymm15
186210
butterfly 5,3,1,10,2,15
187211
butterfly 4,11,1,10,2,15
188212

213+
/* All: abs bound < 8q */
214+
189215
/* level 7 */
190216
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
191217
vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2
@@ -211,6 +237,8 @@ vpsrlq $32,%ymm1,%ymm10
211237
vmovshdup %ymm2,%ymm15
212238
butterfly 3,11,1,10,2,15
213239

240+
/* All: abs bound < 9q */
241+
214242
vmovdqa %ymm9,256*\off+ 0(%rdi)
215243
vmovdqa %ymm8,256*\off+ 32(%rdi)
216244
vmovdqa %ymm7,256*\off+ 64(%rdi)

0 commit comments

Comments
 (0)