Commit 1b76629

Add bounds reasoning comments to AVX2 ntt/intt

Signed-off-by: jammychiou1 <[email protected]>

1 parent 94311a7

2 files changed: +69 -0

mldsa/native/x86_64/src/intt.S

Lines changed: 41 additions & 0 deletions

@@ -42,6 +42,12 @@ vpsrlq $32,%ymm\r0,%ymm\r0
 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
 .endm

+/*
+ * Compute l + h, montmul(h - l, zh) then store the results back to l, h
+ * respectively.
+ *
+ * The general abs bound of Montgomery multiplication is 3q/4.
+ */
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 vpsubd %ymm\l,%ymm\h,%ymm12
 vpaddd %ymm\h,%ymm\l,%ymm\l
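For readers less used to the AVX2 register-level view, here is a scalar C sketch of what this butterfly computes and where the quoted 3q/4 bound comes from. The helper names (montgomery_reduce, invntt_butterfly) are illustrative rather than taken from the repository, and the derivation assumes the precomputed twiddle factors are reduced to (-q/2, q/2]; q and q^-1 mod 2^32 are the ML-DSA constants.

#include <stdint.h>

#define MLDSA_Q 8380417     /* q = 2^23 - 2^13 + 1 */
#define MLDSA_QINV 58728449 /* q^-1 mod 2^32 */

/* Signed Montgomery reduction: returns r with r * 2^32 == a (mod q) and
 * |r| <= |a|/2^32 + q/2. The cast to int32_t takes the low 32 bits of the
 * product, as in the reference implementation. */
int32_t montgomery_reduce(int64_t a)
{
  int32_t m = (int32_t)((uint32_t)a * (uint32_t)MLDSA_QINV); /* a*q^-1 mod 2^32 */
  return (int32_t)((a - (int64_t)m * MLDSA_Q) >> 32);
}

/* Gentleman-Sande butterfly as documented above:
 *   l' = l + h,  h' = montmul(h - l, zeta).
 * For any int32 value t = h - l and |zeta| <= q/2,
 *   |h'| <= |t*zeta|/2^32 + q/2 <= 2^31*(q/2)/2^32 + q/2 = 3q/4,
 * which is the "general abs bound" above, while the sum l + h at most
 * doubles the coefficient bound of the current layer. */
void invntt_butterfly(int32_t *l, int32_t *h, int32_t zeta)
{
  int32_t t = *h - *l;
  *l = *l + *h;
  *h = montgomery_reduce((int64_t)t * zeta);
}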
@@ -73,6 +79,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
 vmovdqa 256*\off+192(%rdi),%ymm10
 vmovdqa 256*\off+224(%rdi),%ymm11

+/* All: abs bound < q */
+
 /* level 0 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15

@@ -98,6 +106,18 @@ vmovshdup %ymm3,%ymm1
 vmovshdup %ymm15,%ymm2
 butterfly 10,11,1,3,2,15

+/* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < 3q/4 */
+/*
+ * Note that since 2^31 / q > 256, the sum of all 256 coefficients does not
+ * overflow. This allows us to greatly simplify the range analysis by relaxing
+ * and unifying the bounds of all coefficients on the same layer. As a concrete
+ * example, here we relax the bounds on 5, 7, 9, 11 and conclude that
+ *
+ * All: abs bound < 2q
+ *
+ * In all following layers, we do the same relaxation without explicit mention.
+ */
+
 /* level 1 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15
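The inequality this relaxation rests on is easy to check: 256 * q = 2,145,386,752 while 2^31 = 2,147,483,648, so 2^31 / q is roughly 256.25 > 256. A standalone compile-time form of the same check (with q as in ML-DSA; not part of the patch) could read:

#include <stdint.h>

#define MLDSA_Q 8380417 /* q = 2^23 - 2^13 + 1 */

/* 256*q = 2145386752 < 2^31 = 2147483648, so a coefficient whose bound
 * doubles on each of the eight inverse-NTT levels, starting from |x| < q,
 * still fits in an int32_t. */
_Static_assert((int64_t)256 * MLDSA_Q < ((int64_t)1 << 31),
               "2^31 / q must exceed 256");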
@@ -113,6 +133,8 @@ vmovshdup %ymm15,%ymm2
 butterfly 8,10,1,3,2,15
 butterfly 9,11,1,3,2,15

+/* All: abs bound < 4q */
+
 /* level 2 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15

@@ -123,6 +145,8 @@ butterfly 5,9,1,3,2,15
 butterfly 6,10,1,3,2,15
 butterfly 7,11,1,3,2,15

+/* All: abs bound < 8q */
+
 /* level 3 */
 shuffle2 4,5,3,5
 shuffle2 6,7,4,7

@@ -136,6 +160,8 @@ butterfly 4,7
 butterfly 6,9
 butterfly 8,11

+/* All: abs bound < 16q */
+
 /* level 4 */
 shuffle4 3,4,10,4
 shuffle4 6,8,3,8

@@ -149,6 +175,8 @@ butterfly 3,8
 butterfly 6,7
 butterfly 5,11

+/* All: abs bound < 32q */
+
 /* level 5 */
 shuffle8 10,3,9,3
 shuffle8 6,5,10,5

@@ -162,6 +190,8 @@ butterfly 10,5
 butterfly 6,8
 butterfly 4,11

+/* All: abs bound < 64q */
+
 vmovdqa %ymm9,256*\off+ 0(%rdi)
 vmovdqa %ymm10,256*\off+ 32(%rdi)
 vmovdqa %ymm6,256*\off+ 64(%rdi)

@@ -193,6 +223,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
 butterfly 8,10
 butterfly 9,11

+/* All: abs bound < 128q */
+
 /* level 7 */
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2

@@ -202,6 +234,8 @@ butterfly 5,9
 butterfly 6,10
 butterfly 7,11

+/* All: abs bound < 256q */
+
 vmovdqa %ymm8,512+32*\off(%rdi)
 vmovdqa %ymm9,640+32*\off(%rdi)
 vmovdqa %ymm10,768+32*\off(%rdi)

@@ -255,6 +289,13 @@ vmovshdup %ymm7,%ymm7
 vpblendd $0xAA,%ymm8,%ymm6,%ymm6
 vpblendd $0xAA,%ymm9,%ymm7,%ymm7

+/*
+ * All coefficients are Montgomery-multiplied with the same constant. This
+ * reduces the magnitudes of all coefficients and results in the bound
+ *
+ * All: abs bound < 3q/4
+ */
+
 vmovdqa %ymm4, 0+32*\off(%rdi)
 vmovdqa %ymm5,128+32*\off(%rdi)
 vmovdqa %ymm6,256+32*\off(%rdi)

mldsa/native/x86_64/src/ntt.S

Lines changed: 28 additions & 0 deletions

@@ -43,6 +43,16 @@ vpsrlq $32,%ymm\r0,%ymm\r0
 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
 .endm

+/*
+ * Compute l + montmul(h, zh), l - montmul(h, zh) then store the results back to
+ * l, h respectively.
+ *
+ * Although the general abs bound of Montgomery multiplication is 3q/4, we use
+ * the more convenient bound q here.
+ *
+ * In conclusion, the magnitudes of all coefficients grow by at most q after
+ * each layer.
+ */
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 vpmuldq %ymm\zl0,%ymm\h,%ymm13
 vmovshdup %ymm\h,%ymm12
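Similarly, a scalar C model of this forward (Cooley-Tukey) butterfly may help; the helper names are illustrative, and montgomery_reduce is the same sketch already given for intt.S, repeated so the fragment stands alone.

#include <stdint.h>

#define MLDSA_Q 8380417     /* q = 2^23 - 2^13 + 1 */
#define MLDSA_QINV 58728449 /* q^-1 mod 2^32 */

/* Signed Montgomery reduction, |result| <= |a|/2^32 + q/2 (see the intt.S sketch). */
int32_t montgomery_reduce(int64_t a)
{
  int32_t m = (int32_t)((uint32_t)a * (uint32_t)MLDSA_QINV);
  return (int32_t)((a - (int64_t)m * MLDSA_Q) >> 32);
}

/* Cooley-Tukey butterfly as documented above:
 *   t = montmul(h, zeta),  l' = l + t,  h' = l - t.
 * With the relaxed bound |t| < q (instead of the tighter 3q/4), each level
 * adds at most q to every coefficient's absolute bound, so after level k the
 * bound is (k + 2)*q, matching the 2q, 3q, ..., 9q annotations below. */
void ntt_butterfly(int32_t *l, int32_t *h, int32_t zeta)
{
  int32_t t = montgomery_reduce((int64_t)(*h) * zeta);
  *h = *l - t;
  *l = *l + t;
}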
@@ -81,11 +91,15 @@ vmovdqa 640+32*\off(%rdi),%ymm9
 vmovdqa 768+32*\off(%rdi),%ymm10
 vmovdqa 896+32*\off(%rdi),%ymm11

+/* All: abs bound < q */
+
 butterfly 4,8
 butterfly 5,9
 butterfly 6,10
 butterfly 7,11

+/* All: abs bound < 2q */
+
 /* level 1 */
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2

@@ -97,6 +111,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
 butterfly 8,10
 butterfly 9,11

+/* All: abs bound < 3q */
+
 vmovdqa %ymm4, 0+32*\off(%rdi)
 vmovdqa %ymm5,128+32*\off(%rdi)
 vmovdqa %ymm6,256+32*\off(%rdi)

@@ -131,6 +147,8 @@ shuffle8 5,9,4,9
 shuffle8 6,10,5,10
 shuffle8 7,11,6,11

+/* All: abs bound < 4q */
+
 /* level 3 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2

@@ -145,6 +163,8 @@ shuffle4 8,10,3,10
 shuffle4 4,6,8,6
 shuffle4 9,11,4,11

+/* All: abs bound < 5q */
+
 /* level 4 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2

@@ -159,6 +179,8 @@ shuffle2 5,6,7,6
 shuffle2 3,4,5,4
 shuffle2 10,11,3,11

+/* All: abs bound < 6q */
+
 /* level 5 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2

@@ -170,6 +192,8 @@ butterfly 8,4,1,10,2,15
 butterfly 7,3,1,10,2,15
 butterfly 6,11,1,10,2,15

+/* All: abs bound < 7q */
+
 /* level 6 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2

@@ -185,6 +209,8 @@ vmovshdup %ymm2,%ymm15
 butterfly 5,3,1,10,2,15
 butterfly 4,11,1,10,2,15

+/* All: abs bound < 8q */
+
 /* level 7 */
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2

@@ -210,6 +236,8 @@ vpsrlq $32,%ymm1,%ymm10
 vmovshdup %ymm2,%ymm15
 butterfly 3,11,1,10,2,15

+/* All: abs bound < 9q */
+
 vmovdqa %ymm9,256*\off+ 0(%rdi)
 vmovdqa %ymm8,256*\off+ 32(%rdi)
 vmovdqa %ymm7,256*\off+ 64(%rdi)
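Taken together with the initial bound of q, these annotations say the forward transform leaves every coefficient below 9q in absolute value. A standalone compile-time sanity check of that headroom (q as in ML-DSA; not part of the patch):

#include <stdint.h>

#define MLDSA_Q 8380417 /* q = 2^23 - 2^13 + 1 */

/* |input| < q and each of the 8 levels adds at most q, so the output bound
 * is 9*q = 75423753, far below 2^31 = 2147483648. */
_Static_assert((int64_t)9 * MLDSA_Q < ((int64_t)1 << 31),
               "forward-NTT output bound 9q must fit in int32_t");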
