Skip to content

Commit ca27600

Browse files
committed
Add bounds reasoning comments to AVX2 basemul
Signed-off-by: jammychiou1 <[email protected]>
1 parent 4b3b1a5 commit ca27600

File tree

4 files changed

+53
-0
lines changed

4 files changed

+53
-0
lines changed

dev/x86_64/src/pointwise.S

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ _looptop1:
6161
vpsrlq ymm11, ymm10, 32
6262
vpsrlq ymm13, ymm12, 32
6363
vmovshdup ymm15, ymm14
64+
/* All: abs bound < 9q */
6465

6566
// Multiply
6667
vpmuldq ymm2, ymm2, ymm10
@@ -69,6 +70,7 @@ _looptop1:
6970
vpmuldq ymm5, ymm5, ymm13
7071
vpmuldq ymm6, ymm6, ymm14
7172
vpmuldq ymm7, ymm7, ymm15
73+
/* All: abs bound < 81q^2 < 81*2^46 < 2^53 = 2^21R < qR/2, using 2^22 < q < 2^23 and R = 2^32 */
7274

7375
// Reduce
7476
vpmuldq ymm10, ymm0, ymm2
@@ -92,6 +94,11 @@ _looptop1:
9294
vpsrlq ymm2, ymm2, 32
9395
vpsrlq ymm4, ymm4, 32
9496
vmovshdup ymm6, ymm6
97+
/*
98+
* All coefficients are Montgomery-reduced. This results in the bound
99+
*
100+
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
101+
*/
95102

96103
// Store
97104
vpblendd ymm2, ymm2, ymm3, 0xAA

dev/x86_64/src/pointwise_acc_l4.S

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,17 @@
3737
vpsrlq ymm9, ymm8, 32
3838
vmovshdup ymm11, ymm10
3939
vmovshdup ymm13, ymm12
40+
/*
41+
* ymm6-ymm9: from the first input polynomial, abs bound < q
42+
* ymm10-ymm13: from the second input polynomial, abs bound < 9q
43+
*/
4044

4145
// Multiply
4246
vpmuldq ymm6, ymm6, ymm10
4347
vpmuldq ymm7, ymm7, ymm11
4448
vpmuldq ymm8, ymm8, ymm12
4549
vpmuldq ymm9, ymm9, ymm13
50+
/* All: abs bound < 9q^2 */
4651
.endm
4752

4853
.macro acc
@@ -80,15 +85,19 @@ _looptop2:
8085
vmovdqa ymm3, ymm7
8186
vmovdqa ymm4, ymm8
8287
vmovdqa ymm5, ymm9
88+
/* All: abs bound < 9q^2 */
8389

8490
pointwise 1024
8591
acc
92+
/* All: abs bound < 18q^2 */
8693

8794
pointwise 2048
8895
acc
96+
/* All: abs bound < 27q^2 */
8997

9098
pointwise 3072
9199
acc
100+
/* All: abs bound < 36q^2 < 36*2^46 < 2^52 = 2^20R < qR/2, using 2^21 < q < 2^23 and R = 2^32 */
92101

93102
// Reduce
94103
vpmuldq ymm6, ymm0, ymm2
@@ -105,6 +114,11 @@ _looptop2:
105114
vpsubq ymm5, ymm5, ymm9
106115
vpsrlq ymm2, ymm2, 32
107116
vmovshdup ymm4, ymm4
117+
/*
118+
* All coefficients are Montgomery-reduced. This results in the bound
119+
*
120+
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
121+
*/
108122

109123
// Store
110124
vpblendd ymm2, ymm2, ymm3, 0xAA

dev/x86_64/src/pointwise_acc_l5.S

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,17 @@
3737
vpsrlq ymm9, ymm8, 32
3838
vmovshdup ymm11, ymm10
3939
vmovshdup ymm13, ymm12
40+
/*
41+
* ymm6-ymm9: from the first input polynomial, abs bound < q
42+
* ymm10-ymm13: from the second input polynomial, abs bound < 9q
43+
*/
4044

4145
// Multiply
4246
vpmuldq ymm6, ymm6, ymm10
4347
vpmuldq ymm7, ymm7, ymm11
4448
vpmuldq ymm8, ymm8, ymm12
4549
vpmuldq ymm9, ymm9, ymm13
50+
/* All: abs bound < 9q^2 */
4651
.endm
4752

4853
.macro acc
@@ -80,18 +85,23 @@ _looptop2:
8085
vmovdqa ymm3, ymm7
8186
vmovdqa ymm4, ymm8
8287
vmovdqa ymm5, ymm9
88+
/* All: abs bound < 9q^2 */
8389

8490
pointwise 1024
8591
acc
92+
/* All: abs bound < 18q^2 */
8693

8794
pointwise 2048
8895
acc
96+
/* All: abs bound < 27q^2 */
8997

9098
pointwise 3072
9199
acc
100+
/* All: abs bound < 36q^2 */
92101

93102
pointwise 4096
94103
acc
104+
/* All: abs bound < 45q^2 < 45*2^46 < 2^52 = 2^20R < qR/2, using 2^21 < q < 2^23 and R = 2^32 */
95105

96106
// Reduce
97107
vpmuldq ymm6, ymm0, ymm2
@@ -108,6 +118,11 @@ _looptop2:
108118
vpsubq ymm5, ymm5, ymm9
109119
vpsrlq ymm2, ymm2, 32
110120
vmovshdup ymm4, ymm4
121+
/*
122+
* All coefficients are Montgomery-reduced. This results in the bound
123+
*
124+
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
125+
*/
111126

112127
// Store
113128
vpblendd ymm2, ymm2, ymm3, 0xAA

dev/x86_64/src/pointwise_acc_l7.S

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,17 @@
3737
vpsrlq ymm9, ymm8, 32
3838
vmovshdup ymm11, ymm10
3939
vmovshdup ymm13, ymm12
40+
/*
41+
* ymm6-ymm9: from the first input polynomial, abs bound < q
42+
* ymm10-ymm13: from the second input polynomial, abs bound < 9q
43+
*/
4044

4145
// Multiply
4246
vpmuldq ymm6, ymm6, ymm10
4347
vpmuldq ymm7, ymm7, ymm11
4448
vpmuldq ymm8, ymm8, ymm12
4549
vpmuldq ymm9, ymm9, ymm13
50+
/* All: abs bound < 9q^2 */
4651
.endm
4752

4853
.macro acc
@@ -80,24 +85,31 @@ _looptop2:
8085
vmovdqa ymm3, ymm7
8186
vmovdqa ymm4, ymm8
8287
vmovdqa ymm5, ymm9
88+
/* All: abs bound < 9q^2 */
8389

8490
pointwise 1024
8591
acc
92+
/* All: abs bound < 18q^2 */
8693

8794
pointwise 2048
8895
acc
96+
/* All: abs bound < 27q^2 */
8997

9098
pointwise 3072
9199
acc
100+
/* All: abs bound < 36q^2 */
92101

93102
pointwise 4096
94103
acc
104+
/* All: abs bound < 45q^2 */
95105

96106
pointwise 5120
97107
acc
108+
/* All: abs bound < 54q^2 */
98109

99110
pointwise 6144
100111
acc
112+
/* All: abs bound < 63q^2 < 63*2^46 < 2^52 = 2^20R < qR/2, using 2^21 < q < 2^23 and R = 2^32 */
101113

102114
// Reduce
103115
vpmuldq ymm6, ymm0, ymm2
@@ -114,6 +126,11 @@ _looptop2:
114126
vpsubq ymm5, ymm5, ymm9
115127
vpsrlq ymm2, ymm2, 32
116128
vmovshdup ymm4, ymm4
129+
/*
130+
* All coefficients are Montgomery-reduced. This results in the bound
131+
*
132+
* All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q
133+
*/
117134

118135
// Store
119136
vpblendd ymm2, ymm2, ymm3, 0xAA

0 commit comments

Comments
 (0)