Commit 6baf76f

dkostic authored and hanno-becker committed

Rewrote tomont.S with a macro instead of the function call

Signed-off-by: Dusan Kostic <[email protected]>
1 parent 752f8aa commit 6baf76f
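The rewrite turns the shared body, previously reached twice via call tomont_avx2_core with an add $256,%rdi in between, into a GAS .macro expanded twice, so the per-half byte offset is folded into every addressing mode and no call/ret or %rdi adjustment remains. A minimal standalone sketch of the same pattern follows; double_block, double_halves, and the trivial vpaddw body are illustrative stand-ins for the commit's tomont_128_coefficients and fqmulprecomp work, not its actual code.

/* Sketch only: the call-to-macro pattern, with hypothetical names. */
.macro double_block offset           /* one 32-byte block at rdi+offset */
vmovdqa (\offset + 0)(%rdi), %ymm3   /* load 16 16-bit coefficients */
vpaddw %ymm3, %ymm3, %ymm3           /* placeholder transform (doubles each lane) */
vmovdqa %ymm3, (\offset + 0)(%rdi)   /* store back in place */
.endm

.text
.global double_halves
.balign 4
double_halves:
double_block 0                       /* first block */
double_block 32                      /* second block: \offset expands to 32 */
ret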

File tree

2 files changed: +82 −40 lines changed

dev/x86_64/src/tomont.S

Lines changed: 32 additions & 31 deletions
@@ -18,7 +18,8 @@
  * Changes:
  * - Add call to csub in reduce128_avx to produce outputs
  *   in [0,1,...,q-1] rather than [0,1,...,q], matching the
- *   semantics of mlk_poly_reduce().
+ *   semantics of mlk_poly_reduce(),
+ * - Use a macro instead of a local function call.
  */

 #include "../../../common.h"
@@ -29,29 +30,16 @@
 #include "consts.h"
 #include "fq.inc"

-.text
-.global MLK_ASM_NAMESPACE(tomont_avx2)
-.balign 4
-MLK_ASM_FN_SYMBOL(tomont_avx2)
-#consts
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
-call tomont_avx2_core
-add $256,%rdi
-call tomont_avx2_core
-ret
-
-tomont_avx2_core:
+.macro tomont_128_coefficients offset
 #load
-vmovdqa (%rdi),%ymm3
-vmovdqa 32(%rdi),%ymm4
-vmovdqa 64(%rdi),%ymm5
-vmovdqa 96(%rdi),%ymm6
-vmovdqa 128(%rdi),%ymm7
-vmovdqa 160(%rdi),%ymm8
-vmovdqa 192(%rdi),%ymm9
-vmovdqa 224(%rdi),%ymm10
+vmovdqa (\offset + 0)(%rdi), %ymm3
+vmovdqa (\offset + 32)(%rdi), %ymm4
+vmovdqa (\offset + 64)(%rdi), %ymm5
+vmovdqa (\offset + 96)(%rdi), %ymm6
+vmovdqa (\offset +128)(%rdi), %ymm7
+vmovdqa (\offset +160)(%rdi), %ymm8
+vmovdqa (\offset +192)(%rdi), %ymm9
+vmovdqa (\offset +224)(%rdi), %ymm10

 fqmulprecomp 1,2,3,11
 fqmulprecomp 1,2,4,12
@@ -63,14 +51,27 @@ fqmulprecomp 1,2,9,12
 fqmulprecomp 1,2,10,13

 #store
-vmovdqa %ymm3,(%rdi)
-vmovdqa %ymm4,32(%rdi)
-vmovdqa %ymm5,64(%rdi)
-vmovdqa %ymm6,96(%rdi)
-vmovdqa %ymm7,128(%rdi)
-vmovdqa %ymm8,160(%rdi)
-vmovdqa %ymm9,192(%rdi)
-vmovdqa %ymm10,224(%rdi)
+vmovdqa %ymm3, (\offset + 0)(%rdi)
+vmovdqa %ymm4, (\offset + 32)(%rdi)
+vmovdqa %ymm5, (\offset + 64)(%rdi)
+vmovdqa %ymm6, (\offset + 96)(%rdi)
+vmovdqa %ymm7, (\offset +128)(%rdi)
+vmovdqa %ymm8, (\offset +160)(%rdi)
+vmovdqa %ymm9, (\offset +192)(%rdi)
+vmovdqa %ymm10, (\offset +224)(%rdi)
+.endm
+
+.text
+.global MLK_ASM_NAMESPACE(tomont_avx2)
+.balign 4
+MLK_ASM_FN_SYMBOL(tomont_avx2)
+#consts
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
+
+tomont_128_coefficients 0
+tomont_128_coefficients 2*128

 ret
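For reading both diffs: each fqmulprecomp 1,2,3,11 invocation above expands, via the included fq.inc, to the four-instruction sequence that the unrolled file below shows verbatim. Here is my annotation of one such step, assuming the standard ML-KEM constants q = 3329, R = 2^16, and montsq = R^2 mod q; this gloss is not part of the commit.

/* Annotated reading of one fqmulprecomp step (assumption: %ymm0 holds
   the 16XQ constant q, %ymm1 holds 16XMONTSQLO = montsq * q^-1 mod R,
   %ymm2 holds 16XMONTSQHI = montsq, as loaded from %rsi at entry). */
vpmullw %ymm1, %ymm3, %ymm11   /* m  = a * (montsq * q^-1) mod R     */
vpmulhw %ymm2, %ymm3, %ymm3    /* hi = (a * montsq) >> 16            */
vpmulhw %ymm0, %ymm11, %ymm11  /* t  = (m * q) >> 16                 */
vpsubw %ymm11, %ymm3, %ymm3    /* a  = hi - t, a Montgomery reduction:
                                  each lane becomes a representative of
                                  a * montsq * R^-1 = a * R mod q, i.e.
                                  the Montgomery form tomont produces  */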

mlkem/src/native/x86_64/src/tomont.S

Lines changed: 50 additions & 9 deletions
@@ -18,7 +18,8 @@
  * Changes:
  * - Add call to csub in reduce128_avx to produce outputs
  *   in [0,1,...,q-1] rather than [0,1,...,q], matching the
- *   semantics of mlk_poly_reduce().
+ *   semantics of mlk_poly_reduce(),
+ * - Use a macro instead of a local function call.
  */

 #include "../../../common.h"
@@ -40,14 +41,6 @@ MLK_ASM_FN_SYMBOL(tomont_avx2)
 vmovdqa (%rsi), %ymm0
 vmovdqa 0xa0(%rsi), %ymm1
 vmovdqa 0xc0(%rsi), %ymm2
-callq Ltomont_avx2_core
-addq $0x100, %rdi # imm = 0x100
-callq Ltomont_avx2_core
-retq
-.cfi_endproc
-
-Ltomont_avx2_core:
-.cfi_startproc
 vmovdqa (%rdi), %ymm3
 vmovdqa 0x20(%rdi), %ymm4
 vmovdqa 0x40(%rdi), %ymm5
@@ -96,6 +89,54 @@ Ltomont_avx2_core:
 vmovdqa %ymm8, 0xa0(%rdi)
 vmovdqa %ymm9, 0xc0(%rdi)
 vmovdqa %ymm10, 0xe0(%rdi)
+vmovdqa 0x100(%rdi), %ymm3
+vmovdqa 0x120(%rdi), %ymm4
+vmovdqa 0x140(%rdi), %ymm5
+vmovdqa 0x160(%rdi), %ymm6
+vmovdqa 0x180(%rdi), %ymm7
+vmovdqa 0x1a0(%rdi), %ymm8
+vmovdqa 0x1c0(%rdi), %ymm9
+vmovdqa 0x1e0(%rdi), %ymm10
+vpmullw %ymm1, %ymm3, %ymm11
+vpmulhw %ymm2, %ymm3, %ymm3
+vpmulhw %ymm0, %ymm11, %ymm11
+vpsubw %ymm11, %ymm3, %ymm3
+vpmullw %ymm1, %ymm4, %ymm12
+vpmulhw %ymm2, %ymm4, %ymm4
+vpmulhw %ymm0, %ymm12, %ymm12
+vpsubw %ymm12, %ymm4, %ymm4
+vpmullw %ymm1, %ymm5, %ymm13
+vpmulhw %ymm2, %ymm5, %ymm5
+vpmulhw %ymm0, %ymm13, %ymm13
+vpsubw %ymm13, %ymm5, %ymm5
+vpmullw %ymm1, %ymm6, %ymm14
+vpmulhw %ymm2, %ymm6, %ymm6
+vpmulhw %ymm0, %ymm14, %ymm14
+vpsubw %ymm14, %ymm6, %ymm6
+vpmullw %ymm1, %ymm7, %ymm15
+vpmulhw %ymm2, %ymm7, %ymm7
+vpmulhw %ymm0, %ymm15, %ymm15
+vpsubw %ymm15, %ymm7, %ymm7
+vpmullw %ymm1, %ymm8, %ymm11
+vpmulhw %ymm2, %ymm8, %ymm8
+vpmulhw %ymm0, %ymm11, %ymm11
+vpsubw %ymm11, %ymm8, %ymm8
+vpmullw %ymm1, %ymm9, %ymm12
+vpmulhw %ymm2, %ymm9, %ymm9
+vpmulhw %ymm0, %ymm12, %ymm12
+vpsubw %ymm12, %ymm9, %ymm9
+vpmullw %ymm1, %ymm10, %ymm13
+vpmulhw %ymm2, %ymm10, %ymm10
+vpmulhw %ymm0, %ymm13, %ymm13
+vpsubw %ymm13, %ymm10, %ymm10
+vmovdqa %ymm3, 0x100(%rdi)
+vmovdqa %ymm4, 0x120(%rdi)
+vmovdqa %ymm5, 0x140(%rdi)
+vmovdqa %ymm6, 0x160(%rdi)
+vmovdqa %ymm7, 0x180(%rdi)
+vmovdqa %ymm8, 0x1a0(%rdi)
+vmovdqa %ymm9, 0x1c0(%rdi)
+vmovdqa %ymm10, 0x1e0(%rdi)
 retq
 .cfi_endproc
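This copy contains no .macro: the two tomont_128_coefficients invocations arrive fully expanded, which is why the added block repeats the load, multiply, store sequence at byte offsets 0x100 through 0x1e0. The offsets line up because coefficients are 16-bit words, so the argument 2*128 is the byte offset of the second group of 128 coefficients. A hand expansion of the first and last load, as a cross-check against the diff above rather than commit text:

/* `tomont_128_coefficients 2*128`, first and last load expanded */
vmovdqa (2*128 + 0)(%rdi), %ymm3      /* = vmovdqa 0x100(%rdi), %ymm3  */
vmovdqa (2*128 + 224)(%rdi), %ymm10   /* = vmovdqa 0x1e0(%rdi), %ymm10 */

An assembler listing with macro expansions enabled (GNU as supports an m modifier among its -a listing flags) should reproduce the same correspondence, assuming a GNU toolchain.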
