Commit 58da580

Merge pull request #1177 from pq-code-package/x86-asm-remove-local-call
Use asm macros instead of local function calls in x86 asm
2 parents b68ef1c + 6baf76f commit 58da580

10 files changed, +489 -200 lines changed
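The change is the same in every file below: the code handling 128 coefficients used to sit behind a local label reached via call/ret, with the pointer registers bumped by add between the two calls; it is now a GNU as .macro taking the block offset(s) as parameters and expanded twice at assembly time. A minimal sketch of the pattern, with hypothetical toy_* names that are not part of this PR:

.macro toy_copy_16_coefficients offset
    vmovdqa (\offset + 0)(%rsi), %ymm0    # \offset is substituted at assembly time
    vmovdqa %ymm0, (\offset + 0)(%rdi)    # and folded into the displacement
.endm

.text
.global toy_copy_avx2
.balign 4
toy_copy_avx2:
    toy_copy_16_coefficients 0
    toy_copy_16_coefficients 32           # no add/call/ret between the two blocks
    ret

The macro body is inlined at each use, so the add/call/ret glue disappears, at the cost of duplicating the body in the text section.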


dev/x86_64/src/nttfrombytes.S

Lines changed: 26 additions & 27 deletions
@@ -27,26 +27,14 @@
 #include "fq.inc"
 #include "shuffle.inc"
 
-.text
-.global MLK_ASM_NAMESPACE(nttfrombytes_avx2)
-.balign 4
-MLK_ASM_FN_SYMBOL(nttfrombytes_avx2)
-#consts
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
-call nttfrombytes_avx2_core
-add $256,%rdi
-add $192,%rsi
-call nttfrombytes_avx2_core
-ret
-
-nttfrombytes_avx2_core:
+.macro nttfrombytes_128_coefficients offset_in offset_out
 #load
-vmovdqu (%rsi),%ymm4
-vmovdqu 32(%rsi),%ymm5
-vmovdqu 64(%rsi),%ymm6
-vmovdqu 96(%rsi),%ymm7
-vmovdqu 128(%rsi),%ymm8
-vmovdqu 160(%rsi),%ymm9
+vmovdqu (\offset_in + 0)(%rsi), %ymm4
+vmovdqu (\offset_in + 32)(%rsi), %ymm5
+vmovdqu (\offset_in + 64)(%rsi), %ymm6
+vmovdqu (\offset_in + 96)(%rsi), %ymm7
+vmovdqu (\offset_in +128)(%rsi), %ymm8
+vmovdqu (\offset_in +160)(%rsi), %ymm9
 
 shuffle8 4,7,3,7
 shuffle8 5,8,4,8
@@ -94,14 +82,25 @@ vpsrlw $4,%ymm9,%ymm1
 vpand %ymm0,%ymm1,%ymm1
 
 #store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm11,32(%rdi)
-vmovdqa %ymm12,64(%rdi)
-vmovdqa %ymm13,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm14,160(%rdi)
-vmovdqa %ymm15,192(%rdi)
-vmovdqa %ymm1,224(%rdi)
+vmovdqa %ymm10, (\offset_out + 0)(%rdi)
+vmovdqa %ymm11, (\offset_out + 32)(%rdi)
+vmovdqa %ymm12, (\offset_out + 64)(%rdi)
+vmovdqa %ymm13, (\offset_out + 96)(%rdi)
+vmovdqa %ymm8, (\offset_out +128)(%rdi)
+vmovdqa %ymm14, (\offset_out +160)(%rdi)
+vmovdqa %ymm15, (\offset_out +192)(%rdi)
+vmovdqa %ymm1, (\offset_out +224)(%rdi)
+.endm
+
+.text
+.global MLK_ASM_NAMESPACE(nttfrombytes_avx2)
+.balign 4
+MLK_ASM_FN_SYMBOL(nttfrombytes_avx2)
+#consts
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
+
+nttfrombytes_128_coefficients 0 0
+nttfrombytes_128_coefficients 192 256
 
 ret
 
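To make the expansion concrete: for the second invocation, nttfrombytes_128_coefficients 192 256, the assembler substitutes the parameters and folds the constant expressions into plain displacements, which replaces the old add $192,%rsi / add $256,%rdi / call nttfrombytes_avx2_core sequence. An illustrative sketch of the expanded code (not text from the diff):

vmovdqu (192 + 0)(%rsi), %ymm4      # i.e. vmovdqu 192(%rsi),%ymm4
vmovdqu (192 + 32)(%rsi), %ymm5     # i.e. vmovdqu 224(%rsi),%ymm5
# ... remaining loads, shuffles and stores expand the same way ...
vmovdqa %ymm1, (256 +224)(%rdi)     # i.e. vmovdqa 480(%rdi),%ymm1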

dev/x86_64/src/ntttobytes.S

Lines changed: 27 additions & 27 deletions
@@ -27,28 +27,16 @@
 #include "fq.inc"
 #include "shuffle.inc"
 
-.text
-.global MLK_ASM_NAMESPACE(ntttobytes_avx2)
-.balign 4
-MLK_ASM_FN_SYMBOL(ntttobytes_avx2)
-#consts
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0
-call ntttobytes_avx2_core
-add $256,%rsi
-add $192,%rdi
-call ntttobytes_avx2_core
-ret
-
-ntttobytes_avx2_core:
+.macro ntttobytes_128_coefficients offset_in offset_out
 #load
-vmovdqa (%rsi),%ymm5
-vmovdqa 32(%rsi),%ymm6
-vmovdqa 64(%rsi),%ymm7
-vmovdqa 96(%rsi),%ymm8
-vmovdqa 128(%rsi),%ymm9
-vmovdqa 160(%rsi),%ymm10
-vmovdqa 192(%rsi),%ymm11
-vmovdqa 224(%rsi),%ymm12
+vmovdqa (\offset_in + 0)(%rsi), %ymm5
+vmovdqa (\offset_in + 32)(%rsi), %ymm6
+vmovdqa (\offset_in + 64)(%rsi), %ymm7
+vmovdqa (\offset_in + 96)(%rsi), %ymm8
+vmovdqa (\offset_in + 128)(%rsi), %ymm9
+vmovdqa (\offset_in + 160)(%rsi), %ymm10
+vmovdqa (\offset_in + 192)(%rsi), %ymm11
+vmovdqa (\offset_in + 224)(%rsi), %ymm12
 
 #bitpack
 vpsllw $12,%ymm6,%ymm4
@@ -90,12 +78,24 @@ shuffle8 6,3,7,3
 shuffle8 4,9,6,9
 
 #store
-vmovdqu %ymm5,(%rdi)
-vmovdqu %ymm7,32(%rdi)
-vmovdqu %ymm6,64(%rdi)
-vmovdqu %ymm8,96(%rdi)
-vmovdqu %ymm3,128(%rdi)
-vmovdqu %ymm9,160(%rdi)
+vmovdqu %ymm5, (\offset_out + 0)(%rdi)
+vmovdqu %ymm7, (\offset_out + 32)(%rdi)
+vmovdqu %ymm6, (\offset_out + 64)(%rdi)
+vmovdqu %ymm8, (\offset_out + 96)(%rdi)
+vmovdqu %ymm3, (\offset_out +128)(%rdi)
+vmovdqu %ymm9, (\offset_out +160)(%rdi)
+.endm
+
+
+.text
+.global MLK_ASM_NAMESPACE(ntttobytes_avx2)
+.balign 4
+MLK_ASM_FN_SYMBOL(ntttobytes_avx2)
+#consts
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0
+
+ntttobytes_128_coefficients 0 0
+ntttobytes_128_coefficients 256 192
 
 ret
 
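The offsets 256 and 192 in the second invocation come from the two representations of half a polynomial, assuming the usual ML-KEM layout of 16-bit coefficients packed to 12 bits each:

    128 coefficients * 2 bytes (int16)   = 256 bytes   -> offset into the coefficient array (%rsi here)
    128 coefficients * 12 bits / 8 bits  = 192 bytes   -> offset into the packed byte string (%rdi here)

So ntttobytes reads its second half at +256 and writes it at +192, while nttfrombytes above does the reverse (reads packed bytes at +192, writes coefficients at +256).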

dev/x86_64/src/nttunpack.S

Lines changed: 26 additions & 26 deletions
@@ -27,25 +27,16 @@
 #include "fq.inc"
 #include "shuffle.inc"
 
-.text
-.global MLK_ASM_NAMESPACE(nttunpack_avx2)
-.balign 4
-MLK_ASM_FN_SYMBOL(nttunpack_avx2)
-call nttunpack_avx2_core
-add $256,%rdi
-call nttunpack_avx2_core
-ret
-
-nttunpack_avx2_core:
+.macro nttunpack_128_coefficients offset
 #load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
+vmovdqa (\offset + 0)(%rdi), %ymm4
+vmovdqa (\offset + 32)(%rdi), %ymm5
+vmovdqa (\offset + 64)(%rdi), %ymm6
+vmovdqa (\offset + 96)(%rdi), %ymm7
+vmovdqa (\offset + 128)(%rdi), %ymm8
+vmovdqa (\offset + 160)(%rdi), %ymm9
+vmovdqa (\offset + 192)(%rdi), %ymm10
+vmovdqa (\offset + 224)(%rdi), %ymm11
 
 shuffle8 4,8,3,8
 shuffle8 5,9,4,9
@@ -68,14 +59,23 @@ shuffle1 7,3,8,3
 shuffle1 6,11,7,11
 
 #store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm5,32(%rdi)
-vmovdqa %ymm9,64(%rdi)
-vmovdqa %ymm4,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm3,160(%rdi)
-vmovdqa %ymm7,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
+vmovdqa %ymm10, (\offset + 0)(%rdi)
+vmovdqa %ymm5, (\offset + 32)(%rdi)
+vmovdqa %ymm9, (\offset + 64)(%rdi)
+vmovdqa %ymm4, (\offset + 96)(%rdi)
+vmovdqa %ymm8, (\offset +128)(%rdi)
+vmovdqa %ymm3, (\offset +160)(%rdi)
+vmovdqa %ymm7, (\offset +192)(%rdi)
+vmovdqa %ymm11, (\offset +224)(%rdi)
+.endm
+
+.text
+.global MLK_ASM_NAMESPACE(nttunpack_avx2)
+.balign 4
+MLK_ASM_FN_SYMBOL(nttunpack_avx2)
+
+nttunpack_128_coefficients 0
+nttunpack_128_coefficients 2*128
 
 ret
 

dev/x86_64/src/reduce.S

Lines changed: 44 additions & 45 deletions
@@ -19,7 +19,8 @@
 * Changes:
 * - Add call to csub in reduce128_avx to produce outputs
 *   in [0,1,...,q-1] rather than [0,1,...,q], matching the
-*   semantics of mlk_poly_reduce().
+*   semantics of mlk_poly_reduce(),
+* - Use a macro instead of a local function call.
 */
 
 #include "../../../common.h"
@@ -31,56 +32,54 @@
 #include "consts.h"
 #include "fq.inc"
 
+.macro reduce_128_coefficients offset
+vmovdqa (\offset + 0)(%rdi), %ymm2
+vmovdqa (\offset + 32)(%rdi), %ymm3
+vmovdqa (\offset + 64)(%rdi), %ymm4
+vmovdqa (\offset + 96)(%rdi), %ymm5
+vmovdqa (\offset + 128)(%rdi), %ymm6
+vmovdqa (\offset + 160)(%rdi), %ymm7
+vmovdqa (\offset + 192)(%rdi), %ymm8
+vmovdqa (\offset + 224)(%rdi), %ymm9
+
+red16 2
+red16 3
+red16 4
+red16 5
+red16 6
+red16 7
+red16 8
+red16 9
+
+csubq 2
+csubq 3
+csubq 4
+csubq 5
+csubq 6
+csubq 7
+csubq 8
+csubq 9
+
+vmovdqa %ymm2, (\offset + 0)(%rdi)
+vmovdqa %ymm3, (\offset + 32)(%rdi)
+vmovdqa %ymm4, (\offset + 64)(%rdi)
+vmovdqa %ymm5, (\offset + 96)(%rdi)
+vmovdqa %ymm6, (\offset + 128)(%rdi)
+vmovdqa %ymm7, (\offset + 160)(%rdi)
+vmovdqa %ymm8, (\offset + 192)(%rdi)
+vmovdqa %ymm9, (\offset + 224)(%rdi)
+.endm
+
 .text
 .global MLK_ASM_NAMESPACE(reduce_avx2)
 .balign 4
 MLK_ASM_FN_SYMBOL(reduce_avx2)
 #consts
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1
-call reduce_avx2_core
-add $256,%rdi
-call reduce_avx2_core
-ret
-
-reduce_avx2_core:
-#load
-vmovdqa (%rdi),%ymm2
-vmovdqa 32(%rdi),%ymm3
-vmovdqa 64(%rdi),%ymm4
-vmovdqa 96(%rdi),%ymm5
-vmovdqa 128(%rdi),%ymm6
-vmovdqa 160(%rdi),%ymm7
-vmovdqa 192(%rdi),%ymm8
-vmovdqa 224(%rdi),%ymm9
-
-red16 2
-red16 3
-red16 4
-red16 5
-red16 6
-red16 7
-red16 8
-red16 9
-
-csubq 2
-csubq 3
-csubq 4
-csubq 5
-csubq 6
-csubq 7
-csubq 8
-csubq 9
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1
 
-#store
-vmovdqa %ymm2,(%rdi)
-vmovdqa %ymm3,32(%rdi)
-vmovdqa %ymm4,64(%rdi)
-vmovdqa %ymm5,96(%rdi)
-vmovdqa %ymm6,128(%rdi)
-vmovdqa %ymm7,160(%rdi)
-vmovdqa %ymm8,192(%rdi)
-vmovdqa %ymm9,224(%rdi)
+reduce_128_coefficients 0
+reduce_128_coefficients 128*2
 
 ret
 

dev/x86_64/src/tomont.S

Lines changed: 32 additions & 31 deletions
@@ -18,7 +18,8 @@
 * Changes:
 * - Add call to csub in reduce128_avx to produce outputs
 *   in [0,1,...,q-1] rather than [0,1,...,q], matching the
-*   semantics of mlk_poly_reduce().
+*   semantics of mlk_poly_reduce(),
+* - Use a macro instead of a local function call.
 */
 
 #include "../../../common.h"
@@ -29,29 +30,16 @@
 #include "consts.h"
 #include "fq.inc"
 
-.text
-.global MLK_ASM_NAMESPACE(tomont_avx2)
-.balign 4
-MLK_ASM_FN_SYMBOL(tomont_avx2)
-#consts
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
-call tomont_avx2_core
-add $256,%rdi
-call tomont_avx2_core
-ret
-
-tomont_avx2_core:
+.macro tomont_128_coefficients offset
 #load
-vmovdqa (%rdi),%ymm3
-vmovdqa 32(%rdi),%ymm4
-vmovdqa 64(%rdi),%ymm5
-vmovdqa 96(%rdi),%ymm6
-vmovdqa 128(%rdi),%ymm7
-vmovdqa 160(%rdi),%ymm8
-vmovdqa 192(%rdi),%ymm9
-vmovdqa 224(%rdi),%ymm10
+vmovdqa (\offset + 0)(%rdi), %ymm3
+vmovdqa (\offset + 32)(%rdi), %ymm4
+vmovdqa (\offset + 64)(%rdi), %ymm5
+vmovdqa (\offset + 96)(%rdi), %ymm6
+vmovdqa (\offset +128)(%rdi), %ymm7
+vmovdqa (\offset +160)(%rdi), %ymm8
+vmovdqa (\offset +192)(%rdi), %ymm9
+vmovdqa (\offset +224)(%rdi), %ymm10
 
 fqmulprecomp 1,2,3,11
 fqmulprecomp 1,2,4,12
@@ -63,14 +51,27 @@ fqmulprecomp 1,2,9,12
 fqmulprecomp 1,2,10,13
 
 #store
-vmovdqa %ymm3,(%rdi)
-vmovdqa %ymm4,32(%rdi)
-vmovdqa %ymm5,64(%rdi)
-vmovdqa %ymm6,96(%rdi)
-vmovdqa %ymm7,128(%rdi)
-vmovdqa %ymm8,160(%rdi)
-vmovdqa %ymm9,192(%rdi)
-vmovdqa %ymm10,224(%rdi)
+vmovdqa %ymm3, (\offset + 0)(%rdi)
+vmovdqa %ymm4, (\offset + 32)(%rdi)
+vmovdqa %ymm5, (\offset + 64)(%rdi)
+vmovdqa %ymm6, (\offset + 96)(%rdi)
+vmovdqa %ymm7, (\offset +128)(%rdi)
+vmovdqa %ymm8, (\offset +160)(%rdi)
+vmovdqa %ymm9, (\offset +192)(%rdi)
+vmovdqa %ymm10, (\offset +224)(%rdi)
+.endm
+
+.text
+.global MLK_ASM_NAMESPACE(tomont_avx2)
+.balign 4
+MLK_ASM_FN_SYMBOL(tomont_avx2)
+#consts
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
+vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
+
+tomont_128_coefficients 0
+tomont_128_coefficients 2*128
 
 ret
 
