Skip to content

Commit c82cda3

Browse files
committed
Eliminate caddq intrinsics
This commit replace the eurrently caddq AVX2 implementation to x86_64 assembly code. Signed-off-by: willieyz <[email protected]>
1 parent 9258ea1 commit c82cda3

File tree

6 files changed

+116
-97
lines changed

6 files changed

+116
-97
lines changed

BIBLIOGRAPHY.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ source code and documentation.
219219
- [dev/x86_64/src/pointwise_acc_l4.S](dev/x86_64/src/pointwise_acc_l4.S)
220220
- [dev/x86_64/src/pointwise_acc_l5.S](dev/x86_64/src/pointwise_acc_l5.S)
221221
- [dev/x86_64/src/pointwise_acc_l7.S](dev/x86_64/src/pointwise_acc_l7.S)
222-
- [dev/x86_64/src/poly_caddq_avx2.c](dev/x86_64/src/poly_caddq_avx2.c)
222+
- [dev/x86_64/src/poly_caddq_avx2.S](dev/x86_64/src/poly_caddq_avx2.S)
223223
- [dev/x86_64/src/poly_chknorm_avx2.c](dev/x86_64/src/poly_chknorm_avx2.c)
224224
- [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c)
225225
- [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
@@ -237,7 +237,7 @@ source code and documentation.
237237
- [mldsa/src/native/x86_64/src/pointwise_acc_l4.S](mldsa/src/native/x86_64/src/pointwise_acc_l4.S)
238238
- [mldsa/src/native/x86_64/src/pointwise_acc_l5.S](mldsa/src/native/x86_64/src/pointwise_acc_l5.S)
239239
- [mldsa/src/native/x86_64/src/pointwise_acc_l7.S](mldsa/src/native/x86_64/src/pointwise_acc_l7.S)
240-
- [mldsa/src/native/x86_64/src/poly_caddq_avx2.c](mldsa/src/native/x86_64/src/poly_caddq_avx2.c)
240+
- [mldsa/src/native/x86_64/src/poly_caddq_avx2.S](mldsa/src/native/x86_64/src/poly_caddq_avx2.S)
241241
- [mldsa/src/native/x86_64/src/poly_chknorm_avx2.c](mldsa/src/native/x86_64/src/poly_chknorm_avx2.c)
242242
- [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c)
243243
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,6 @@
1717
* AVX2 Dilithium implementation @[REF_AVX2].
1818
*/
1919

20-
#include "../../../common.h"
21-
22-
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23-
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
24-
25-
#include <immintrin.h>
26-
#include "arith_native_x86_64.h"
27-
#include "consts.h"
2820

2921
/*************************************************
3022
* Name: mld_poly_caddq_avx2
@@ -34,28 +26,41 @@
3426
*
3527
* Arguments: - int32_t *r: pointer to input/output polynomial
3628
**************************************************/
37-
void mld_poly_caddq_avx2(int32_t *r)
38-
{
39-
unsigned int i;
40-
__m256i f, g;
41-
const __m256i q = _mm256_set1_epi32(MLDSA_Q);
42-
const __m256i zero = _mm256_setzero_si256();
43-
__m256i *rr = (__m256i *)r;
44-
45-
for (i = 0; i < MLDSA_N / 8; i++)
46-
{
47-
f = _mm256_load_si256(&rr[i]);
48-
g = _mm256_cmpgt_epi32(zero, f);
49-
g = _mm256_and_si256(g, q);
50-
f = _mm256_add_epi32(f, g);
51-
_mm256_store_si256(&rr[i], f);
52-
}
53-
}
54-
55-
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
56-
*/
57-
58-
MLD_EMPTY_CU(avx2_reduce)
59-
60-
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
61-
!MLD_CONFIG_MULTILEVEL_NO_SHARED) */
29+
30+
#include "../../../common.h"
31+
32+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
33+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
34+
35+
/* simpasm: header-end */
36+
37+
.text
38+
.global MLD_ASM_NAMESPACE(poly_caddq_avx2)
39+
.balign 16
40+
MLD_ASM_FN_SYMBOL(poly_caddq_avx2)
41+
42+
movl $8380417, %eax
43+
leaq 1024(%rdi), %rdx
44+
vpxor %xmm3, %xmm3, %xmm3
45+
vmovd %eax, %xmm2
46+
vpbroadcastd %xmm2, %ymm2
47+
48+
poly_caddq_avx2_loop:
49+
vmovdqa (%rdi), %ymm1
50+
addq $32, %rdi
51+
vpcmpgtd %ymm1, %ymm3, %ymm0
52+
vpand %ymm2, %ymm0, %ymm0
53+
vpaddd %ymm1, %ymm0, %ymm0
54+
vmovdqa %ymm0, -32(%rdi)
55+
cmpq %rdi, %rdx
56+
jne poly_caddq_avx2_loop
57+
vzeroupper
58+
xorl %eax, %eax
59+
xorl %edx, %edx
60+
xorl %edi, %edi
61+
ret
62+
63+
/* simpasm: footer-start */
64+
65+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
66+
*/

mldsa/mldsa_native.S

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
#include "src/native/x86_64/src/pointwise_acc_l4.S"
8787
#include "src/native/x86_64/src/pointwise_acc_l5.S"
8888
#include "src/native/x86_64/src/pointwise_acc_l7.S"
89+
#include "src/native/x86_64/src/poly_caddq_avx2.S"
8990
#endif /* MLD_SYS_X86_64 */
9091
#endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
9192

mldsa/mldsa_native.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282
#endif /* MLD_SYS_AARCH64 */
8383
#if defined(MLD_SYS_X86_64)
8484
#include "src/native/x86_64/src/consts.c"
85-
#include "src/native/x86_64/src/poly_caddq_avx2.c"
8685
#include "src/native/x86_64/src/poly_chknorm_avx2.c"
8786
#include "src/native/x86_64/src/poly_decompose_32_avx2.c"
8887
#include "src/native/x86_64/src/poly_decompose_88_avx2.c"
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
/* References
7+
* ==========
8+
*
9+
* - [REF_AVX2]
10+
* CRYSTALS-Dilithium optimized AVX2 implementation
11+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
13+
*/
14+
15+
/*
16+
* This file is derived from the public domain
17+
* AVX2 Dilithium implementation @[REF_AVX2].
18+
*/
19+
20+
21+
/*************************************************
22+
* Name: mld_poly_caddq_avx2
23+
*
24+
* Description: For all coefficients of in/out polynomial add Q if
25+
* coefficient is negative.
26+
*
27+
* Arguments: - int32_t *r: pointer to input/output polynomial
28+
**************************************************/
29+
30+
#include "../../../common.h"
31+
32+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
33+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
34+
35+
36+
/*
37+
* WARNING: This file is auto-derived from the mldsa-native source file
38+
* dev/x86_64/src/poly_caddq_avx2.S using scripts/simpasm. Do not modify it directly.
39+
*/
40+
41+
#if defined(__ELF__)
42+
.section .note.GNU-stack,"",@progbits
43+
#endif
44+
45+
.text
46+
.balign 4
47+
.global MLD_ASM_NAMESPACE(poly_caddq_avx2)
48+
MLD_ASM_FN_SYMBOL(poly_caddq_avx2)
49+
50+
.cfi_startproc
51+
movl $0x7fe001, %eax # imm = 0x7FE001
52+
leaq 0x400(%rdi), %rdx
53+
vpxor %xmm3, %xmm3, %xmm3
54+
vmovd %eax, %xmm2
55+
vpbroadcastd %xmm2, %ymm2
56+
57+
Lpoly_caddq_avx2_loop:
58+
vmovdqa (%rdi), %ymm1
59+
addq $0x20, %rdi
60+
vpcmpgtd %ymm1, %ymm3, %ymm0
61+
vpand %ymm2, %ymm0, %ymm0
62+
vpaddd %ymm1, %ymm0, %ymm0
63+
vmovdqa %ymm0, -0x20(%rdi)
64+
cmpq %rdi, %rdx
65+
jne Lpoly_caddq_avx2_loop
66+
vzeroupper
67+
xorl %eax, %eax
68+
xorl %edx, %edx
69+
xorl %edi, %edi
70+
retq
71+
.cfi_endproc
72+
73+
74+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
75+
*/

mldsa/src/native/x86_64/src/poly_caddq_avx2.c

Lines changed: 0 additions & 61 deletions
This file was deleted.

0 commit comments

Comments
 (0)