Skip to content

Commit bf8a4e4

Browse files
willieyzmkannwischer
authored andcommitted
refactor:remove intrinsics.h dependency
- This commit add with 12 files modified, including: - Removing __m256i types, immintrin.h includes, and simplifying type definitions to use int32_t. - Change the prototype of dev/x86/*.c files from __m256i to int32_t, also cast all in32_t usage to __m256i of dev/x86/*.c files. - To aply above changes, we need use [8 * i] instead of i in each loop. Signed-off-by: willieyz <[email protected]>
1 parent 64ae351 commit bf8a4e4

29 files changed

+126
-219
lines changed

BIBLIOGRAPHY.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,6 @@ source code and documentation.
206206
- Damien Stehlé
207207
* URL: https://github.com/pq-crystals/dilithium/tree/master/avx2
208208
* Referenced from:
209-
- [dev/x86_64/src/align.h](dev/x86_64/src/align.h)
210209
- [dev/x86_64/src/consts.c](dev/x86_64/src/consts.c)
211210
- [dev/x86_64/src/consts.h](dev/x86_64/src/consts.h)
212211
- [dev/x86_64/src/intt.S](dev/x86_64/src/intt.S)
@@ -227,7 +226,6 @@ source code and documentation.
227226
- [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
228227
- [dev/x86_64/src/rej_uniform_eta2_avx2.c](dev/x86_64/src/rej_uniform_eta2_avx2.c)
229228
- [dev/x86_64/src/rej_uniform_eta4_avx2.c](dev/x86_64/src/rej_uniform_eta4_avx2.c)
230-
- [mldsa/src/native/x86_64/src/align.h](mldsa/src/native/x86_64/src/align.h)
231229
- [mldsa/src/native/x86_64/src/consts.c](mldsa/src/native/x86_64/src/consts.c)
232230
- [mldsa/src/native/x86_64/src/consts.h](mldsa/src/native/x86_64/src/consts.h)
233231
- [mldsa/src/native/x86_64/src/intt.S](mldsa/src/native/x86_64/src/intt.S)

dev/x86_64/meta.h

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N])
4040
{
4141
if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
4242
{
43-
mld_nttunpack_avx2((__m256i *)(data));
43+
mld_nttunpack_avx2(data);
4444
}
4545
}
4646

@@ -51,7 +51,7 @@ static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
5151
return MLD_NATIVE_FUNC_FALLBACK;
5252
}
5353

54-
mld_ntt_avx2((__m256i *)data, mld_qdata.vec);
54+
mld_ntt_avx2(data, mld_qdata);
5555
return MLD_NATIVE_FUNC_SUCCESS;
5656
}
5757
static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
@@ -60,7 +60,7 @@ static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
6060
{
6161
return MLD_NATIVE_FUNC_FALLBACK;
6262
}
63-
mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
63+
mld_invntt_avx2(data, mld_qdata);
6464
return MLD_NATIVE_FUNC_SUCCESS;
6565
}
6666

@@ -137,7 +137,7 @@ static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)
137137
{
138138
return MLD_NATIVE_FUNC_FALLBACK;
139139
}
140-
mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0);
140+
mld_poly_decompose_32_avx2(a1, a0);
141141
return MLD_NATIVE_FUNC_SUCCESS;
142142
}
143143

@@ -147,7 +147,7 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)
147147
{
148148
return MLD_NATIVE_FUNC_FALLBACK;
149149
}
150-
mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0);
150+
mld_poly_decompose_88_avx2(a1, a0);
151151
return MLD_NATIVE_FUNC_SUCCESS;
152152
}
153153

@@ -167,8 +167,7 @@ static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
167167
{
168168
return MLD_NATIVE_FUNC_FALLBACK;
169169
}
170-
mld_poly_use_hint_32_avx2((__m256i *)b, (const __m256i *)a,
171-
(const __m256i *)h);
170+
mld_poly_use_hint_32_avx2(b, a, h);
172171
return MLD_NATIVE_FUNC_SUCCESS;
173172
}
174173

@@ -179,8 +178,7 @@ static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
179178
{
180179
return MLD_NATIVE_FUNC_FALLBACK;
181180
}
182-
mld_poly_use_hint_88_avx2((__m256i *)b, (const __m256i *)a,
183-
(const __m256i *)h);
181+
mld_poly_use_hint_88_avx2(b, a, h);
184182
return MLD_NATIVE_FUNC_SUCCESS;
185183
}
186184

@@ -190,7 +188,7 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
190188
{
191189
return MLD_NATIVE_FUNC_FALLBACK;
192190
}
193-
return mld_poly_chknorm_avx2((const __m256i *)a, B);
191+
return mld_poly_chknorm_avx2(a, B);
194192
}
195193

196194
static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
@@ -199,7 +197,7 @@ static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
199197
{
200198
return MLD_NATIVE_FUNC_FALLBACK;
201199
}
202-
mld_polyz_unpack_17_avx2((__m256i *)r, a);
200+
mld_polyz_unpack_17_avx2(r, a);
203201
return MLD_NATIVE_FUNC_SUCCESS;
204202
}
205203

@@ -209,7 +207,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
209207
{
210208
return MLD_NATIVE_FUNC_FALLBACK;
211209
}
212-
mld_polyz_unpack_19_avx2((__m256i *)r, a);
210+
mld_polyz_unpack_19_avx2(r, a);
213211
return MLD_NATIVE_FUNC_SUCCESS;
214212
}
215213

@@ -220,8 +218,7 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
220218
{
221219
return MLD_NATIVE_FUNC_FALLBACK;
222220
}
223-
mld_pointwise_avx2((__m256i *)c, (const __m256i *)a, (const __m256i *)b,
224-
mld_qdata.vec);
221+
mld_pointwise_avx2(c, a, b, mld_qdata);
225222
return MLD_NATIVE_FUNC_SUCCESS;
226223
}
227224

@@ -233,8 +230,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
233230
{
234231
return MLD_NATIVE_FUNC_FALLBACK;
235232
}
236-
mld_pointwise_acc_l4_avx2((__m256i *)w, (const __m256i *)u,
237-
(const __m256i *)v, mld_qdata.vec);
233+
mld_pointwise_acc_l4_avx2(w, u, v, mld_qdata);
238234
return MLD_NATIVE_FUNC_SUCCESS;
239235
}
240236

@@ -246,8 +242,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
246242
{
247243
return MLD_NATIVE_FUNC_FALLBACK;
248244
}
249-
mld_pointwise_acc_l5_avx2((__m256i *)w, (const __m256i *)u,
250-
(const __m256i *)v, mld_qdata.vec);
245+
mld_pointwise_acc_l5_avx2(w, u, v, mld_qdata);
251246
return MLD_NATIVE_FUNC_SUCCESS;
252247
}
253248

@@ -259,8 +254,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
259254
{
260255
return MLD_NATIVE_FUNC_FALLBACK;
261256
}
262-
mld_pointwise_acc_l7_avx2((__m256i *)w, (const __m256i *)u,
263-
(const __m256i *)v, mld_qdata.vec);
257+
mld_pointwise_acc_l7_avx2(w, u, v, mld_qdata);
264258
return MLD_NATIVE_FUNC_SUCCESS;
265259
}
266260

dev/x86_64/src/align.h

Lines changed: 0 additions & 34 deletions
This file was deleted.

dev/x86_64/src/arith_native_x86_64.h

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#define MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
88
#include "../../../common.h"
99

10-
#include <immintrin.h>
1110
#include <stdint.h>
1211
#include "consts.h"
1312

@@ -34,13 +33,13 @@
3433
extern const uint8_t mld_rej_uniform_table[256][8];
3534

3635
#define mld_ntt_avx2 MLD_NAMESPACE(ntt_avx2)
37-
void mld_ntt_avx2(__m256i *r, const __m256i *mld_qdata);
36+
void mld_ntt_avx2(int32_t *r, const int32_t *mld_qdata);
3837

3938
#define mld_invntt_avx2 MLD_NAMESPACE(invntt_avx2)
40-
void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
39+
void mld_invntt_avx2(int32_t *r, const int32_t *mld_qdata);
4140

4241
#define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
43-
void mld_nttunpack_avx2(__m256i *r);
42+
void mld_nttunpack_avx2(int32_t *r);
4443

4544
#define mld_rej_uniform_avx2 MLD_NAMESPACE(mld_rej_uniform_avx2)
4645
unsigned mld_rej_uniform_avx2(int32_t *r,
@@ -55,43 +54,46 @@ unsigned mld_rej_uniform_eta4_avx2(
5554
int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]);
5655

5756
#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2)
58-
void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0);
57+
void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0);
5958

6059
#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2)
61-
void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0);
60+
void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0);
6261

6362
#define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2)
6463
void mld_poly_caddq_avx2(int32_t *r);
6564

6665
#define mld_poly_use_hint_32_avx2 MLD_NAMESPACE(mld_poly_use_hint_32_avx2)
67-
void mld_poly_use_hint_32_avx2(__m256i *b, const __m256i *a, const __m256i *h);
66+
void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a, const int32_t *h);
6867

6968
#define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2)
70-
void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h);
69+
void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a, const int32_t *h);
7170

7271
#define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2)
73-
int mld_poly_chknorm_avx2(const __m256i *a, int32_t B);
72+
int mld_poly_chknorm_avx2(const int32_t *a, int32_t B);
7473

7574
#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2)
76-
void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a);
75+
void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
7776

7877
#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
79-
void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a);
78+
void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
8079

8180
#define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
82-
void mld_pointwise_avx2(__m256i *c, const __m256i *a, const __m256i *b,
83-
const __m256i *qdata);
81+
void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
82+
const int32_t *qdata);
8483

8584
#define mld_pointwise_acc_l4_avx2 MLD_NAMESPACE(pointwise_acc_l4_avx2)
86-
void mld_pointwise_acc_l4_avx2(__m256i *c, const __m256i *a, const __m256i *b,
87-
const __m256i *qdata);
85+
void mld_pointwise_acc_l4_avx2(int32_t c[MLDSA_N], const int32_t a[4][MLDSA_N],
86+
const int32_t b[4][MLDSA_N],
87+
const int32_t *qdata);
8888

8989
#define mld_pointwise_acc_l5_avx2 MLD_NAMESPACE(pointwise_acc_l5_avx2)
90-
void mld_pointwise_acc_l5_avx2(__m256i *c, const __m256i *a, const __m256i *b,
91-
const __m256i *qdata);
90+
void mld_pointwise_acc_l5_avx2(int32_t c[MLDSA_N], const int32_t a[5][MLDSA_N],
91+
const int32_t b[5][MLDSA_N],
92+
const int32_t *qdata);
9293

9394
#define mld_pointwise_acc_l7_avx2 MLD_NAMESPACE(pointwise_acc_l7_avx2)
94-
void mld_pointwise_acc_l7_avx2(__m256i *c, const __m256i *a, const __m256i *b,
95-
const __m256i *qdata);
95+
void mld_pointwise_acc_l7_avx2(int32_t c[MLDSA_N], const int32_t a[7][MLDSA_N],
96+
const int32_t b[7][MLDSA_N],
97+
const int32_t *qdata);
9698

9799
#endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */

dev/x86_64/src/consts.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
2323
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
2424

25-
#include "align.h"
2625
#include "consts.h"
2726
#define MLD_AVX2_Q MLDSA_Q
2827
/* check-magic: 58728449 == pow(MLDSA_Q,-1,2^32) */
@@ -32,7 +31,7 @@
3231
/* check-magic: -8395782 == signed_mod(MLD_AVX2_QINV*MLD_AVX2_DIV,2^32) */
3332
#define MLD_AVX2_DIV_QINV -8395782
3433

35-
const qdata_t mld_qdata = {{
34+
MLD_ALIGN const int32_t mld_qdata[624] = {
3635
#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0
3736
MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q,
3837
MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q, MLD_AVX2_Q,
@@ -53,7 +52,7 @@ const qdata_t mld_qdata = {{
5352
#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328
5453
#include "x86_64_zetas.i"
5554

56-
}};
55+
};
5756

5857
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
5958
*/

dev/x86_64/src/consts.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,8 @@
3030

3131

3232
#ifndef __ASSEMBLER__
33-
#include "align.h"
34-
typedef MLD_ALIGNED_INT32(624) qdata_t;
3533
#define mld_qdata MLD_NAMESPACE(qdata)
36-
extern const qdata_t mld_qdata;
37-
#endif /* !__ASSEMBLER__ */
34+
extern const int32_t mld_qdata[624];
35+
#endif
3836

3937
#endif /* !MLD_NATIVE_X86_64_SRC_CONSTS_H */

dev/x86_64/src/poly_chknorm_avx2.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#include <stdint.h>
2727
#include "arith_native_x86_64.h"
2828

29-
int mld_poly_chknorm_avx2(const __m256i *a, int32_t B)
29+
int mld_poly_chknorm_avx2(const int32_t *a, int32_t B)
3030
{
3131
unsigned int i;
3232
__m256i f, t;
@@ -35,7 +35,7 @@ int mld_poly_chknorm_avx2(const __m256i *a, int32_t B)
3535
t = _mm256_setzero_si256();
3636
for (i = 0; i < MLDSA_N / 8; i++)
3737
{
38-
f = _mm256_load_si256(&a[i]);
38+
f = _mm256_load_si256((const __m256i *)&a[8 * i]);
3939
f = _mm256_abs_epi32(f);
4040
f = _mm256_cmpgt_epi32(f, bound);
4141
t = _mm256_or_si256(t, f);

dev/x86_64/src/poly_decompose_32_avx2.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
* separate argument that may be aliased with either of the outputs.
3838
* Removing the aliasing eases CBMC proofs.
3939
*/
40-
void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
40+
void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0)
4141
{
4242
unsigned int i;
4343
__m256i f, f0, f1, t;
@@ -50,7 +50,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
5050

5151
for (i = 0; i < MLDSA_N / 8; i++)
5252
{
53-
f = _mm256_load_si256(&a0[i]);
53+
f = _mm256_load_si256((__m256i *)&a0[8 * i]);
5454

5555
/* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */
5656
/*
@@ -136,8 +136,8 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
136136
f0 = _mm256_add_epi32(f0, t);
137137
/* range: 0 <= f1 <= 15, -GAMMA2 <= f0 <= GAMMA2 */
138138

139-
_mm256_store_si256(&a1[i], f1);
140-
_mm256_store_si256(&a0[i], f0);
139+
_mm256_store_si256((__m256i *)&a1[8 * i], f1);
140+
_mm256_store_si256((__m256i *)&a0[8 * i], f0);
141141
}
142142
}
143143

dev/x86_64/src/poly_decompose_88_avx2.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
* Removing the aliasing eases CBMC proofs.
3939
*/
4040

41-
void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
41+
void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0)
4242
{
4343
unsigned int i;
4444
__m256i f, f0, f1, t;
@@ -51,7 +51,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
5151

5252
for (i = 0; i < MLDSA_N / 8; i++)
5353
{
54-
f = _mm256_load_si256(&a0[i]);
54+
f = _mm256_load_si256((__m256i *)&a0[8 * i]);
5555

5656
/* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */
5757
/*
@@ -137,8 +137,8 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
137137
f0 = _mm256_add_epi32(f0, t);
138138
/* range: 0 <= f1 <= 43, -GAMMA2 <= f0 <= GAMMA2 */
139139

140-
_mm256_store_si256(&a1[i], f1);
141-
_mm256_store_si256(&a0[i], f0);
140+
_mm256_store_si256((__m256i *)&a1[8 * i], f1);
141+
_mm256_store_si256((__m256i *)&a0[8 * i], f0);
142142
}
143143
}
144144
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \

0 commit comments

Comments
 (0)