Skip to content

Commit efe03f9

Browse files
authored
Merge pull request #501 from pq-code-package/chknorm-asm
Add native implementation of poly_chknorm
2 parents 5fc3ec8 + c160cd9 commit efe03f9

File tree

12 files changed

+188
-5
lines changed

12 files changed

+188
-5
lines changed

.github/actions/ct-test/action.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,9 @@ runs:
4141
- shell: ${{ env.SHELL }}
4242
run: |
4343
make clean
44-
tests func --exec-wrapper="valgrind --error-exitcode=1 ${{ inputs.valgrind_flags }}" --cflags="-DMLD_CONFIG_CT_TESTING_ENABLED -DNTESTS=5 ${{ inputs.cflags }}"
44+
# --vex-guest-max-insns=55 (default is 60) is a workaround for
45+
# "VEX temporary storage exhausted" errors in the x86 backend (poly_chknorm)
46+
# It may increase run-time of the valgrind tests.
47+
# TODO: Check with future versions of valgrind if this is still needed (both 3.24 and 3.25 fail without)
48+
# TODO: Check if this is still needed once the poly_chknorm intrinsics implementation is replaced by assembly
49+
tests func --exec-wrapper="valgrind --vex-guest-max-insns=55 --error-exitcode=1 ${{ inputs.valgrind_flags }}" --cflags="-DMLD_CONFIG_CT_TESTING_ENABLED -DNTESTS=5 ${{ inputs.cflags }}"

BIBLIOGRAPHY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ source code and documentation.
149149
- [mldsa/native/x86_64/src/ntt.S](mldsa/native/x86_64/src/ntt.S)
150150
- [mldsa/native/x86_64/src/nttunpack.S](mldsa/native/x86_64/src/nttunpack.S)
151151
- [mldsa/native/x86_64/src/poly_caddq_avx2.c](mldsa/native/x86_64/src/poly_caddq_avx2.c)
152+
- [mldsa/native/x86_64/src/poly_chknorm_avx2.c](mldsa/native/x86_64/src/poly_chknorm_avx2.c)
152153
- [mldsa/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/native/x86_64/src/poly_decompose_32_avx2.c)
153154
- [mldsa/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/native/x86_64/src/poly_decompose_88_avx2.c)
154155
- [mldsa/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/native/x86_64/src/poly_use_hint_32_avx2.c)

mldsa/native/aarch64/meta.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#define MLD_USE_NATIVE_POLY_CADDQ
1919
#define MLD_USE_NATIVE_POLY_USE_HINT_32
2020
#define MLD_USE_NATIVE_POLY_USE_HINT_88
21+
#define MLD_USE_NATIVE_POLY_CHKNORM
2122

2223
/* Identifier for this backend so that source and assembly files
2324
* in the build can be appropriately guarded. */
@@ -127,5 +128,10 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
127128
mld_poly_use_hint_88_asm(b, a, h);
128129
}
129130

131+
static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
132+
{
133+
return mld_poly_chknorm_asm(a, B);
134+
}
135+
130136
#endif /* !__ASSEMBLER__ */
131137
#endif /* !MLD_NATIVE_AARCH64_META_H */

mldsa/native/aarch64/src/arith_native_aarch64.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,7 @@ void mld_poly_use_hint_32_asm(int32_t *b, const int32_t *a, const int32_t *h);
7979
#define mld_poly_use_hint_88_asm MLD_NAMESPACE(poly_use_hint_88_asm)
8080
void mld_poly_use_hint_88_asm(int32_t *b, const int32_t *a, const int32_t *h);
8181

82+
#define mld_poly_chknorm_asm MLD_NAMESPACE(poly_chknorm_asm)
83+
uint32_t mld_poly_chknorm_asm(const int32_t *a, int32_t B);
84+
8285
#endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
#include "../../../common.h"
6+
7+
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
8+
9+
.macro chknorm a
10+
abs \a\().4s, \a\().4s
11+
cmge \a\().4s, \a\().4s, bound.4s
12+
orr flags.16b, flags.16b, \a\().16b
13+
.endm
14+
15+
/* Parameters */
16+
a_ptr .req x0 // Input polynomial
17+
B .req w1 // Input norm bound
18+
19+
count .req x2
20+
21+
/* Constant register assignments */
22+
bound .req v20
23+
flags .req v21
24+
25+
.text
26+
.global MLD_ASM_NAMESPACE(poly_chknorm_asm)
27+
.balign 4
28+
MLD_ASM_FN_SYMBOL(poly_chknorm_asm)
29+
// Load constants
30+
dup bound.4s, B
31+
32+
movi flags.4s, 0
33+
34+
mov count, #(64/4)
35+
36+
poly_chknorm_loop:
37+
ldr q1, [a_ptr, #1*16]
38+
ldr q2, [a_ptr, #2*16]
39+
ldr q3, [a_ptr, #3*16]
40+
ldr q0, [a_ptr], #4*16
41+
42+
chknorm v1
43+
chknorm v2
44+
chknorm v3
45+
chknorm v0
46+
47+
subs count, count, #1
48+
bne poly_chknorm_loop
49+
50+
// Return 0xffffffff if any of the 4 lanes is 0xffffffff
51+
umaxv s21, flags.4s
52+
fmov w0, s21
53+
54+
ret
55+
56+
.unreq a_ptr
57+
.unreq B
58+
.unreq count
59+
.unreq bound
60+
.unreq flags
61+
62+
#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */

mldsa/native/api.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,4 +254,20 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
254254
const int32_t *h);
255255
#endif /* MLD_USE_NATIVE_POLY_USE_HINT_88 */
256256

257+
#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
258+
/*************************************************
259+
* Name: mld_poly_chknorm_native
260+
*
261+
* Description: Check infinity norm of polynomial against given bound.
262+
* Assumes input coefficients were reduced by mld_reduce32().
263+
*
264+
* Arguments: - const int32_t *a: pointer to polynomial
265+
* - int32_t B: norm bound
266+
*
267+
* Returns 0 if the infinity norm is strictly smaller than B, and 0xFFFFFFFF
268+
* otherwise. B must not be larger than MLDSA_Q - REDUCE32_RANGE_MAX.
269+
**************************************************/
270+
static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B);
271+
#endif /* MLD_USE_NATIVE_POLY_CHKNORM */
272+
257273
#endif /* !MLD_NATIVE_API_H */

mldsa/native/x86_64/meta.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#define MLD_USE_NATIVE_POLY_CADDQ
2323
#define MLD_USE_NATIVE_POLY_USE_HINT_32
2424
#define MLD_USE_NATIVE_POLY_USE_HINT_88
25+
#define MLD_USE_NATIVE_POLY_CHKNORM
2526

2627
#if !defined(__ASSEMBLER__)
2728
#include <string.h>
@@ -133,6 +134,11 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
133134
(const __m256i *)h);
134135
}
135136

137+
static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
138+
{
139+
return mld_poly_chknorm_avx2((const __m256i *)a, B);
140+
}
141+
136142
#endif /* !__ASSEMBLER__ */
137143

138144
#endif /* !MLD_NATIVE_X86_64_META_H */

mldsa/native/x86_64/src/arith_native_x86_64.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,7 @@ void mld_poly_use_hint_32_avx2(__m256i *b, const __m256i *a, const __m256i *h);
6969
#define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2)
7070
void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h);
7171

72+
#define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2)
73+
uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B);
74+
7275
#endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
/* References
7+
* ==========
8+
*
9+
* - [REF_AVX2]
10+
* CRYSTALS-Dilithium optimized AVX2 implementation
11+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
13+
*/
14+
15+
/*
16+
* This file is derived from the public domain
17+
* AVX2 Dilithium implementation @[REF_AVX2].
18+
*/
19+
20+
#include "../../../common.h"
21+
22+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
24+
25+
#include <immintrin.h>
26+
#include <stdint.h>
27+
#include "arith_native_x86_64.h"
28+
29+
uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B)
30+
{
31+
unsigned int i;
32+
__m256i f, t;
33+
const __m256i bound = _mm256_set1_epi32(B - 1);
34+
35+
t = _mm256_setzero_si256();
36+
for (i = 0; i < MLDSA_N / 8; i++)
37+
{
38+
f = _mm256_load_si256(&a[i]);
39+
f = _mm256_abs_epi32(f);
40+
f = _mm256_cmpgt_epi32(f, bound);
41+
t = _mm256_or_si256(t, f);
42+
}
43+
44+
return _mm256_testz_si256(t, t) - 1;
45+
}
46+
47+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
48+
*/
49+
50+
MLD_EMPTY_CU(avx2_poly_chknorm)
51+
52+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
53+
!MLD_CONFIG_MULTILEVEL_NO_SHARED) */

mldsa/poly.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,10 +320,17 @@ void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
320320
* that it is okay to leak which coefficient violates the bound (while the
321321
* coefficient itself must remain secret).
322322
* We instead perform everything in constant-time.
323+
* Also it is sufficient to check that it is smaller than
324+
* MLDSA_Q - REDUCE32_RANGE_MAX > (MLDSA_Q - 1) / 8).
323325
*/
324326
MLD_INTERNAL_API
325327
uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
326328
{
329+
#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
330+
/* TODO: proof */
331+
mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
332+
return mld_poly_chknorm_native(a->coeffs, B);
333+
#else
327334
unsigned int i;
328335
uint32_t t = 0;
329336
mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
@@ -336,6 +343,17 @@ uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
336343
invariant((t == 0) == array_abs_bound(a->coeffs, 0, i, B))
337344
)
338345
{
346+
/*
347+
* Since we know that -REDUCE32_RANGE_MAX <= a < REDUCE32_RANGE_MAX,
348+
* and B <= MLDSA_Q - REDUCE32_RANGE_MAX, to check if
349+
* -B < (a mod± MLDSA_Q) < B, it suffices to check if -B < a < B.
350+
*
351+
* We prove this to be true using the following CBMC assertions.
352+
* a ==> b expressed as !a || b to also allow run-time assertion.
353+
*/
354+
mld_assert(a->coeffs[i] < B || a->coeffs[i] - MLDSA_Q <= -B);
355+
mld_assert(a->coeffs[i] > -B || a->coeffs[i] + MLDSA_Q >= B);
356+
339357
/* Reference: Leaks which coefficient violates the bound via a conditional.
340358
* We are more conservative to reduce the number of declassifications in
341359
* constant-time testing.
@@ -346,6 +364,7 @@ uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
346364
}
347365

348366
return t;
367+
#endif /* !MLD_USE_NATIVE_POLY_CHKNORM */
349368
}
350369

351370
/*************************************************

0 commit comments

Comments
 (0)