Merge pull request #501 from pq-code-package/chknorm-asm

hanno-becker · web-flow · commit efe03f94735b · 2025-10-03T17:04:56.000+01:00
Add native implementation of poly_chknorm
diff --git a/.github/actions/ct-test/action.yml b/.github/actions/ct-test/action.yml
@@ -41,4 +41,9 @@ runs:
       - shell: ${{ env.SHELL }}
         run: |
           make clean
-          tests func --exec-wrapper="valgrind --error-exitcode=1 ${{ inputs.valgrind_flags }}" --cflags="-DMLD_CONFIG_CT_TESTING_ENABLED -DNTESTS=5 ${{ inputs.cflags }}"
+          # --vex-guest-max-insns=55 (default is 60) is a workaround for
+          # "VEX temporary storage exhausted" errors in the x86 backend (poly_chknorm).
+          # It may increase the run-time of the valgrind tests.
+          # TODO: Check with future versions of valgrind if this is still needed (both 3.24 and 3.25 fail without)
+          # TODO: Check if this is still needed once the poly_chknorm intrinsics implementation is replaced by assembly
+          tests func --exec-wrapper="valgrind --vex-guest-max-insns=55 --error-exitcode=1 ${{ inputs.valgrind_flags }}" --cflags="-DMLD_CONFIG_CT_TESTING_ENABLED -DNTESTS=5 ${{ inputs.cflags }}"
diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
@@ -149,6 +149,7 @@ source code and documentation.
   - [mldsa/native/x86_64/src/ntt.S](mldsa/native/x86_64/src/ntt.S)
   - [mldsa/native/x86_64/src/nttunpack.S](mldsa/native/x86_64/src/nttunpack.S)
   - [mldsa/native/x86_64/src/poly_caddq_avx2.c](mldsa/native/x86_64/src/poly_caddq_avx2.c)
+  - [mldsa/native/x86_64/src/poly_chknorm_avx2.c](mldsa/native/x86_64/src/poly_chknorm_avx2.c)
   - [mldsa/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/native/x86_64/src/poly_decompose_32_avx2.c)
   - [mldsa/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/native/x86_64/src/poly_decompose_88_avx2.c)
   - [mldsa/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/native/x86_64/src/poly_use_hint_32_avx2.c)
diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h
@@ -18,6 +18,7 @@
 #define MLD_USE_NATIVE_POLY_CADDQ
 #define MLD_USE_NATIVE_POLY_USE_HINT_32
 #define MLD_USE_NATIVE_POLY_USE_HINT_88
+#define MLD_USE_NATIVE_POLY_CHKNORM
 
 /* Identifier for this backend so that source and assembly files
  * in the build can be appropriately guarded. */
@@ -127,5 +128,10 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
   mld_poly_use_hint_88_asm(b, a, h);
 }
 
+static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
+{
+  return mld_poly_chknorm_asm(a, B); /* delegate to the assembly routine */
+}
+
 #endif /* !__ASSEMBLER__ */
 #endif /* !MLD_NATIVE_AARCH64_META_H */
diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h
@@ -79,4 +79,7 @@ void mld_poly_use_hint_32_asm(int32_t *b, const int32_t *a, const int32_t *h);
 #define mld_poly_use_hint_88_asm MLD_NAMESPACE(poly_use_hint_88_asm)
 void mld_poly_use_hint_88_asm(int32_t *b, const int32_t *a, const int32_t *h);
 
+#define mld_poly_chknorm_asm MLD_NAMESPACE(poly_chknorm_asm)
+uint32_t mld_poly_chknorm_asm(const int32_t *a, int32_t B);
+
 #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */
diff --git a/mldsa/native/aarch64/src/poly_chknorm_asm.S b/mldsa/native/aarch64/src/poly_chknorm_asm.S
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+.macro chknorm a
+        abs \a\().4s, \a\().4s                  // lane-wise absolute value
+        cmge \a\().4s, \a\().4s, bound.4s       // all-ones in lanes where |a| >= bound
+        orr flags.16b, flags.16b, \a\().16b     // accumulate violation masks into flags
+.endm
+
+        /* Parameters */
+        a_ptr           .req x0     // Input polynomial
+        B               .req w1     // Input norm bound
+
+        count           .req x2
+
+        /* Constant register assignments */
+        bound           .req v20
+        flags           .req v21
+
+.text
+.global MLD_ASM_NAMESPACE(poly_chknorm_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(poly_chknorm_asm)
+        // Load constants
+        dup bound.4s, B                 // broadcast B into all 4 lanes
+
+        movi flags.4s, 0                // no violation recorded yet
+
+        mov count, #(64/4)              // 16 iterations x 4 vectors x 4 lanes = 256 coefficients
+
+poly_chknorm_loop:
+        ldr q1, [a_ptr, #1*16]
+        ldr q2, [a_ptr, #2*16]
+        ldr q3, [a_ptr, #3*16]
+        ldr q0, [a_ptr], #4*16          // post-indexed load: advances a_ptr by 64 bytes
+
+        chknorm v1
+        chknorm v2
+        chknorm v3
+        chknorm v0
+
+        subs count, count, #1
+        bne poly_chknorm_loop
+
+        // Return 0xffffffff if any of the 4 lanes is 0xffffffff, and 0 otherwise
+        umaxv s21, flags.4s             // unsigned max across lanes, written to the low lane of v21 (flags)
+        fmov w0, s21
+
+        ret
+
+        .unreq a_ptr
+        .unreq B
+        .unreq count
+        .unreq bound
+        .unreq flags
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mldsa/native/api.h b/mldsa/native/api.h
@@ -254,4 +254,20 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
                                                    const int32_t *h);
 #endif /* MLD_USE_NATIVE_POLY_USE_HINT_88 */
 
+#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
+/*************************************************
+ * Name:        mld_poly_chknorm_native
+ *
+ * Description: Check infinity norm of polynomial against given bound.
+ *              Assumes input coefficients were reduced by mld_reduce32().
+ *
+ * Arguments:   - const int32_t *a: pointer to polynomial
+ *              - int32_t B: norm bound
+ *
+ * Returns 0 if the infinity norm is strictly smaller than B, and 0xFFFFFFFF
+ * otherwise. B must not be larger than MLDSA_Q - REDUCE32_RANGE_MAX.
+ **************************************************/
+static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B);
+#endif /* MLD_USE_NATIVE_POLY_CHKNORM */
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
@@ -22,6 +22,7 @@
 #define MLD_USE_NATIVE_POLY_CADDQ
 #define MLD_USE_NATIVE_POLY_USE_HINT_32
 #define MLD_USE_NATIVE_POLY_USE_HINT_88
+#define MLD_USE_NATIVE_POLY_CHKNORM
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -133,6 +134,11 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
                             (const __m256i *)h);
 }
 
+static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
+{
+  return mld_poly_chknorm_avx2((const __m256i *)a, B); /* a: 32-byte aligned */
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -69,4 +69,7 @@ void mld_poly_use_hint_32_avx2(__m256i *b, const __m256i *a, const __m256i *h);
 #define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2)
 void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h);
 
+#define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2)
+uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/poly_chknorm_avx2.c b/mldsa/native/x86_64/src/poly_chknorm_avx2.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <immintrin.h>
+#include <stdint.h>
+#include "arith_native_x86_64.h"
+
+uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B)
+{
+  unsigned int i;
+  __m256i f, t;
+  const __m256i bound = _mm256_set1_epi32(B - 1); /* |x| > B-1 <=> |x| >= B */
+
+  t = _mm256_setzero_si256(); /* OR-accumulator of per-lane violation masks */
+  for (i = 0; i < MLDSA_N / 8; i++) /* 8 coefficients per 256-bit vector */
+  {
+    f = _mm256_load_si256(&a[i]); /* aligned load: a must be 32-byte aligned */
+    f = _mm256_abs_epi32(f);
+    f = _mm256_cmpgt_epi32(f, bound); /* all-ones in lanes where |x| >= B */
+    t = _mm256_or_si256(t, f);
+  }
+
+  return _mm256_testz_si256(t, t) - 1; /* t == 0 -> 0, otherwise 0xFFFFFFFF */
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(avx2_poly_chknorm)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/poly.c b/mldsa/poly.c
@@ -320,10 +320,17 @@ void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
  * that it is okay to leak which coefficient violates the bound (while the
  * coefficient itself must remain secret).
  * We instead perform everything in constant-time.
+ * Also, it is sufficient to require that B is at most
+ * MLDSA_Q - REDUCE32_RANGE_MAX (which is larger than (MLDSA_Q - 1) / 8).
  */
 MLD_INTERNAL_API
 uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
 {
+#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
+  /* TODO: proof */
+  mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
+  return mld_poly_chknorm_native(a->coeffs, B);
+#else
   unsigned int i;
   uint32_t t = 0;
   mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
@@ -336,6 +343,17 @@ uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
     invariant((t == 0) == array_abs_bound(a->coeffs, 0, i, B))
   )
   {
+    /*
+     * Since we know that -REDUCE32_RANGE_MAX <= a < REDUCE32_RANGE_MAX,
+     * and B <= MLDSA_Q - REDUCE32_RANGE_MAX, to check if
+     * -B < (a mod± MLDSA_Q) < B, it suffices to check if -B < a < B.
+     *
+     * We prove this to be true using the following CBMC assertions.
+     * a ==> b expressed as !a || b to also allow run-time assertion.
+     */
+    mld_assert(a->coeffs[i] < B || a->coeffs[i] - MLDSA_Q <= -B);
+    mld_assert(a->coeffs[i] > -B || a->coeffs[i] + MLDSA_Q >= B);
+
     /* Reference: Leaks which coefficient violates the bound via a conditional.
      * We are more conservative to reduce the number of declassifications in
      * constant-time testing.
@@ -346,6 +364,7 @@ uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
   }
 
   return t;
+#endif /* !MLD_USE_NATIVE_POLY_CHKNORM */
 }
 
 /*************************************************
diff --git a/mldsa/poly.h b/mldsa/poly.h
@@ -307,14 +307,23 @@ __contract__(
  * Arguments:   - const mld_poly *a: pointer to polynomial
  *              - int32_t B: norm bound
  *
- * Returns 0 if norm is strictly smaller than B <= (MLDSA_Q-1)/8 and 0xFFFFFFFF
- * otherwise.
+ * Returns 0 if norm is strictly smaller than
+ * B <= (MLDSA_Q - REDUCE32_RANGE_MAX) and 0xFFFFFFFF otherwise.
+ *
+ * Specification: The definition of this function in FIPS-204 requires signed
+ *                canonical reduction prior to applying the bounds check.
+ *                However, `-B < (a mod± MLDSA_Q) < B` is equivalent to
+ *                `-B < a < B` under the assumption that
+ *                `B <= MLDSA_Q - REDUCE32_RANGE_MAX` (cf. the assertion in
+ *                the code). Hence, the present spec and implementation are
+ *                correct without reduction.
+ *
  **************************************************/
 MLD_INTERNAL_API
 uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
 __contract__(
   requires(memory_no_alias(a, sizeof(mld_poly)))
-  requires(0 <= B && B <= (MLDSA_Q - 1) / 8)
+  requires(0 <= B && B <= MLDSA_Q - REDUCE32_RANGE_MAX)
   requires(array_bound(a->coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX))
   ensures(return_value == 0 || return_value == 0xFFFFFFFF)
   ensures((return_value == 0) == array_abs_bound(a->coeffs, 0, MLDSA_N, B))
diff --git a/proofs/cbmc/poly_pointwise_montgomery/Makefile b/proofs/cbmc/poly_pointwise_montgomery/Makefile
@@ -37,7 +37,7 @@ FUNCTION_NAME = poly_pointwise_montgomery
 # EXPENSIVE = true
 
 # This function is large enough to need...
-CBMC_OBJECT_BITS = 8
+CBMC_OBJECT_BITS = 10
 
 # If you require access to a file-local ("static") function or object to conduct
 # your proof, set the following (and do not include the original source file