AVX2: Add native implementation of polyz_unpack

jammychiou1 · jammychiou1 · commit f2df58e1aa4d · 2025-10-09T12:40:33.000+08:00
This adds the AVX2 intrinsics implementation of polyz_unpack from https://github.com/pq-crystals/dilithium/blob/master/avx2/poly.c. Signed-off-by: jammychiou1 <jammy.chiou1@gmail.com>
diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
@@ -156,6 +156,8 @@ source code and documentation.
   - [mldsa/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/native/x86_64/src/poly_decompose_88_avx2.c)
   - [mldsa/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/native/x86_64/src/poly_use_hint_32_avx2.c)
   - [mldsa/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/native/x86_64/src/poly_use_hint_88_avx2.c)
+  - [mldsa/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/native/x86_64/src/polyz_unpack_17_avx2.c)
+  - [mldsa/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/native/x86_64/src/polyz_unpack_19_avx2.c)
   - [mldsa/native/x86_64/src/rej_uniform_avx2.c](mldsa/native/x86_64/src/rej_uniform_avx2.c)
   - [mldsa/native/x86_64/src/rej_uniform_eta2_avx2.c](mldsa/native/x86_64/src/rej_uniform_eta2_avx2.c)
   - [mldsa/native/x86_64/src/rej_uniform_eta4_avx2.c](mldsa/native/x86_64/src/rej_uniform_eta4_avx2.c)
diff --git a/mldsa/native/api.h b/mldsa/native/api.h
@@ -270,4 +270,32 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
 static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B);
 #endif /* MLD_USE_NATIVE_POLY_CHKNORM */
 
+#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_17)
+/*************************************************
+ * Name:        mld_polyz_unpack_17_native
+ *
+ * Description: Native implementation of polyz_unpack for GAMMA1 = 2^17.
+ *              Unpack polynomial z with coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - int32_t *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a);
+#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_17 */
+
+#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_19)
+/*************************************************
+ * Name:        mld_polyz_unpack_19_native
+ *
+ * Description: Native implementation of polyz_unpack for GAMMA1 = 2^19.
+ *              Unpack polynomial z with coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - int32_t *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a);
+#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
@@ -23,6 +23,8 @@
 #define MLD_USE_NATIVE_POLY_USE_HINT_32
 #define MLD_USE_NATIVE_POLY_USE_HINT_88
 #define MLD_USE_NATIVE_POLY_CHKNORM
+#define MLD_USE_NATIVE_POLYZ_UNPACK_17
+#define MLD_USE_NATIVE_POLYZ_UNPACK_19
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -139,6 +141,16 @@ static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_avx2((const __m256i *)a, B);
 }
 
+static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
+{
+  mld_polyz_unpack_17_avx2((__m256i *)r, a);
+}
+
+static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
+{
+  mld_polyz_unpack_19_avx2((__m256i *)r, a);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -72,4 +72,10 @@ void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h);
 #define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2)
 uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B);
 
+#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2)
+void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a);
+
+#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
+void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/polyz_unpack_17_avx2.c b/mldsa/native/x86_64/src/polyz_unpack_17_avx2.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <immintrin.h>
+#include <stdint.h>
+#include "arith_native_x86_64.h"
+
+/* Unpack MLDSA_N coefficients, each stored in 18 bits, from the byte array
+ * a into r, mapping each packed value t to MLDSA_GAMMA1 - t.
+ *
+ * r is written with aligned 32-byte stores and must be 32-byte aligned.
+ * a is read only within its first (MLDSA_N / 8) * 18 bytes. */
+void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a)
+{
+  unsigned int i;
+  __m256i f;
+  const __m256i shufbidx =
+      _mm256_set_epi8(-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1, -1, 8,
+                      7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0);
+  const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0);
+  const __m256i mask = _mm256_set1_epi32(0x3FFFF);
+  const __m256i gamma1 = _mm256_set1_epi32(MLDSA_GAMMA1);
+
+  for (i = 0; i < MLDSA_N / 8; i++)
+  {
+    if (i < MLDSA_N / 8 - 1)
+    {
+      f = _mm256_loadu_si256((const __m256i *)&a[18 * i]);
+    }
+    else
+    {
+      /* Only bytes 0..17 of each 18-byte chunk are consumed below. A full
+       * 32-byte load of the last chunk would touch 14 bytes past the
+       * (MLDSA_N / 8) * 18 packed bytes this function consumes, so assemble
+       * that chunk from two in-bounds 16-byte loads; the unused upper
+       * bytes are zero. */
+      const __m128i lo = _mm_loadu_si128((const __m128i *)&a[18 * i]);
+      const __m128i hi = _mm_srli_si128(
+          _mm_loadu_si128((const __m128i *)&a[18 * i + 2]), 14);
+      f = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+
+    /* Permute 64-bit lanes
+     * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0]
+     *
+     * ╔═══════════════════════════════════════════════════════════════════════╗
+     * ║                         Original Layout                               ║
+     * ╚═══════════════════════════════════════════════════════════════════════╝
+     * ┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐
+     * │     Lane 0      │     Lane 1      │     Lane 2      │     Lane 3      │
+     * │   bytes 0..7    │   bytes 8..15   │   bytes 16..23  │   bytes 24..31  │
+     * └─────────────────┴─────────────────┴─────────────────┴─────────────────┘
+     *
+     * ╔═══════════════════════════════════════════════════════════════════════╗
+     * ║                        Layout after permute                           ║
+     * ║        Byte indices in high half shifted down by 8 positions          ║
+     * ╚═══════════════════════════════════════════════════════════════════════╝
+     * ┌───────────────┬─────────────────┐ ┌─────────────────┬─────────────────┐
+     * │   Lane 0      │     Lane 1      │ │     Lane 2      │     Lane 3      │
+     * │ bytes 0..7    │   bytes 8..15   │ │   bytes 8..15   │   bytes 16..23  │
+     * └───────────────┴─────────────────┘ └─────────────────┴─────────────────┘
+     *   Lower 128-bit lane (bytes 0-15)      Upper 128-bit lane (bytes 16-31)
+     */
+    f = _mm256_permute4x64_epi64(f, 0x94);
+
+    /* Shuffling 8-bit lanes
+     *
+     * ┌─ Indices 0-8 into low 128-bit half of permuted vector ────────────────┐
+     * │ Shuffle: [-1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0]         │
+     * │ Result:  [0, byte8, byte7, byte6, ..., 0, byte2, byte1, byte0]        │
+     * └───────────────────────────────────────────────────────────────────────┘
+     *
+     * ┌─ Indices 1-9 into high 128-bit half of permuted vector ───────────────┐
+     * │ Shuffle: [-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1]         │
+     * │ Result:  [0, byte17, byte16, byte15, ..., 0, byte11, byte10, byte9]   │
+     * └───────────────────────────────────────────────────────────────────────┘
+     */
+    f = _mm256_shuffle_epi8(f, shufbidx);
+
+    /* Keep only 18 out of 24 bits in each 32-bit lane */
+    /* Bits   0..23     16..39    32..55    48..71
+     *        72..95    88..111   104..127  120..143 */
+    f = _mm256_srlv_epi32(f, srlvdidx);
+    /* Bits   0..23     18..39    36..55    54..71
+     *        72..95    90..111   108..127  126..143 */
+    f = _mm256_and_si256(f, mask);
+    /* Bits   0..17     18..35    36..53    54..71
+     *        72..89    90..107   108..125  126..143 */
+
+    /* Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1] */
+    f = _mm256_sub_epi32(gamma1, f);
+
+    _mm256_store_si256(&r[i], f);
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(avx2_polyz_unpack)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/native/x86_64/src/polyz_unpack_19_avx2.c b/mldsa/native/x86_64/src/polyz_unpack_19_avx2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <immintrin.h>
+#include <stdint.h>
+#include "arith_native_x86_64.h"
+
+/* Unpack MLDSA_N coefficients, each stored in 20 bits, from the byte array
+ * a into r, mapping each packed value t to MLDSA_GAMMA1 - t.
+ *
+ * r is written with aligned 32-byte stores and must be 32-byte aligned.
+ * a is read only within its first (MLDSA_N / 8) * 20 bytes. */
+void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a)
+{
+  unsigned int i;
+  __m256i f;
+  const __m256i shufbidx =
+      _mm256_set_epi8(-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2, -1,
+                      9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0);
+  /* Equivalent to _mm256_set_epi32(4, 0, 4, 0, 4, 0, 4, 0) */
+  const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32);
+  const __m256i mask = _mm256_set1_epi32(0xFFFFF);
+  const __m256i gamma1 = _mm256_set1_epi32(MLDSA_GAMMA1);
+
+  for (i = 0; i < MLDSA_N / 8; i++)
+  {
+    if (i < MLDSA_N / 8 - 1)
+    {
+      f = _mm256_loadu_si256((const __m256i *)&a[20 * i]);
+    }
+    else
+    {
+      /* Only bytes 0..19 of each 20-byte chunk are consumed below. A full
+       * 32-byte load of the last chunk would touch 12 bytes past the
+       * (MLDSA_N / 8) * 20 packed bytes this function consumes, so assemble
+       * that chunk from two in-bounds 16-byte loads; the unused upper
+       * bytes are zero. */
+      const __m128i lo = _mm_loadu_si128((const __m128i *)&a[20 * i]);
+      const __m128i hi = _mm_srli_si128(
+          _mm_loadu_si128((const __m128i *)&a[20 * i + 4]), 12);
+      f = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+
+    /* Permute 64-bit lanes
+     * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0]
+     *
+     * ╔═══════════════════════════════════════════════════════════════════════╗
+     * ║                         Original Layout                               ║
+     * ╚═══════════════════════════════════════════════════════════════════════╝
+     * ┌─────────────────┬─────────────────┬─────────────────┬─────────────────┐
+     * │     Lane 0      │     Lane 1      │     Lane 2      │     Lane 3      │
+     * │   bytes 0..7    │   bytes 8..15   │   bytes 16..23  │   bytes 24..31  │
+     * └─────────────────┴─────────────────┴─────────────────┴─────────────────┘
+     *
+     * ╔═══════════════════════════════════════════════════════════════════════╗
+     * ║                        Layout after permute                           ║
+     * ║        Byte indices in high half shifted down by 8 positions          ║
+     * ╚═══════════════════════════════════════════════════════════════════════╝
+     * ┌───────────────┬─────────────────┐ ┌─────────────────┬─────────────────┐
+     * │   Lane 0      │     Lane 1      │ │     Lane 2      │     Lane 3      │
+     * │ bytes 0..7    │   bytes 8..15   │ │   bytes 8..15   │   bytes 16..23  │
+     * └───────────────┴─────────────────┘ └─────────────────┴─────────────────┘
+     *   Lower 128-bit lane (bytes 0-15)      Upper 128-bit lane (bytes 16-31)
+     */
+    f = _mm256_permute4x64_epi64(f, 0x94);
+
+    /* Shuffling 8-bit lanes
+     *
+     * ┌─ Indices 0-9 into low 128-bit half of permuted vector ────────────────┐
+     * │ Shuffle: [-1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0]         │
+     * │ Result:  [0, byte9, byte8, byte7, ..., 0, byte2, byte1, byte0]        │
+     * └───────────────────────────────────────────────────────────────────────┘
+     *
+     * ┌─ Indices 2-11 into high 128-bit half of permuted vector ──────────────┐
+     * │ Shuffle: [-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2]       │
+     * │ Result:  [0, byte19, byte18, byte17, ..., 0, byte12, byte11, byte10]  │
+     * └───────────────────────────────────────────────────────────────────────┘
+     */
+    f = _mm256_shuffle_epi8(f, shufbidx);
+
+    /* Keep only 20 out of 24 bits in each 32-bit lane */
+    /* Bits   0..23     16..39    40..63    56..79
+     *        80..103   96..119   120..143  136..159 */
+    f = _mm256_srlv_epi32(f, srlvdidx);
+    /* Bits   0..23     20..39    40..63    60..79
+     *        80..103   100..119  120..143  140..159 */
+    f = _mm256_and_si256(f, mask);
+    /* Bits   0..19     20..39    40..59    60..79
+     *        80..99    100..119  120..139  140..159 */
+
+    /* Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1] */
+    f = _mm256_sub_epi32(gamma1, f);
+
+    _mm256_store_si256(&r[i], f);
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(avx2_polyz_unpack)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/poly_kl.c b/mldsa/poly_kl.c
@@ -702,9 +702,15 @@ void mld_polyz_pack(uint8_t *r, const mld_poly *a)
 MLD_INTERNAL_API
 void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
 {
+#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) && MLD_CONFIG_PARAMETER_SET == 44
+  /* TODO: proof */
+  mld_polyz_unpack_17_native(r->coeffs, a);
+#elif defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) && \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+  /* TODO: proof */
+  mld_polyz_unpack_19_native(r->coeffs, a);
+#elif MLD_CONFIG_PARAMETER_SET == 44
   unsigned int i;
-
-#if MLD_CONFIG_PARAMETER_SET == 44
   for (i = 0; i < MLDSA_N / 4; ++i)
   __loop__(
     invariant(i <= MLDSA_N/4)
@@ -735,7 +741,11 @@ void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
     r->coeffs[4 * i + 2] = MLDSA_GAMMA1 - r->coeffs[4 * i + 2];
     r->coeffs[4 * i + 3] = MLDSA_GAMMA1 - r->coeffs[4 * i + 3];
   }
-#else  /* MLD_CONFIG_PARAMETER_SET == 44 */
+#else  /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44)   \
+          && !(MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET ==   \
+          65 || MLD_CONFIG_PARAMETER_SET == 87)) && MLD_CONFIG_PARAMETER_SET == \
+          44 */
+  unsigned int i;
   for (i = 0; i < MLDSA_N / 2; ++i)
   __loop__(
     invariant(i <= MLDSA_N/2)
@@ -755,7 +765,10 @@ void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
     r->coeffs[2 * i + 0] = MLDSA_GAMMA1 - r->coeffs[2 * i + 0];
     r->coeffs[2 * i + 1] = MLDSA_GAMMA1 - r->coeffs[2 * i + 1];
   }
-#endif /* MLD_CONFIG_PARAMETER_SET != 44 */
+#endif /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44) \
+          && !(MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET == \
+          65 || MLD_CONFIG_PARAMETER_SET == 87)) && MLD_CONFIG_PARAMETER_SET  \
+          != 44 */
 
   mld_assert_bound(r->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
 }