Merge pull request #510 from pq-code-package/polyz-unpack-asm

hanno-becker · web-flow · commit abf8281191e9 · 2025-10-11T20:00:46.000+01:00
Add native implementation of polyz_unpack
diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
@@ -156,6 +156,8 @@ source code and documentation.
   - [mldsa/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/native/x86_64/src/poly_decompose_88_avx2.c)
   - [mldsa/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/native/x86_64/src/poly_use_hint_32_avx2.c)
   - [mldsa/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/native/x86_64/src/poly_use_hint_88_avx2.c)
+  - [mldsa/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/native/x86_64/src/polyz_unpack_17_avx2.c)
+  - [mldsa/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/native/x86_64/src/polyz_unpack_19_avx2.c)
   - [mldsa/native/x86_64/src/rej_uniform_avx2.c](mldsa/native/x86_64/src/rej_uniform_avx2.c)
   - [mldsa/native/x86_64/src/rej_uniform_eta2_avx2.c](mldsa/native/x86_64/src/rej_uniform_eta2_avx2.c)
   - [mldsa/native/x86_64/src/rej_uniform_eta4_avx2.c](mldsa/native/x86_64/src/rej_uniform_eta4_avx2.c)
diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h
@@ -19,6 +19,8 @@
 #define MLD_USE_NATIVE_POLY_USE_HINT_32
 #define MLD_USE_NATIVE_POLY_USE_HINT_88
 #define MLD_USE_NATIVE_POLY_CHKNORM
+#define MLD_USE_NATIVE_POLYZ_UNPACK_17
+#define MLD_USE_NATIVE_POLYZ_UNPACK_19
 
 /* Identifier for this backend so that source and assembly files
  * in the build can be appropriately guarded. */
@@ -133,5 +135,17 @@ static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_asm(a, B);
 }
 
+/* Native polyz_unpack for GAMMA1 = 2^17: forwards r and buf to the AArch64
+ * assembly routine, supplying mld_polyz_unpack_17_indices as the byte-index
+ * table argument. */
+static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r,
+                                                  const uint8_t *buf)
+{
+  mld_polyz_unpack_17_asm(r, buf, mld_polyz_unpack_17_indices);
+}
+
+/* Native polyz_unpack for GAMMA1 = 2^19: forwards r and buf to the AArch64
+ * assembly routine, supplying mld_polyz_unpack_19_indices as the byte-index
+ * table argument. */
+static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r,
+                                                  const uint8_t *buf)
+{
+  mld_polyz_unpack_19_asm(r, buf, mld_polyz_unpack_19_indices);
+}
+
 #endif /* !__ASSEMBLER__ */
 #endif /* !MLD_NATIVE_AARCH64_META_H */
diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h
@@ -29,6 +29,11 @@ extern const uint8_t mld_rej_uniform_table[];
 #define mld_rej_uniform_eta_table MLD_NAMESPACE(rej_uniform_eta_table)
 extern const uint8_t mld_rej_uniform_eta_table[];
 
+#define mld_polyz_unpack_17_indices MLD_NAMESPACE(polyz_unpack_17_indices)
+extern const uint8_t mld_polyz_unpack_17_indices[];
+#define mld_polyz_unpack_19_indices MLD_NAMESPACE(polyz_unpack_19_indices)
+extern const uint8_t mld_polyz_unpack_19_indices[];
+
 
 /*
  * Sampling 256 coefficients mod 15 using rejection sampling from 4 bits.
@@ -80,4 +85,12 @@ void mld_poly_use_hint_88_asm(int32_t *b, const int32_t *a, const int32_t *h);
 #define mld_poly_chknorm_asm MLD_NAMESPACE(poly_chknorm_asm)
 uint32_t mld_poly_chknorm_asm(const int32_t *a, int32_t B);
 
+#define mld_polyz_unpack_17_asm MLD_NAMESPACE(polyz_unpack_17_asm)
+void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
+                             const uint8_t *indices);
+
+#define mld_polyz_unpack_19_asm MLD_NAMESPACE(polyz_unpack_19_asm)
+void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
+                             const uint8_t *indices);
+
 #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */
diff --git a/mldsa/native/aarch64/src/polyz_unpack_17_asm.S b/mldsa/native/aarch64/src/polyz_unpack_17_asm.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+ #include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+// trim_map_17: each 32-bit lane of \a holds a 24-bit window of the packed
+// input; extract the 18-bit value x from it and replace the lane with
+// gamma1 - x. Uses the register aliases shifts, mask and gamma1, which must
+// hold their constants before the macro is expanded.
+.macro trim_map_17 a
+        // Keep only 18 out of 24 bits in each 32-bit lane
+        //     Lane     0       1       2       3
+        //     Bits     0..23   16..39  32..55  48..71
+        ushl \a\().4s, \a\().4s, shifts.4s
+        //     Bits     0..23   18..39  36..55  54..71
+        and \a\().16b, \a\().16b, mask.16b
+        //     Bits     0..17   18..35  36..53  54..71
+
+        // Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1]
+        sub \a\().4s, gamma1.4s, \a\().4s
+.endm
+
+        /* Parameters */
+        output          .req x0
+        buf             .req x1
+        indices         .req x2
+
+        xtmp            .req x3
+        count           .req x9
+
+        /* Constant register assignments */
+        idx0            .req v24
+        idx1            .req v25
+        idx2            .req v26
+        idx3            .req v27
+        shifts          .req v28
+        mask            .req v29    // 2^18 - 1
+        gamma1          .req v30    // 2^17
+
+.text
+.global MLD_ASM_NAMESPACE(polyz_unpack_17_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyz_unpack_17_asm)
+        // Unpacks 256 coefficients of 18 bits each from buf into output.
+        // Each of the 16 loop iterations consumes 36 input bytes and
+        // writes 16 32-bit coefficients, so exactly 576 input bytes are
+        // consumed in total.
+
+        // Load indices
+        ldr q24, [indices]
+        ldr q25, [indices, #1*16]
+        ldr q26, [indices, #2*16]
+        ldr q27, [indices, #3*16]
+
+        // Load per-lane shifts 0, -2, -4, -6. (Negative means right shift.)
+        // The shifts for the 4 32-bit lanes are sign-extended from the lowest
+        // 8 bits, so it suffices to set up only byte 0, 4, 8, 12.
+        movz xtmp, 0xfe, lsl 32
+        mov shifts.d[0], xtmp
+        movz xtmp, 0xfc
+        movk xtmp, 0xfa, lsl 32
+        mov shifts.d[1], xtmp
+
+        movi mask.4s, 0x3, msl 16
+
+        movi gamma1.4s, 0x2, lsl 16
+
+        mov count, #(64/4)
+
+polyz_unpack_17_loop:
+        // The tbl indices only reach bytes 0..35 of the current chunk,
+        // i.e. bytes 0..3 of v2. Load just those 4 bytes (a scalar S load
+        // zeroes the remaining bytes of v2): a full 16-byte load at offset
+        // 32 would read 12 bytes past the end of the 576-byte input on the
+        // final iteration.
+        ldr q1, [buf, #16]
+        ldr s2, [buf, #32]
+        ldr q0, [buf], #36
+
+        tbl v4.16b, {v0.16b}, idx0.16b
+        tbl v5.16b, {v0.16b - v1.16b}, idx1.16b
+        tbl v6.16b, {v1.16b}, idx2.16b
+        tbl v7.16b, {v1.16b - v2.16b}, idx3.16b
+
+        trim_map_17 v4
+        trim_map_17 v5
+        trim_map_17 v6
+        trim_map_17 v7
+
+        str q5, [output, #1*16]
+        str q6, [output, #2*16]
+        str q7, [output, #3*16]
+        str q4, [output], #4*16
+
+        subs count, count, #1
+        bne polyz_unpack_17_loop
+
+        ret
+
+        .unreq output
+        .unreq buf
+        .unreq indices
+        .unreq xtmp
+        .unreq count
+        .unreq idx0
+        .unreq idx1
+        .unreq idx2
+        .unreq idx3
+        .unreq shifts
+        .unreq mask
+        .unreq gamma1
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mldsa/native/aarch64/src/polyz_unpack_19_asm.S b/mldsa/native/aarch64/src/polyz_unpack_19_asm.S
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+ #include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+// trim_map_19: each 32-bit lane of \a holds a 24-bit window of the packed
+// input; extract the 20-bit value x from it and replace the lane with
+// gamma1 - x. Uses the register aliases shifts, mask and gamma1, which must
+// hold their constants before the macro is expanded.
+.macro trim_map_19 a
+        // Keep only 20 out of 24 bits in each 32-bit lane
+        //     Lane     0       1       2       3
+        //     Bits     0..23   16..39  40..63  56..79
+        ushl \a\().4s, \a\().4s, shifts.4s
+        //     Bits     0..23   20..39  40..63  60..79
+        and \a\().16b, \a\().16b, mask.16b
+        //     Bits     0..19   20..39  40..59  60..79
+
+        // Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1]
+        sub \a\().4s, gamma1.4s, \a\().4s
+.endm
+
+        /* Parameters */
+        output          .req x0
+        buf             .req x1
+        indices         .req x2
+
+        xtmp            .req x3
+        count           .req x9
+
+        /* Constant register assignments */
+        idx0            .req v24
+        idx1            .req v25
+        idx2            .req v26
+        idx3            .req v27
+        shifts          .req v28
+        mask            .req v29    // 2^20 - 1
+        gamma1          .req v30    // 2^19
+
+.text
+.global MLD_ASM_NAMESPACE(polyz_unpack_19_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyz_unpack_19_asm)
+        // Unpacks 256 coefficients of 20 bits each from buf into output.
+        // Each of the 16 loop iterations consumes 40 input bytes and
+        // writes 16 32-bit coefficients, so exactly 640 input bytes are
+        // consumed in total.
+
+        // Load indices
+        ldr q24, [indices]
+        ldr q25, [indices, #1*16]
+        ldr q26, [indices, #2*16]
+        ldr q27, [indices, #3*16]
+
+        // Load per-lane shifts 0, -4, 0, -4. (Negative means right shift.)
+        // The shifts for the 4 32-bit lanes are sign-extended from the lowest
+        // 8 bits, so it suffices to set up only byte 0, 4, 8, 12.
+        movz xtmp, 0xfc, lsl 32
+        dup shifts.2d, xtmp
+
+        movi mask.4s, 0xf, msl 16
+
+        movi gamma1.4s, 0x8, lsl 16
+
+        mov count, #(64/4)
+
+polyz_unpack_19_loop:
+        // The tbl indices only reach bytes 0..39 of the current chunk,
+        // i.e. bytes 0..7 of v2. Load just those 8 bytes (a scalar D load
+        // zeroes the remaining bytes of v2): a full 16-byte load at offset
+        // 32 would read 8 bytes past the end of the 640-byte input on the
+        // final iteration.
+        ldr q1, [buf, #16]
+        ldr d2, [buf, #32]
+        ldr q0, [buf], #40
+
+        tbl v4.16b, {v0.16b}, idx0.16b
+        tbl v5.16b, {v0.16b - v1.16b}, idx1.16b
+        tbl v6.16b, {v1.16b}, idx2.16b
+        tbl v7.16b, {v1.16b - v2.16b}, idx3.16b
+
+        trim_map_19 v4
+        trim_map_19 v5
+        trim_map_19 v6
+        trim_map_19 v7
+
+        str q5, [output, #1*16]
+        str q6, [output, #2*16]
+        str q7, [output, #3*16]
+        str q4, [output], #4*16
+
+        subs count, count, #1
+        bne polyz_unpack_19_loop
+
+        ret
+
+        .unreq output
+        .unreq buf
+        .unreq indices
+        .unreq xtmp
+        .unreq count
+        .unreq idx0
+        .unreq idx1
+        .unreq idx2
+        .unreq idx3
+        .unreq shifts
+        .unreq mask
+        .unreq gamma1
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mldsa/native/aarch64/src/polyz_unpack_table.c b/mldsa/native/aarch64/src/polyz_unpack_table.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_AARCH64) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <stdint.h>
+#include "arith_native_aarch64.h"
+
+/* Table of indices used for tbl instructions in polyz_unpack_{17,19}. */
+
+/* Four 16-byte index vectors, one per tbl lookup in polyz_unpack_17_asm.
+ * Each group of four entries picks three consecutive input bytes for one
+ * 32-bit lane. The trailing -1 converts to 0xFF in uint8_t, an index that
+ * tbl treats as out of range and therefore fills with a zero byte. */
+MLD_ALIGN const uint8_t mld_polyz_unpack_17_indices[] = {
+    0,  1,  2,  -1, 2,  3,  4,  -1, 4,  5,  6,  -1, 6,  7,  8,  -1,
+    9,  10, 11, -1, 11, 12, 13, -1, 13, 14, 15, -1, 15, 16, 17, -1,
+    2,  3,  4,  -1, 4,  5,  6,  -1, 6,  7,  8,  -1, 8,  9,  10, -1,
+    11, 12, 13, -1, 13, 14, 15, -1, 15, 16, 17, -1, 17, 18, 19, -1,
+};
+
+/* Four 16-byte index vectors, one per tbl lookup in polyz_unpack_19_asm.
+ * Each group of four entries picks three consecutive input bytes for one
+ * 32-bit lane. The trailing -1 converts to 0xFF in uint8_t, an index that
+ * tbl treats as out of range and therefore fills with a zero byte. */
+MLD_ALIGN const uint8_t mld_polyz_unpack_19_indices[] = {
+    0,  1,  2,  -1, 2,  3,  4,  -1, 5,  6,  7,  -1, 7,  8,  9,  -1,
+    10, 11, 12, -1, 12, 13, 14, -1, 15, 16, 17, -1, 17, 18, 19, -1,
+    4,  5,  6,  -1, 6,  7,  8,  -1, 9,  10, 11, -1, 11, 12, 13, -1,
+    14, 15, 16, -1, 16, 17, 18, -1, 19, 20, 21, -1, 21, 22, 23, -1,
+};
+
+#else /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(aarch64_polyz_unpack_table)
+
+#endif /* !(MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/native/api.h b/mldsa/native/api.h
@@ -270,4 +270,32 @@ static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
 static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B);
 #endif /* MLD_USE_NATIVE_POLY_CHKNORM */
 
+#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_17)
+/*************************************************
+ * Name:        mld_polyz_unpack_17_native
+ *
+ * Description: Native implementation of polyz_unpack for GAMMA1 = 2^17.
+ *              Unpack polynomial z with coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - int32_t *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a);
+#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_17 */
+
+#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_19)
+/*************************************************
+ * Name:        mld_polyz_unpack_19_native
+ *
+ * Description: Native implementation of polyz_unpack for GAMMA1 = 2^19.
+ *              Unpack polynomial z with coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - int32_t *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a);
+#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
@@ -23,6 +23,8 @@
 #define MLD_USE_NATIVE_POLY_USE_HINT_32
 #define MLD_USE_NATIVE_POLY_USE_HINT_88
 #define MLD_USE_NATIVE_POLY_CHKNORM
+#define MLD_USE_NATIVE_POLYZ_UNPACK_17
+#define MLD_USE_NATIVE_POLYZ_UNPACK_19
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -139,6 +141,16 @@ static MLD_INLINE uint32_t mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_avx2((const __m256i *)a, B);
 }
 
+/* Native polyz_unpack for GAMMA1 = 2^17: forwards to the AVX2 routine,
+ * passing r reinterpreted as a pointer to __m256i. */
+static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
+{
+  mld_polyz_unpack_17_avx2((__m256i *)r, a);
+}
+
+/* Native polyz_unpack for GAMMA1 = 2^19: forwards to the AVX2 routine,
+ * passing r reinterpreted as a pointer to __m256i. */
+static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
+{
+  mld_polyz_unpack_19_avx2((__m256i *)r, a);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -72,4 +72,10 @@ void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h);
 #define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2)
 uint32_t mld_poly_chknorm_avx2(const __m256i *a, int32_t B);
 
+#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2)
+void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a);
+
+#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
+void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/polyz_unpack_17_avx2.c b/mldsa/native/x86_64/src/polyz_unpack_17_avx2.c
diff --git a/mldsa/native/x86_64/src/polyz_unpack_19_avx2.c b/mldsa/native/x86_64/src/polyz_unpack_19_avx2.c
diff --git a/mldsa/poly_kl.c b/mldsa/poly_kl.c