pq-code-package
diff --git a/‎BIBLIOGRAPHY.md‎
Lines changed: 0 additions & 2 deletions b/‎BIBLIOGRAPHY.md‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎dev/x86_64/meta.h‎
Lines changed: 14 additions & 20 deletions b/‎dev/x86_64/meta.h‎
Lines changed: 14 additions & 20 deletions
diff --git a/‎dev/x86_64/src/align.h‎
Lines changed: 0 additions & 34 deletions b/‎dev/x86_64/src/align.h‎
Lines changed: 0 additions & 34 deletions
diff --git a/‎dev/x86_64/src/arith_native_x86_64.h‎
Lines changed: 21 additions & 19 deletions b/‎dev/x86_64/src/arith_native_x86_64.h‎
Lines changed: 21 additions & 19 deletions
diff --git a/‎dev/x86_64/src/consts.c‎
Lines changed: 2 additions & 3 deletions b/‎dev/x86_64/src/consts.c‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎dev/x86_64/src/consts.h‎
Lines changed: 2 additions & 4 deletions b/‎dev/x86_64/src/consts.h‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎dev/x86_64/src/poly_chknorm_avx2.c‎
Lines changed: 2 additions & 2 deletions b/‎dev/x86_64/src/poly_chknorm_avx2.c‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/x86_64/src/poly_decompose_32_avx2.c‎
Lines changed: 4 additions & 4 deletions b/‎dev/x86_64/src/poly_decompose_32_avx2.c‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎dev/x86_64/src/poly_decompose_88_avx2.c‎
Lines changed: 4 additions & 4 deletions b/‎dev/x86_64/src/poly_decompose_88_avx2.c‎
Lines changed: 4 additions & 4 deletions
@@ -206,7 +206,6 @@ source code and documentation.
   - Damien Stehlé
 * URL: https://github.com/pq-crystals/dilithium/tree/master/avx2
 * Referenced from:
-  - [dev/x86_64/src/align.h](dev/x86_64/src/align.h)
   - [dev/x86_64/src/consts.c](dev/x86_64/src/consts.c)
   - [dev/x86_64/src/consts.h](dev/x86_64/src/consts.h)
   - [dev/x86_64/src/intt.S](dev/x86_64/src/intt.S)
@@ -227,7 +226,6 @@ source code and documentation.
   - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
   - [dev/x86_64/src/rej_uniform_eta2_avx2.c](dev/x86_64/src/rej_uniform_eta2_avx2.c)
   - [dev/x86_64/src/rej_uniform_eta4_avx2.c](dev/x86_64/src/rej_uniform_eta4_avx2.c)
-  - [mldsa/src/native/x86_64/src/align.h](mldsa/src/native/x86_64/src/align.h)
   - [mldsa/src/native/x86_64/src/consts.c](mldsa/src/native/x86_64/src/consts.c)
   - [mldsa/src/native/x86_64/src/consts.h](mldsa/src/native/x86_64/src/consts.h)
   - [mldsa/src/native/x86_64/src/intt.S](mldsa/src/native/x86_64/src/intt.S)
 
@@ -40,7 +40,7 @@ static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N])
 {
   if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
   {
-    mld_nttunpack_avx2((__m256i *)(data));
+    mld_nttunpack_avx2(data);
   }
 }
 
@@ -51,7 +51,7 @@ static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
     return MLD_NATIVE_FUNC_FALLBACK;
   }
 
-  mld_ntt_avx2((__m256i *)data, mld_qdata.vec);
+  mld_ntt_avx2(data, mld_qdata);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
@@ -60,7 +60,7 @@ static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
+  mld_invntt_avx2(data, mld_qdata);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -137,7 +137,7 @@ static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0);
+  mld_poly_decompose_32_avx2(a1, a0);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -147,7 +147,7 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0);
+  mld_poly_decompose_88_avx2(a1, a0);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -167,8 +167,7 @@ static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_poly_use_hint_32_avx2((__m256i *)b, (const __m256i *)a,
-                            (const __m256i *)h);
+  mld_poly_use_hint_32_avx2(b, a, h);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -179,8 +178,7 @@ static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_poly_use_hint_88_avx2((__m256i *)b, (const __m256i *)a,
-                            (const __m256i *)h);
+  mld_poly_use_hint_88_avx2(b, a, h);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -190,7 +188,7 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  return mld_poly_chknorm_avx2((const __m256i *)a, B);
+  return mld_poly_chknorm_avx2(a, B);
 }
 
 static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
@@ -199,7 +197,7 @@ static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_polyz_unpack_17_avx2((__m256i *)r, a);
+  mld_polyz_unpack_17_avx2(r, a);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -209,7 +207,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_polyz_unpack_19_avx2((__m256i *)r, a);
+  mld_polyz_unpack_19_avx2(r, a);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -220,8 +218,7 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_pointwise_avx2((__m256i *)c, (const __m256i *)a, (const __m256i *)b,
-                     mld_qdata.vec);
+  mld_pointwise_avx2(c, a, b, mld_qdata);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -233,8 +230,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_pointwise_acc_l4_avx2((__m256i *)w, (const __m256i *)u,
-                            (const __m256i *)v, mld_qdata.vec);
+  mld_pointwise_acc_l4_avx2(w, u, v, mld_qdata);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -246,8 +242,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_pointwise_acc_l5_avx2((__m256i *)w, (const __m256i *)u,
-                            (const __m256i *)v, mld_qdata.vec);
+  mld_pointwise_acc_l5_avx2(w, u, v, mld_qdata);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
@@ -259,8 +254,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
   {
     return MLD_NATIVE_FUNC_FALLBACK;
   }
-  mld_pointwise_acc_l7_avx2((__m256i *)w, (const __m256i *)u,
-                            (const __m256i *)v, mld_qdata.vec);
+  mld_pointwise_acc_l7_avx2(w, u, v, mld_qdata);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 
@@ -7,7 +7,6 @@
 #define MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
 #include "../../../common.h"
 
-#include <immintrin.h>
 #include <stdint.h>
 #include "consts.h"
 
@@ -34,13 +33,13 @@
 extern const uint8_t mld_rej_uniform_table[256][8];
 
 #define mld_ntt_avx2 MLD_NAMESPACE(ntt_avx2)
-void mld_ntt_avx2(__m256i *r, const __m256i *mld_qdata);
+void mld_ntt_avx2(int32_t *r, const int32_t *mld_qdata);
 
 #define mld_invntt_avx2 MLD_NAMESPACE(invntt_avx2)
-void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
+void mld_invntt_avx2(int32_t *r, const int32_t *mld_qdata);
 
 #define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
-void mld_nttunpack_avx2(__m256i *r);
+void mld_nttunpack_avx2(int32_t *r);
 
 #define mld_rej_uniform_avx2 MLD_NAMESPACE(mld_rej_uniform_avx2)
 unsigned mld_rej_uniform_avx2(int32_t *r,
@@ -55,43 +54,46 @@ unsigned mld_rej_uniform_eta4_avx2(
     int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]);
 
 #define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2)
-void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0);
+void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0);
 
 #define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2)
-void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0);
+void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0);
 
 #define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2)
 void mld_poly_caddq_avx2(int32_t *r);
 
 #define mld_poly_use_hint_32_avx2 MLD_NAMESPACE(mld_poly_use_hint_32_avx2)
-void mld_poly_use_hint_32_avx2(__m256i *b, const __m256i *a, const __m256i *h);
+void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a, const int32_t *h);
 
 #define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2)
-void mld_poly_use_hint_88_avx2(__m256i *b, const __m256i *a, const __m256i *h);
+void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a, const int32_t *h);
 
 #define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2)
-int mld_poly_chknorm_avx2(const __m256i *a, int32_t B);
+int mld_poly_chknorm_avx2(const int32_t *a, int32_t B);
 
 #define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2)
-void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a);
+void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
 
 #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
-void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a);
+void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
 
 #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
-void mld_pointwise_avx2(__m256i *c, const __m256i *a, const __m256i *b,
-                        const __m256i *qdata);
+void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
+                        const int32_t *qdata);
 
 #define mld_pointwise_acc_l4_avx2 MLD_NAMESPACE(pointwise_acc_l4_avx2)
-void mld_pointwise_acc_l4_avx2(__m256i *c, const __m256i *a, const __m256i *b,
-                               const __m256i *qdata);
+void mld_pointwise_acc_l4_avx2(int32_t c[MLDSA_N], const int32_t a[4][MLDSA_N],
+                               const int32_t b[4][MLDSA_N],
+                               const int32_t *qdata);
 
 #define mld_pointwise_acc_l5_avx2 MLD_NAMESPACE(pointwise_acc_l5_avx2)
-void mld_pointwise_acc_l5_avx2(__m256i *c, const __m256i *a, const __m256i *b,
-                               const __m256i *qdata);
+void mld_pointwise_acc_l5_avx2(int32_t c[MLDSA_N], const int32_t a[5][MLDSA_N],
+                               const int32_t b[5][MLDSA_N],
+                               const int32_t *qdata);
 
 #define mld_pointwise_acc_l7_avx2 MLD_NAMESPACE(pointwise_acc_l7_avx2)
-void mld_pointwise_acc_l7_avx2(__m256i *c, const __m256i *a, const __m256i *b,
-                               const __m256i *qdata);
+void mld_pointwise_acc_l7_avx2(int32_t c[MLDSA_N], const int32_t a[7][MLDSA_N],
+                               const int32_t b[7][MLDSA_N],
+                               const int32_t *qdata);
 
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
@@ -22,7 +22,6 @@
 #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
     !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
 
-#include "align.h"
 #include "consts.h"
 #define MLD_AVX2_Q MLDSA_Q
 /* check-magic: 58728449 == pow(MLDSA_Q,-1,2^32) */
@@ -32,7 +31,7 @@
 /* check-magic: -8395782 == signed_mod(MLD_AVX2_QINV*MLD_AVX2_DIV,2^32) */
 #define MLD_AVX2_DIV_QINV -8395782
 
-const qdata_t mld_qdata = {{
+MLD_ALIGN const int32_t mld_qdata[624] = {
 #define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0
     MLD_AVX2_Q,        MLD_AVX2_Q,        MLD_AVX2_Q,        MLD_AVX2_Q,
     MLD_AVX2_Q,        MLD_AVX2_Q,        MLD_AVX2_Q,        MLD_AVX2_Q,
@@ -53,7 +52,7 @@ const qdata_t mld_qdata = {{
 #define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328
 #include "x86_64_zetas.i"
 
-}};
+};
 
 #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
        */
 
@@ -30,10 +30,8 @@
 
 
 #ifndef __ASSEMBLER__
-#include "align.h"
-typedef MLD_ALIGNED_INT32(624) qdata_t;
 #define mld_qdata MLD_NAMESPACE(qdata)
-extern const qdata_t mld_qdata;
-#endif /* !__ASSEMBLER__ */
+extern const int32_t mld_qdata[624];
+#endif
 
 #endif /* !MLD_NATIVE_X86_64_SRC_CONSTS_H */
@@ -26,7 +26,7 @@
 #include <stdint.h>
 #include "arith_native_x86_64.h"
 
-int mld_poly_chknorm_avx2(const __m256i *a, int32_t B)
+int mld_poly_chknorm_avx2(const int32_t *a, int32_t B)
 {
   unsigned int i;
   __m256i f, t;
@@ -35,7 +35,7 @@ int mld_poly_chknorm_avx2(const __m256i *a, int32_t B)
   t = _mm256_setzero_si256();
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_load_si256(&a[i]);
+    f = _mm256_load_si256((const __m256i *)&a[8 * i]);
     f = _mm256_abs_epi32(f);
     f = _mm256_cmpgt_epi32(f, bound);
     t = _mm256_or_si256(t, f);
 
@@ -37,7 +37,7 @@
  *            separate argument that may be aliased with either of the outputs.
  *            Removing the aliasing eases CBMC proofs.
  */
-void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
+void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0)
 {
   unsigned int i;
   __m256i f, f0, f1, t;
@@ -50,7 +50,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
 
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_load_si256(&a0[i]);
+    f = _mm256_load_si256((__m256i *)&a0[8 * i]);
 
     /* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */
     /*
@@ -136,8 +136,8 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
     f0 = _mm256_add_epi32(f0, t);
     /* range: 0 <= f1 <= 15, -GAMMA2 <= f0 <= GAMMA2 */
 
-    _mm256_store_si256(&a1[i], f1);
-    _mm256_store_si256(&a0[i], f0);
+    _mm256_store_si256((__m256i *)&a1[8 * i], f1);
+    _mm256_store_si256((__m256i *)&a0[8 * i], f0);
   }
 }
 
 
@@ -38,7 +38,7 @@
  *            Removing the aliasing eases CBMC proofs.
  */
 
-void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
+void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0)
 {
   unsigned int i;
   __m256i f, f0, f1, t;
@@ -51,7 +51,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
 
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_load_si256(&a0[i]);
+    f = _mm256_load_si256((__m256i *)&a0[8 * i]);
 
     /* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */
     /*
@@ -137,8 +137,8 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
     f0 = _mm256_add_epi32(f0, t);
     /* range: 0 <= f1 <= 43, -GAMMA2 <= f0 <= GAMMA2 */
 
-    _mm256_store_si256(&a1[i], f1);
-    _mm256_store_si256(&a0[i], f0);
+    _mm256_store_si256((__m256i *)&a1[8 * i], f1);
+    _mm256_store_si256((__m256i *)&a0[8 * i], f0);
   }
 }
 #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N])`
`40`	`40`	`{`
`41`	`41`	`if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))`
`42`	`42`	`{`
`43`		`- mld_nttunpack_avx2((__m256i *)(data));`
	`43`	`+ mld_nttunpack_avx2(data);`
`44`	`44`	`}`
`45`	`45`	`}`
`46`	`46`
`@@ -51,7 +51,7 @@ static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])`
`51`	`51`	`return MLD_NATIVE_FUNC_FALLBACK;`
`52`	`52`	`}`
`53`	`53`
`54`		`- mld_ntt_avx2((__m256i *)data, mld_qdata.vec);`
	`54`	`+ mld_ntt_avx2(data, mld_qdata);`
`55`	`55`	`return MLD_NATIVE_FUNC_SUCCESS;`
`56`	`56`	`}`
`57`	`57`	`static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])`
`@@ -60,7 +60,7 @@ static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])`
`60`	`60`	`{`
`61`	`61`	`return MLD_NATIVE_FUNC_FALLBACK;`
`62`	`62`	`}`
`63`		`- mld_invntt_avx2((__m256i *)data, mld_qdata.vec);`
	`63`	`+ mld_invntt_avx2(data, mld_qdata);`
`64`	`64`	`return MLD_NATIVE_FUNC_SUCCESS;`
`65`	`65`	`}`
`66`	`66`
`@@ -137,7 +137,7 @@ static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)`
`137`	`137`	`{`
`138`	`138`	`return MLD_NATIVE_FUNC_FALLBACK;`
`139`	`139`	`}`
`140`		`- mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0);`
	`140`	`+ mld_poly_decompose_32_avx2(a1, a0);`
`141`	`141`	`return MLD_NATIVE_FUNC_SUCCESS;`
`142`	`142`	`}`
`143`	`143`
`@@ -147,7 +147,7 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)`
`147`	`147`	`{`
`148`	`148`	`return MLD_NATIVE_FUNC_FALLBACK;`
`149`	`149`	`}`
`150`		`- mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0);`
	`150`	`+ mld_poly_decompose_88_avx2(a1, a0);`
`151`	`151`	`return MLD_NATIVE_FUNC_SUCCESS;`
`152`	`152`	`}`
`153`	`153`
`@@ -167,8 +167,7 @@ static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,`
`167`	`167`	`{`
`168`	`168`	`return MLD_NATIVE_FUNC_FALLBACK;`
`169`	`169`	`}`
`170`		`- mld_poly_use_hint_32_avx2((__m256i *)b, (const __m256i *)a,`
`171`		`- (const __m256i *)h);`
	`170`	`+ mld_poly_use_hint_32_avx2(b, a, h);`
`172`	`171`	`return MLD_NATIVE_FUNC_SUCCESS;`
`173`	`172`	`}`
`174`	`173`
`@@ -179,8 +178,7 @@ static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,`
`179`	`178`	`{`
`180`	`179`	`return MLD_NATIVE_FUNC_FALLBACK;`
`181`	`180`	`}`
`182`		`- mld_poly_use_hint_88_avx2((__m256i *)b, (const __m256i *)a,`
`183`		`- (const __m256i *)h);`
	`181`	`+ mld_poly_use_hint_88_avx2(b, a, h);`
`184`	`182`	`return MLD_NATIVE_FUNC_SUCCESS;`
`185`	`183`	`}`
`186`	`184`
`@@ -190,7 +188,7 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)`
`190`	`188`	`{`
`191`	`189`	`return MLD_NATIVE_FUNC_FALLBACK;`
`192`	`190`	`}`
`193`		`- return mld_poly_chknorm_avx2((const __m256i *)a, B);`
	`191`	`+ return mld_poly_chknorm_avx2(a, B);`
`194`	`192`	`}`
`195`	`193`
`196`	`194`	`static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)`
`@@ -199,7 +197,7 @@ static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)`
`199`	`197`	`{`
`200`	`198`	`return MLD_NATIVE_FUNC_FALLBACK;`
`201`	`199`	`}`
`202`		`- mld_polyz_unpack_17_avx2((__m256i *)r, a);`
	`200`	`+ mld_polyz_unpack_17_avx2(r, a);`
`203`	`201`	`return MLD_NATIVE_FUNC_SUCCESS;`
`204`	`202`	`}`
`205`	`203`
`@@ -209,7 +207,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)`
`209`	`207`	`{`
`210`	`208`	`return MLD_NATIVE_FUNC_FALLBACK;`
`211`	`209`	`}`
`212`		`- mld_polyz_unpack_19_avx2((__m256i *)r, a);`
	`210`	`+ mld_polyz_unpack_19_avx2(r, a);`
`213`	`211`	`return MLD_NATIVE_FUNC_SUCCESS;`
`214`	`212`	`}`
`215`	`213`
`@@ -220,8 +218,7 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(`
`220`	`218`	`{`
`221`	`219`	`return MLD_NATIVE_FUNC_FALLBACK;`
`222`	`220`	`}`
`223`		`- mld_pointwise_avx2((__m256i *)c, (const __m256i *)a, (const __m256i *)b,`
`224`		`- mld_qdata.vec);`
	`221`	`+ mld_pointwise_avx2(c, a, b, mld_qdata);`
`225`	`222`	`return MLD_NATIVE_FUNC_SUCCESS;`
`226`	`223`	`}`
`227`	`224`
`@@ -233,8 +230,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(`
`233`	`230`	`{`
`234`	`231`	`return MLD_NATIVE_FUNC_FALLBACK;`
`235`	`232`	`}`
`236`		`- mld_pointwise_acc_l4_avx2((__m256i *)w, (const __m256i *)u,`
`237`		`- (const __m256i *)v, mld_qdata.vec);`
	`233`	`+ mld_pointwise_acc_l4_avx2(w, u, v, mld_qdata);`
`238`	`234`	`return MLD_NATIVE_FUNC_SUCCESS;`
`239`	`235`	`}`
`240`	`236`
`@@ -246,8 +242,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(`
`246`	`242`	`{`
`247`	`243`	`return MLD_NATIVE_FUNC_FALLBACK;`
`248`	`244`	`}`
`249`		`- mld_pointwise_acc_l5_avx2((__m256i *)w, (const __m256i *)u,`
`250`		`- (const __m256i *)v, mld_qdata.vec);`
	`245`	`+ mld_pointwise_acc_l5_avx2(w, u, v, mld_qdata);`
`251`	`246`	`return MLD_NATIVE_FUNC_SUCCESS;`
`252`	`247`	`}`
`253`	`248`
`@@ -259,8 +254,7 @@ static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(`
`259`	`254`	`{`
`260`	`255`	`return MLD_NATIVE_FUNC_FALLBACK;`
`261`	`256`	`}`
`262`		`- mld_pointwise_acc_l7_avx2((__m256i *)w, (const __m256i *)u,`
`263`		`- (const __m256i *)v, mld_qdata.vec);`
	`257`	`+ mld_pointwise_acc_l7_avx2(w, u, v, mld_qdata);`
`264`	`258`	`return MLD_NATIVE_FUNC_SUCCESS;`
`265`	`259`	`}`
`266`	`260`
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@`
`37`	`37`	`* separate argument that may be aliased with either of the outputs.`
`38`	`38`	`* Removing the aliasing eases CBMC proofs.`
`39`	`39`	`*/`
`40`		`-void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)`
	`40`	`+void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0)`
`41`	`41`	`{`
`42`	`42`	`unsigned int i;`
`43`	`43`	`__m256i f, f0, f1, t;`
`@@ -50,7 +50,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)`
`50`	`50`
`51`	`51`	`for (i = 0; i < MLDSA_N / 8; i++)`
`52`	`52`	`{`
`53`		`- f = _mm256_load_si256(&a0[i]);`
	`53`	`+ f = _mm256_load_si256((__m256i *)&a0[8 * i]);`
`54`	`54`
`55`	`55`	`/* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */`
`56`	`56`	`/*`
`@@ -136,8 +136,8 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)`
`136`	`136`	`f0 = _mm256_add_epi32(f0, t);`
`137`	`137`	`/* range: 0 <= f1 <= 15, -GAMMA2 <= f0 <= GAMMA2 */`
`138`	`138`
`139`		`- _mm256_store_si256(&a1[i], f1);`
`140`		`- _mm256_store_si256(&a0[i], f0);`
	`139`	`+ _mm256_store_si256((__m256i *)&a1[8 * i], f1);`
	`140`	`+ _mm256_store_si256((__m256i *)&a0[8 * i], f0);`
`141`	`141`	`}`
`142`	`142`	`}`
`143`	`143`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@`
`38`	`38`	`* Removing the aliasing eases CBMC proofs.`
`39`	`39`	`*/`
`40`	`40`
`41`		`-void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)`
	`41`	`+void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0)`
`42`	`42`	`{`
`43`	`43`	`unsigned int i;`
`44`	`44`	`__m256i f, f0, f1, t;`
`@@ -51,7 +51,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)`
`51`	`51`
`52`	`52`	`for (i = 0; i < MLDSA_N / 8; i++)`
`53`	`53`	`{`
`54`		`- f = _mm256_load_si256(&a0[i]);`
	`54`	`+ f = _mm256_load_si256((__m256i *)&a0[8 * i]);`
`55`	`55`
`56`	`56`	`/* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */`
`57`	`57`	`/*`
`@@ -137,8 +137,8 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)`
`137`	`137`	`f0 = _mm256_add_epi32(f0, t);`
`138`	`138`	`/* range: 0 <= f1 <= 43, -GAMMA2 <= f0 <= GAMMA2 */`
`139`	`139`
`140`		`- _mm256_store_si256(&a1[i], f1);`
`141`		`- _mm256_store_si256(&a0[i], f0);`
	`140`	`+ _mm256_store_si256((__m256i *)&a1[8 * i], f1);`
	`141`	`+ _mm256_store_si256((__m256i *)&a0[8 * i], f0);`
`142`	`142`	`}`
`143`	`143`	`}`
`144`	`144`	`#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \`