1 change: 1 addition & 0 deletions mldsa/mldsa_native.c
@@ -257,6 +257,7 @@
#undef mld_polyvec_matrix_pointwise_montgomery
#undef mld_polyveck
#undef mld_polyveck_add
#undef mld_polyveck_add_error
#undef mld_polyveck_caddq
#undef mld_polyveck_chknorm
#undef mld_polyveck_decompose
221 changes: 150 additions & 71 deletions mldsa/src/polyvec.c
@@ -21,17 +21,47 @@
#include "poly_kl.h"
#include "polyvec.h"

#if !defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
/* This namespacing is not done at the top to avoid a naming conflict
* with native backends, which are currently not yet namespaced. */
#define mld_poly_permute_bitrev_to_custom \
MLD_NAMESPACE_KL(mld_poly_permute_bitrev_to_custom)
#define mld_matrix_permute_bitrev_to_custom \
MLD_NAMESPACE_KL(mld_matrix_permute_bitrev_to_custom)

/* Helper function to ensure that the polynomial entries in the output
* of mld_polyvec_matrix_expand use the coefficient ordering expected by
* the NTT backend rather than the standard (bitreversed) ordering.
* No-op unless a native backend with a custom ordering is used.
*/
static void mld_matrix_permute_bitrev_to_custom(mld_polyvecl mat[MLDSA_K])
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of
* mld_polyvec_matrix_expand.
*/
requires(memory_no_alias(mat, MLDSA_K * sizeof(mld_polyvecl)))
requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
assigns(object_whole(mat))
ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
)
{
#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
unsigned int i, j;
for (i = 0; i < MLDSA_K; i++)
{
for (j = 0; j < MLDSA_L; j++)
{
mld_poly_permute_bitrev_to_custom(mat[i].vec[j].coeffs);
}
}

#else /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */

/* Nothing to do */
((void)mat);

static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N])
{
((void)data);
}
#endif /* !MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
}


MLD_INTERNAL_API
@@ -45,20 +75,12 @@ void mld_polyvec_matrix_expand(mld_polyvecl mat[MLDSA_K],
* of the same parent object.
*/

MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];

for (j = 0; j < 4; j++)
__loop__(
assigns(j, object_whole(seed_ext))
invariant(j <= 4)
)
{
mld_memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES);
}
MLD_ALIGN uint8_t single_seed[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
MLD_ALIGN uint8_t batched_seeds[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
/* Sample 4 matrix entries at a time. */
for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4)
__loop__(
assigns(i, j, object_whole(seed_ext), memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
assigns(i, j, object_whole(batched_seeds), memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
invariant(i <= (MLDSA_K * MLDSA_L / 4) * 4 && i % 4 == 0)
/* vectors 0 .. i / MLDSA_L are completely sampled */
invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
@@ -70,28 +92,31 @@ void mld_polyvec_matrix_expand(mld_polyvecl mat[MLDSA_K],
{
for (j = 0; j < 4; j++)
__loop__(
assigns(j, object_whole(seed_ext))
assigns(j, object_whole(batched_seeds))
invariant(j <= 4)
)
{
uint8_t x = (uint8_t)((i + j) / MLDSA_L);
uint8_t y = (uint8_t)((i + j) % MLDSA_L);

seed_ext[j][MLDSA_SEEDBYTES + 0] = y;
seed_ext[j][MLDSA_SEEDBYTES + 1] = x;
mld_memcpy(batched_seeds[j], rho, MLDSA_SEEDBYTES);
batched_seeds[j][MLDSA_SEEDBYTES + 0] = y;
batched_seeds[j][MLDSA_SEEDBYTES + 1] = x;
}

mld_poly_uniform_4x(&mat[i / MLDSA_L].vec[i % MLDSA_L],
&mat[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L],
&mat[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L],
&mat[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L],
seed_ext);
batched_seeds);
}

mld_memcpy(single_seed, rho, MLDSA_SEEDBYTES);

/* For MLDSA_K=6, MLDSA_L=5, process the last two entries individually */
while (i < MLDSA_K * MLDSA_L)
__loop__(
assigns(i, object_whole(seed_ext), memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
assigns(i, object_whole(single_seed), memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
invariant(i <= MLDSA_K * MLDSA_L)
/* vectors 0 .. i / MLDSA_L are completely sampled */
invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
@@ -105,27 +130,17 @@ void mld_polyvec_matrix_expand(mld_polyvecl mat[MLDSA_K],
uint8_t y = (uint8_t)(i % MLDSA_L);
mld_poly *this_poly = &mat[i / MLDSA_L].vec[i % MLDSA_L];

seed_ext[0][MLDSA_SEEDBYTES + 0] = y;
seed_ext[0][MLDSA_SEEDBYTES + 1] = x;
single_seed[MLDSA_SEEDBYTES + 0] = y;
single_seed[MLDSA_SEEDBYTES + 1] = x;

mld_poly_uniform(this_poly, seed_ext[0]);
mld_poly_uniform(this_poly, single_seed);
i++;
}

/*
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
for (i = 0; i < MLDSA_K; i++)
{
for (j = 0; j < MLDSA_L; j++)
{
mld_poly_permute_bitrev_to_custom(mat[i].vec[j].coeffs);
}
}
mld_matrix_permute_bitrev_to_custom(mat);

/* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
mld_zeroize(seed_ext, sizeof(seed_ext));
mld_zeroize(single_seed, sizeof(single_seed));
}
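
To make the batching schedule above concrete: the matrix entries are indexed row-major by i, and the column index y = i % MLDSA_L and row index x = i / MLDSA_L are the two domain-separation bytes appended to rho. The following standalone sketch is illustrative only and not part of this change; its constants are hard-coded for ML-DSA-65 (K = 6, L = 5), where 28 of the 30 entries fall into seven 4-way batches and the final two are sampled individually by the trailing loop.

/* Standalone illustration of the batching schedule used by
 * mld_polyvec_matrix_expand; constants hard-coded for ML-DSA-65. */
#include <stdio.h>

#define K 6
#define L 5

int main(void)
{
  unsigned int i, j;

  /* 4-way batches: cover i = 0 .. 27 */
  for (i = 0; i < (K * L / 4) * 4; i += 4)
  {
    printf("batch:");
    for (j = 0; j < 4; j++)
    {
      /* The two domain-separation bytes appended to rho are (y, x). */
      unsigned int x = (i + j) / L, y = (i + j) % L;
      printf(" A[%u][%u]", x, y);
    }
    printf("\n");
  }

  /* Tail: the remaining K*L mod 4 = 2 entries, sampled one at a time. */
  while (i < K * L)
  {
    printf("single: A[%u][%u]\n", i / L, i % L);
    i++;
  }
  return 0;
}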

MLD_INTERNAL_API
@@ -161,7 +176,7 @@ void mld_polyvecl_uniform_gamma1(mld_polyvecl *v,
/* Safety: nonce is at most ((UINT16_MAX - MLDSA_L) / MLDSA_L), and, hence,
* this cast is safe. See NONCE_UB comment in sign.c. */
nonce = (uint16_t)(MLDSA_L * nonce);
/* Now, nonce <= UINT16_MAX - (MLDSA_L - 1), so the casts below are safe. */
#if MLDSA_L == 4
mld_poly_uniform_gamma1_4x(&v->vec[0], &v->vec[1], &v->vec[2], &v->vec[3],
seed, nonce, (uint16_t)(nonce + 1),
@@ -219,7 +234,6 @@ void mld_polyvecl_add(mld_polyvecl *u, const mld_polyvecl *v)
invariant(i <= MLDSA_L)
invariant(forall(k0, i, MLDSA_L,
forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1])))
invariant(forall(k4, 0, i, forall(k5, 0, MLDSA_N, u->vec[k4].coeffs[k5] == loop_entry(*u).vec[k4].coeffs[k5] + v->vec[k4].coeffs[k5])))
invariant(forall(k6, 0, i, array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
)
{
@@ -287,87 +301,131 @@ void mld_polyvecl_pointwise_poly_montgomery(mld_polyvecl *r, const mld_poly *a,
mld_assert_abs_bound_2d(r->vec, MLDSA_L, MLDSA_N, MLDSA_Q);
}

#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
MLD_CONFIG_PARAMETER_SET == 44

MLD_INTERNAL_API
void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
const mld_polyvecl *v)
{
#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
MLD_CONFIG_PARAMETER_SET == 44
/* TODO: proof */
mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
mld_polyvecl_pointwise_acc_montgomery_l4_native(
w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
(const int32_t(*)[MLDSA_N])v->vec);
mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
}

#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) && \
MLD_CONFIG_PARAMETER_SET == 65

void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
const mld_polyvecl *v)
{
/* TODO: proof */
mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
mld_polyvecl_pointwise_acc_montgomery_l5_native(
w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
(const int32_t(*)[MLDSA_N])v->vec);
mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
}

#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) && \
MLD_CONFIG_PARAMETER_SET == 87
void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
const mld_polyvecl *v)
{
/* TODO: proof */
mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
mld_polyvecl_pointwise_acc_montgomery_l7_native(
w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
(const int32_t(*)[MLDSA_N])v->vec);
mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
#else /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
MLD_CONFIG_PARAMETER_SET == 44) && \
!(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
MLD_CONFIG_PARAMETER_SET == 65) && \
MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 && \
MLD_CONFIG_PARAMETER_SET == 87 */
unsigned int i, j;
mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
/* The first input is bounded by [0, Q-1] inclusive
* The second input is bounded by [-9Q+1, 9Q-1] inclusive . Hence, we can
}

#else /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
MLD_CONFIG_PARAMETER_SET == 44) && \
!(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
MLD_CONFIG_PARAMETER_SET == 65) && \
!(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 && \
MLD_CONFIG_PARAMETER_SET == 87) */

#define mld_pointwise_sum_of_products \
MLD_NAMESPACE_KL(mld_pointwise_sum_of_products)
static int64_t mld_pointwise_sum_of_products(const mld_polyvecl *u,
const mld_polyvecl *v,
unsigned int i)
__contract__(
requires(memory_no_alias(u, sizeof(mld_polyvecl)))
requires(memory_no_alias(v, sizeof(mld_polyvecl)))
requires(i < MLDSA_N)
requires(forall(l0, 0, MLDSA_L,
array_bound(u->vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
requires(forall(l1, 0, MLDSA_L,
array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
ensures(return_value >= -(int64_t) MLDSA_L*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
ensures(return_value <= (int64_t) MLDSA_L*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
)
{
/* Input vector u is bounded by [0, Q-1] inclusive
* Input vector v is bounded by [-9Q+1, 9Q-1] inclusive. Hence, we can
* safely accumulate in 64 bits without intermediate reductions as
* MLDSA_L * (MLD_NTT_BOUND-1) * (Q-1) < INT64_MAX
*
* The worst case is ML-DSA-87: 7 * (9Q-1) * (Q-1) < 2**52
* (and likewise for negative values)
*/

int64_t t = 0;
unsigned int j;
for (j = 0; j < MLDSA_L; j++)
__loop__(
assigns(j, t)
invariant(j <= MLDSA_L)
invariant(t >= -(int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
invariant(t <= (int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
)
{
const int64_t u64 = (int64_t)u->vec[j].coeffs[i];
const int64_t v64 = (int64_t)v->vec[j].coeffs[i];
/* Helper assertions for proof efficiency. Do not remove */
mld_assert(u64 >= 0 && u64 < MLDSA_Q);
mld_assert(v64 > -MLD_NTT_BOUND && v64 < MLD_NTT_BOUND);
t += (u64 * v64);
}
return t;
}

void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
const mld_polyvecl *v)
{
unsigned int i;

mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
for (i = 0; i < MLDSA_N; i++)
__loop__(
assigns(i, j, object_whole(w))
assigns(i, object_whole(w))
invariant(i <= MLDSA_N)
invariant(array_abs_bound(w->coeffs, 0, i, MLDSA_Q))
)
{
int64_t t = 0;
int32_t r;
for (j = 0; j < MLDSA_L; j++)
__loop__(
assigns(j, t)
invariant(j <= MLDSA_L)
invariant(t >= -(int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
invariant(t <= (int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
)
{
t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i];
}

r = mld_montgomery_reduce(t);
w->coeffs[i] = r;
w->coeffs[i] =
mld_montgomery_reduce(mld_pointwise_sum_of_products(u, v, i));
}

mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
}

#endif /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
MLD_CONFIG_PARAMETER_SET == 44) && \
!(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
MLD_CONFIG_PARAMETER_SET == 65) && \
!(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 && \
MLD_CONFIG_PARAMETER_SET == 87) */
}
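
As a quick sanity check of the overflow argument in mld_pointwise_sum_of_products, the following standalone snippet (illustrative only; it assumes MLD_NTT_BOUND = 9*MLDSA_Q, as implied by the bound comment above) evaluates the worst-case ML-DSA-87 accumulator magnitude 7*(9Q-1)*(Q-1) and confirms it lies below 2^52, and therefore far below INT64_MAX.

/* Standalone check of the accumulation bound used in
 * mld_pointwise_sum_of_products. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const int64_t q = 8380417;        /* MLDSA_Q */
  const int64_t ntt_bound = 9 * q;  /* MLD_NTT_BOUND, assumed to be 9*Q */
  const int64_t l = 7;              /* MLDSA_L for ML-DSA-87 */
  const int64_t worst = l * (ntt_bound - 1) * (q - 1); /* ~4.42e15 */

  printf("worst case: %" PRId64 "\n", worst);
  printf("2^52:       %" PRId64 "\n", (int64_t)1 << 52); /* ~4.50e15 */
  printf("bound holds: %d\n", worst < ((int64_t)1 << 52)); /* prints 1 */
  return 0;
}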

MLD_INTERNAL_API
uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t bound)
@@ -450,7 +508,6 @@ void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v)
invariant(i <= MLDSA_K)
invariant(forall(k0, i, MLDSA_K,
forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1])))
invariant(forall(k4, 0, i, forall(k5, 0, MLDSA_N, u->vec[k4].coeffs[k5] == loop_entry(*u).vec[k4].coeffs[k5] + v->vec[k4].coeffs[k5])))
invariant(forall(k6, 0, i, array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
)
{
@@ -459,6 +516,28 @@ void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v)
mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
}

/* Reference: We use a destructive version (output = first input) to avoid
* reasoning about aliasing in the CBMC specification. */
MLD_INTERNAL_API
void mld_polyveck_add_error(mld_polyveck *u, const mld_polyveck *v)
{
unsigned int i;

for (i = 0; i < MLDSA_K; ++i)
__loop__(
assigns(i, memory_slice(u, sizeof(mld_polyveck)))
invariant(i <= MLDSA_K)
invariant(forall(k0, i, MLDSA_K,
forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1])))
invariant(forall(k6, 0, i, array_abs_bound(u->vec[k6].coeffs, 0, MLDSA_N, MLDSA_Q)))
)
{
mld_poly_add(&u->vec[i], &v->vec[i]);
}
mld_assert_abs_bound_2d(u->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
}
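
For contrast with the destructive signature chosen above, the sketch below shows what a hypothetical non-destructive variant would look like. The function name and contract are illustrative only and not part of the code base; they reuse the contract macros already used in this file. With three pointers, the specification has to establish that the output object is disjoint from both inputs, whereas the in-place u += v form keeps the contract down to two disjoint objects.

/* Hypothetical three-operand variant, for illustration only. */
static void mld_polyveck_add_3op(mld_polyveck *r, const mld_polyveck *u,
                                 const mld_polyveck *v)
__contract__(
  /* The output buffer must be a separate object from both inputs, and the
   * proof must track all three objects and their bounds separately. */
  requires(memory_no_alias(r, sizeof(mld_polyveck)))
  requires(memory_no_alias(u, sizeof(mld_polyveck)))
  requires(memory_no_alias(v, sizeof(mld_polyveck)))
  assigns(object_whole(r))
);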


MLD_INTERNAL_API
void mld_polyveck_sub(mld_polyveck *u, const mld_polyveck *v)
{