diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h index 9f4a01835..37cad88b8 100644 --- a/dev/aarch64_clean/meta.h +++ b/dev/aarch64_clean/meta.h @@ -111,17 +111,15 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return (int)outlen; } -static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { - mld_poly_decompose_32_asm(a1, a0, a); + mld_poly_decompose_32_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } -static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { - mld_poly_decompose_88_asm(a1, a0, a); + mld_poly_decompose_88_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/dev/aarch64_clean/src/arith_native_aarch64.h b/dev/aarch64_clean/src/arith_native_aarch64.h index 22f193585..dccf270ab 100644 --- a/dev/aarch64_clean/src/arith_native_aarch64.h +++ b/dev/aarch64_clean/src/arith_native_aarch64.h @@ -68,10 +68,10 @@ unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf, unsigned buflen, const uint8_t *table); #define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm) -void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a); +void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0); #define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm) -void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a); +void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0); #define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm) void mld_poly_caddq_asm(int32_t *a); diff --git a/dev/aarch64_clean/src/poly_decompose_32_asm.S b/dev/aarch64_clean/src/poly_decompose_32_asm.S index aa3896199..6e9b7138c 100644 --- a/dev/aarch64_clean/src/poly_decompose_32_asm.S +++ b/dev/aarch64_clean/src/poly_decompose_32_asm.S @@ -40,8 +40,7 @@ /* Parameters */ a1_ptr .req x0 // Output polynomial with coefficients c1 - a0_ptr .req x1 // Output polynomial with coefficients c0 - a_ptr .req x2 // Input polynomial + a0_ptr .req x1 // Input/Output polynomial with coefficients c0 count .req x3 @@ -76,10 +75,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) mov count, #(64/4) poly_decompose_32_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 + ldr q0, [a0_ptr, #0*16] + ldr q1, [a0_ptr, #1*16] + ldr q2, [a0_ptr, #2*16] + ldr q3, [a0_ptr, #3*16] decompose32 v5, v1, v24 decompose32 v6, v2, v24 @@ -102,7 +101,6 @@ poly_decompose_32_loop: .unreq a1_ptr .unreq a0_ptr - .unreq a_ptr .unreq count .unreq q .unreq q_bound diff --git a/dev/aarch64_clean/src/poly_decompose_88_asm.S b/dev/aarch64_clean/src/poly_decompose_88_asm.S index 4152b1eab..97d56ae0b 100644 --- a/dev/aarch64_clean/src/poly_decompose_88_asm.S +++ b/dev/aarch64_clean/src/poly_decompose_88_asm.S @@ -40,8 +40,7 @@ /* Parameters */ a1_ptr .req x0 // Output polynomial with coefficients c1 - a0_ptr .req x1 // Output polynomial with coefficients c0 - a_ptr .req x2 // Input polynomial + a0_ptr .req x1 // Input/Output polynomial with coefficients c0 count .req x3 @@ -74,10 +73,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) mov count, #(64/4) poly_decompose_88_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 + ldr q0, [a0_ptr, #0*16] + ldr q1, [a0_ptr, #1*16] + ldr q2, [a0_ptr, #2*16] + ldr q3, [a0_ptr, #3*16] decompose88 v5, v1, v24 decompose88 v6, v2, v24 @@ -100,7 +99,6 @@ poly_decompose_88_loop: .unreq a1_ptr .unreq a0_ptr - .unreq a_ptr .unreq count .unreq q .unreq q_bound diff --git a/dev/aarch64_opt/meta.h b/dev/aarch64_opt/meta.h index 9f4a01835..37cad88b8 100644 --- a/dev/aarch64_opt/meta.h +++ b/dev/aarch64_opt/meta.h @@ -111,17 +111,15 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return (int)outlen; } -static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { - mld_poly_decompose_32_asm(a1, a0, a); + mld_poly_decompose_32_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } -static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { - mld_poly_decompose_88_asm(a1, a0, a); + mld_poly_decompose_88_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/dev/aarch64_opt/src/arith_native_aarch64.h b/dev/aarch64_opt/src/arith_native_aarch64.h index 22f193585..dccf270ab 100644 --- a/dev/aarch64_opt/src/arith_native_aarch64.h +++ b/dev/aarch64_opt/src/arith_native_aarch64.h @@ -68,10 +68,10 @@ unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf, unsigned buflen, const uint8_t *table); #define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm) -void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a); +void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0); #define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm) -void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a); +void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0); #define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm) void mld_poly_caddq_asm(int32_t *a); diff --git a/dev/aarch64_opt/src/poly_decompose_32_asm.S b/dev/aarch64_opt/src/poly_decompose_32_asm.S index aa3896199..6e9b7138c 100644 --- a/dev/aarch64_opt/src/poly_decompose_32_asm.S +++ b/dev/aarch64_opt/src/poly_decompose_32_asm.S @@ -40,8 +40,7 @@ /* Parameters */ a1_ptr .req x0 // Output polynomial with coefficients c1 - a0_ptr .req x1 // Output polynomial with coefficients c0 - a_ptr .req x2 // Input polynomial + a0_ptr .req x1 // Input/Output polynomial with coefficients c0 count .req x3 @@ -76,10 +75,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) mov count, #(64/4) poly_decompose_32_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 + ldr q0, [a0_ptr, #0*16] + ldr q1, [a0_ptr, #1*16] + ldr q2, [a0_ptr, #2*16] + ldr q3, [a0_ptr, #3*16] decompose32 v5, v1, v24 decompose32 v6, v2, v24 @@ -102,7 +101,6 @@ poly_decompose_32_loop: .unreq a1_ptr .unreq a0_ptr - .unreq a_ptr .unreq count .unreq q .unreq q_bound diff --git a/dev/aarch64_opt/src/poly_decompose_88_asm.S b/dev/aarch64_opt/src/poly_decompose_88_asm.S index 4152b1eab..97d56ae0b 100644 --- a/dev/aarch64_opt/src/poly_decompose_88_asm.S +++ b/dev/aarch64_opt/src/poly_decompose_88_asm.S @@ -40,8 +40,7 @@ /* Parameters */ a1_ptr .req x0 // Output polynomial with coefficients c1 - a0_ptr .req x1 // Output polynomial with coefficients c0 - a_ptr .req x2 // Input polynomial + a0_ptr .req x1 // Input/Output polynomial with coefficients c0 count .req x3 @@ -74,10 +73,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) mov count, #(64/4) poly_decompose_88_loop: - ldr q1, [a_ptr, #1*16] - ldr q2, [a_ptr, #2*16] - ldr q3, [a_ptr, #3*16] - ldr q0, [a_ptr], #4*16 + ldr q0, [a0_ptr, #0*16] + ldr q1, [a0_ptr, #1*16] + ldr q2, [a0_ptr, #2*16] + ldr q3, [a0_ptr, #3*16] decompose88 v5, v1, v24 decompose88 v6, v2, v24 @@ -100,7 +99,6 @@ poly_decompose_88_loop: .unreq a1_ptr .unreq a0_ptr - .unreq a_ptr .unreq count .unreq q .unreq q_bound diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h index 7978443f4..fb661c4a3 100644 --- a/dev/x86_64/meta.h +++ b/dev/x86_64/meta.h @@ -131,25 +131,23 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return (int)outlen; } -static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); + mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0); return MLD_NATIVE_FUNC_SUCCESS; } -static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); + mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h index fd59a21e0..1cf36a0fd 100644 --- a/dev/x86_64/src/arith_native_x86_64.h +++ b/dev/x86_64/src/arith_native_x86_64.h @@ -55,10 +55,10 @@ unsigned mld_rej_uniform_eta4_avx2( int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]); #define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2) -void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a); +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0); #define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) -void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a); +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0); #define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2) void mld_poly_caddq_avx2(int32_t *r); diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c index c293836b3..2cab13c1c 100644 --- a/dev/x86_64/src/poly_decompose_32_avx2.c +++ b/dev/x86_64/src/poly_decompose_32_avx2.c @@ -32,7 +32,12 @@ #include "arith_native_x86_64.h" #include "consts.h" -void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +/* + * Reference: The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. + * Removing the aliasing eases CBMC proofs. + */ +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0) { unsigned int i; __m256i f, f0, f1, t; @@ -45,7 +50,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) for (i = 0; i < MLDSA_N / 8; i++) { - f = _mm256_load_si256(&a[i]); + f = _mm256_load_si256(&a0[i]); /* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */ /* diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c index 133dd80d0..397bde3e2 100644 --- a/dev/x86_64/src/poly_decompose_88_avx2.c +++ b/dev/x86_64/src/poly_decompose_88_avx2.c @@ -32,7 +32,13 @@ #include "arith_native_x86_64.h" #include "consts.h" -void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +/* + * Reference: The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. + * Removing the aliasing eases CBMC proofs. + */ + +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0) { unsigned int i; __m256i f, f0, f1, t; @@ -45,7 +51,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) for (i = 0; i < MLDSA_N / 8; i++) { - f = _mm256_load_si256(&a[i]); + f = _mm256_load_si256(&a0[i]); /* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */ /* diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h index 9f4a01835..37cad88b8 100644 --- a/mldsa/src/native/aarch64/meta.h +++ b/mldsa/src/native/aarch64/meta.h @@ -111,17 +111,15 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return (int)outlen; } -static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { - mld_poly_decompose_32_asm(a1, a0, a); + mld_poly_decompose_32_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } -static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { - mld_poly_decompose_88_asm(a1, a0, a); + mld_poly_decompose_88_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/mldsa/src/native/aarch64/src/arith_native_aarch64.h b/mldsa/src/native/aarch64/src/arith_native_aarch64.h index 22f193585..dccf270ab 100644 --- a/mldsa/src/native/aarch64/src/arith_native_aarch64.h +++ b/mldsa/src/native/aarch64/src/arith_native_aarch64.h @@ -68,10 +68,10 @@ unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf, unsigned buflen, const uint8_t *table); #define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm) -void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a); +void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0); #define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm) -void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a); +void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0); #define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm) void mld_poly_caddq_asm(int32_t *a); diff --git a/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S index dc22b8234..0824cbeb9 100644 --- a/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +++ b/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S @@ -36,10 +36,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) mov x3, #0x10 // =16 Lpoly_decompose_32_loop: - ldr q1, [x2, #0x10] - ldr q2, [x2, #0x20] - ldr q3, [x2, #0x30] - ldr q0, [x2], #0x40 + ldr q0, [x1] + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] sqdmulh v5.4s, v1.4s, v23.4s srshr v5.4s, v5.4s, #0x12 cmgt v24.4s, v1.4s, v21.4s diff --git a/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S b/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S index 86a0acbd2..1e02cdabd 100644 --- a/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +++ b/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S @@ -36,10 +36,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) mov x3, #0x10 // =16 Lpoly_decompose_88_loop: - ldr q1, [x2, #0x10] - ldr q2, [x2, #0x20] - ldr q3, [x2, #0x30] - ldr q0, [x2], #0x40 + ldr q0, [x1] + ldr q1, [x1, #0x10] + ldr q2, [x1, #0x20] + ldr q3, [x1, #0x30] sqdmulh v5.4s, v1.4s, v23.4s srshr v5.4s, v5.4s, #0x11 cmgt v24.4s, v1.4s, v21.4s diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h index 4e033ca19..e210ff2a4 100644 --- a/mldsa/src/native/api.h +++ b/mldsa/src/native/api.h @@ -260,23 +260,21 @@ __contract__( * Assumes coefficients to be standard representatives. * * Arguments: - int32_t *a1: output polynomial with coefficients c1 - * - int32_t *a0: output polynomial with coefficients c0 - * - const int32_t *a: input polynomial + * - int32_t *a0: input/output polynomial. + * Output has coefficients c0 **************************************************/ -static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) __contract__( requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) - requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) - requires(array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a1, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(a0, 0, MLDSA_N, MLDSA_GAMMA2+1)) - ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) - ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a, MLDSA_N)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a0, MLDSA_N)) ); #endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_32 */ @@ -294,23 +292,21 @@ __contract__( * Assumes coefficients to be standard representatives. * * Arguments: - int32_t *a1: output polynomial with coefficients c1 - * - int32_t *a0: output polynomial with coefficients c0 - * - const int32_t *a: input polynomial + * - int32_t *a0: output polynomial with coefficients c0. + * Output has coefficients c0 **************************************************/ -static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) __contract__( requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) - requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) - requires(array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a1, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(a0, 0, MLDSA_N, MLDSA_GAMMA2+1)) - ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) - ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a, MLDSA_N)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a0, MLDSA_N)) ); #endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_88 */ diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h index 7978443f4..fb661c4a3 100644 --- a/mldsa/src/native/x86_64/meta.h +++ b/mldsa/src/native/x86_64/meta.h @@ -131,25 +131,23 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return (int)outlen; } -static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); + mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0); return MLD_NATIVE_FUNC_SUCCESS; } -static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, - const int32_t *a) +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); + mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h index fd59a21e0..1cf36a0fd 100644 --- a/mldsa/src/native/x86_64/src/arith_native_x86_64.h +++ b/mldsa/src/native/x86_64/src/arith_native_x86_64.h @@ -55,10 +55,10 @@ unsigned mld_rej_uniform_eta4_avx2( int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]); #define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2) -void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a); +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0); #define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) -void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a); +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0); #define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2) void mld_poly_caddq_avx2(int32_t *r); diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c index c293836b3..2cab13c1c 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c @@ -32,7 +32,12 @@ #include "arith_native_x86_64.h" #include "consts.h" -void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +/* + * Reference: The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. + * Removing the aliasing eases CBMC proofs. + */ +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0) { unsigned int i; __m256i f, f0, f1, t; @@ -45,7 +50,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) for (i = 0; i < MLDSA_N / 8; i++) { - f = _mm256_load_si256(&a[i]); + f = _mm256_load_si256(&a0[i]); /* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */ /* diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c index 133dd80d0..397bde3e2 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c @@ -32,7 +32,13 @@ #include "arith_native_x86_64.h" #include "consts.h" -void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +/* + * Reference: The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. + * Removing the aliasing eases CBMC proofs. + */ + +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0) { unsigned int i; __m256i f, f0, f1, t; @@ -45,7 +51,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) for (i = 0; i < MLDSA_N / 8; i++) { - f = _mm256_load_si256(&a[i]); + f = _mm256_load_si256(&a0[i]); /* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */ /* diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c index bf481b527..7a319ca51 100644 --- a/mldsa/src/poly_kl.c +++ b/mldsa/src/poly_kl.c @@ -39,13 +39,12 @@ /* End of parameter set namespacing */ -MLD_STATIC_TESTABLE void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0, - const mld_poly *a) +MLD_STATIC_TESTABLE +void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0) __contract__( requires(memory_no_alias(a1, sizeof(mld_poly))) requires(memory_no_alias(a0, sizeof(mld_poly))) - requires(memory_no_alias(a, sizeof(mld_poly))) - requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) + requires(array_bound(a0->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) assigns(memory_slice(a1, sizeof(mld_poly))) assigns(memory_slice(a0, sizeof(mld_poly))) ensures(array_bound(a1->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) @@ -53,16 +52,17 @@ __contract__( ) { unsigned int i; - mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); + mld_assert_bound(a0->coeffs, MLDSA_N, 0, MLDSA_Q); for (i = 0; i < MLDSA_N; ++i) __loop__( assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly))) invariant(i <= MLDSA_N) + invariant(array_bound(a0->coeffs, i, MLDSA_N, 0, MLDSA_Q)) invariant(array_bound(a1->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) invariant(array_abs_bound(a0->coeffs, 0, i, MLDSA_GAMMA2+1)) ) { - mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]); + mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a0->coeffs[i]); } mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1); @@ -70,12 +70,12 @@ __contract__( } MLD_INTERNAL_API -void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a) +void mld_poly_decompose(mld_poly *a1, mld_poly *a0) { #if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) && MLD_CONFIG_PARAMETER_SET == 44 int ret; - mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); - ret = mld_poly_decompose_88_native(a1->coeffs, a0->coeffs, a->coeffs); + mld_assert_bound(a0->coeffs, MLDSA_N, 0, MLDSA_Q); + ret = mld_poly_decompose_88_native(a1->coeffs, a0->coeffs); if (ret == MLD_NATIVE_FUNC_SUCCESS) { mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1); @@ -86,8 +86,8 @@ void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a) #elif defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) && \ (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) int ret; - mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); - ret = mld_poly_decompose_32_native(a1->coeffs, a0->coeffs, a->coeffs); + mld_assert_bound(a0->coeffs, MLDSA_N, 0, MLDSA_Q); + ret = mld_poly_decompose_32_native(a1->coeffs, a0->coeffs); if (ret == MLD_NATIVE_FUNC_SUCCESS) { mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1); @@ -98,7 +98,7 @@ void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a) #endif /* !(MLD_USE_NATIVE_POLY_DECOMPOSE_88 && MLD_CONFIG_PARAMETER_SET == \ 44) && MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLD_CONFIG_PARAMETER_SET \ == 65 || MLD_CONFIG_PARAMETER_SET == 87) */ - mld_poly_decompose_c(a1, a0, a); + mld_poly_decompose_c(a1, a0); } MLD_INTERNAL_API diff --git a/mldsa/src/poly_kl.h b/mldsa/src/poly_kl.h index 23c525efe..c6aa49835 100644 --- a/mldsa/src/poly_kl.h +++ b/mldsa/src/poly_kl.h @@ -21,18 +21,21 @@ * Assumes coefficients to be standard representatives. * * Arguments: - mld_poly *a1: pointer to output polynomial with coefficients - *c1 - * - mld_poly *a0: pointer to output polynomial with coefficients - *c0 - * - const mld_poly *a: pointer to input polynomial + * c1 + * - mld_poly *a0: pointer to input/output polynomial. Output + * polynomial has coefficients c0 + * + * Reference: The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. + * Removing the aliasing eases CBMC proofs. + * **************************************************/ MLD_INTERNAL_API -void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a) +void mld_poly_decompose(mld_poly *a1, mld_poly *a0) __contract__( requires(memory_no_alias(a1, sizeof(mld_poly))) requires(memory_no_alias(a0, sizeof(mld_poly))) - requires(memory_no_alias(a, sizeof(mld_poly))) - requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) + requires(array_bound(a0->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) assigns(memory_slice(a1, sizeof(mld_poly))) assigns(memory_slice(a0, sizeof(mld_poly))) ensures(array_bound(a1->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) diff --git a/mldsa/src/polyvec.c b/mldsa/src/polyvec.c index 56a707fe3..05c67d9ba 100644 --- a/mldsa/src/polyvec.c +++ b/mldsa/src/polyvec.c @@ -672,11 +672,10 @@ void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0, } MLD_INTERNAL_API -void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0, - const mld_polyveck *v) +void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0) { unsigned int i; - mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q); + mld_assert_bound_2d(v0->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q); for (i = 0; i < MLDSA_K; ++i) __loop__( @@ -686,9 +685,11 @@ void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0, array_bound(v1->vec[k1].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))) invariant(forall(k2, 0, i, array_abs_bound(v0->vec[k2].coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1))) + invariant(forall(k3, i, MLDSA_K, + array_bound(v0->vec[k3].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) ) { - mld_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + mld_poly_decompose(&v1->vec[i], &v0->vec[i]); } mld_assert_bound_2d(v1->vec, MLDSA_K, MLDSA_N, 0, diff --git a/mldsa/src/polyvec.h b/mldsa/src/polyvec.h index f8e1795b5..57f65fdd2 100644 --- a/mldsa/src/polyvec.h +++ b/mldsa/src/polyvec.h @@ -475,20 +475,23 @@ __contract__( * to be standard representatives. * * Arguments: - mld_polyveck *v1: pointer to output vector of polynomials with - * coefficients a1 - * - mld_polyveck *v0: pointer to output vector of polynomials with - * coefficients a0 - * - const mld_polyveck *v: pointer to input vector + * coefficients a1 + * - mld_polyveck *v0: pointer to input/output vector of + * polynomials with. Output polynomial has + * coefficients a0 + * + * Reference: The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. + * Removing the aliasing eases CBMC proofs. + * **************************************************/ MLD_INTERNAL_API -void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0, - const mld_polyveck *v) +void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0) __contract__( requires(memory_no_alias(v1, sizeof(mld_polyveck))) requires(memory_no_alias(v0, sizeof(mld_polyveck))) - requires(memory_no_alias(v, sizeof(mld_polyveck))) requires(forall(k0, 0, MLDSA_K, - array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) + array_bound(v0->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) assigns(memory_slice(v1, sizeof(mld_polyveck))) assigns(memory_slice(v0, sizeof(mld_polyveck))) ensures(forall(k1, 0, MLDSA_K, diff --git a/mldsa/src/sign.c b/mldsa/src/sign.c index 15cf261c1..5c79ccf1e 100644 --- a/mldsa/src/sign.c +++ b/mldsa/src/sign.c @@ -447,7 +447,7 @@ __contract__( MLD_ALIGN uint8_t challenge_bytes[MLDSA_CTILDEBYTES]; unsigned int n; mld_polyvecl y, z; - mld_polyveck w, w1, w0, h; + mld_polyveck w1, w0, h; mld_poly cp; uint32_t z_invalid, w0_invalid, h_invalid; int res; @@ -458,13 +458,13 @@ __contract__( /* Matrix-vector multiplication */ z = y; mld_polyvecl_ntt(&z); - mld_polyvec_matrix_pointwise_montgomery(&w, mat, &z); - mld_polyveck_reduce(&w); - mld_polyveck_invntt_tomont(&w); + mld_polyvec_matrix_pointwise_montgomery(&w0, mat, &z); + mld_polyveck_reduce(&w0); + mld_polyveck_invntt_tomont(&w0); /* Decompose w and call the random oracle */ - mld_polyveck_caddq(&w); - mld_polyveck_decompose(&w1, &w0, &w); + mld_polyveck_caddq(&w0); + mld_polyveck_decompose(&w1, &w0); mld_polyveck_pack_w1(sig, &w1); mld_H(challenge_bytes, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, sig, @@ -564,7 +564,6 @@ __contract__( mld_zeroize(challenge_bytes, MLDSA_CTILDEBYTES); mld_zeroize(&y, sizeof(y)); mld_zeroize(&z, sizeof(z)); - mld_zeroize(&w, sizeof(w)); mld_zeroize(&w1, sizeof(w1)); mld_zeroize(&w0, sizeof(w0)); mld_zeroize(&h, sizeof(h)); diff --git a/proofs/cbmc/poly_decompose/poly_decompose_harness.c b/proofs/cbmc/poly_decompose/poly_decompose_harness.c index 309afec1e..bd08c2090 100644 --- a/proofs/cbmc/poly_decompose/poly_decompose_harness.c +++ b/proofs/cbmc/poly_decompose/poly_decompose_harness.c @@ -6,6 +6,6 @@ void harness(void) { - mld_poly *a0, *a1, *a; - mld_poly_decompose(a1, a0, a); + mld_poly *a0, *a1; + mld_poly_decompose(a1, a0); } diff --git a/proofs/cbmc/poly_decompose_c/poly_decompose_c_harness.c b/proofs/cbmc/poly_decompose_c/poly_decompose_c_harness.c index 67e87f110..d5f71205e 100644 --- a/proofs/cbmc/poly_decompose_c/poly_decompose_c_harness.c +++ b/proofs/cbmc/poly_decompose_c/poly_decompose_c_harness.c @@ -5,10 +5,10 @@ // Prototype for the function under test #define mld_poly_decompose_c MLD_ADD_PARAM_SET(mld_poly_decompose_c) -void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0, mld_poly *a); +void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0); void harness(void) { - mld_poly *a0, *a1, *a; - mld_poly_decompose_c(a1, a0, a); + mld_poly *a0, *a1; + mld_poly_decompose_c(a1, a0); } diff --git a/proofs/cbmc/poly_decompose_native/poly_decompose_native_harness.c b/proofs/cbmc/poly_decompose_native/poly_decompose_native_harness.c index 309afec1e..bd08c2090 100644 --- a/proofs/cbmc/poly_decompose_native/poly_decompose_native_harness.c +++ b/proofs/cbmc/poly_decompose_native/poly_decompose_native_harness.c @@ -6,6 +6,6 @@ void harness(void) { - mld_poly *a0, *a1, *a; - mld_poly_decompose(a1, a0, a); + mld_poly *a0, *a1; + mld_poly_decompose(a1, a0); } diff --git a/proofs/cbmc/polyveck_decompose/polyveck_decompose_harness.c b/proofs/cbmc/polyveck_decompose/polyveck_decompose_harness.c index 6b011fa1e..68215df3a 100644 --- a/proofs/cbmc/polyveck_decompose/polyveck_decompose_harness.c +++ b/proofs/cbmc/polyveck_decompose/polyveck_decompose_harness.c @@ -5,6 +5,6 @@ void harness(void) { - mld_polyveck *a0, *a1, *a; - mld_polyveck_decompose(a1, a0, a); + mld_polyveck *a0, *a1; + mld_polyveck_decompose(a1, a0); } diff --git a/test/test_unit.c b/test/test_unit.c index f08b04c8f..48cae81f4 100644 --- a/test/test_unit.c +++ b/test/test_unit.c @@ -36,7 +36,7 @@ void mld_poly_ntt_c(mld_poly *a); void mld_poly_invntt_tomont_c(mld_poly *a); void mld_poly_caddq_c(mld_poly *a); -void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0, const mld_poly *a); +void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0); void mld_poly_use_hint_c(mld_poly *b, const mld_poly *a, const mld_poly *h); uint32_t mld_poly_chknorm_c(const mld_poly *a, int32_t B); void mld_poly_pointwise_montgomery_c(mld_poly *c, const mld_poly *a, @@ -247,8 +247,11 @@ static int test_poly_decompose_core(const mld_poly *input_poly, { mld_poly test_a1, test_a0, ref_a1, ref_a0; - mld_poly_decompose(&test_a1, &test_a0, input_poly); - mld_poly_decompose_c(&ref_a1, &ref_a0, input_poly); + mld_memcpy(&test_a0, input_poly, sizeof(mld_poly)); + mld_memcpy(&ref_a0, input_poly, sizeof(mld_poly)); + + mld_poly_decompose(&test_a1, &test_a0); + mld_poly_decompose_c(&ref_a1, &ref_a0); CHECK(compare_i32_arrays(test_a1.coeffs, ref_a1.coeffs, MLDSA_N, test_name, input_poly->coeffs));