Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions dev/aarch64_clean/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,15 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
return (int)outlen;
}

static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)
{
mld_poly_decompose_32_asm(a1, a0, a);
mld_poly_decompose_32_asm(a1, a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)
{
mld_poly_decompose_88_asm(a1, a0, a);
mld_poly_decompose_88_asm(a1, a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

Expand Down
4 changes: 2 additions & 2 deletions dev/aarch64_clean/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf,
unsigned buflen, const uint8_t *table);

#define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm)
void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a);
void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0);

#define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm)
void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a);
void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0);

#define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm)
void mld_poly_caddq_asm(int32_t *a);
Expand Down
12 changes: 5 additions & 7 deletions dev/aarch64_clean/src/poly_decompose_32_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@

/* Parameters */
a1_ptr .req x0 // Output polynomial with coefficients c1
a0_ptr .req x1 // Output polynomial with coefficients c0
a_ptr .req x2 // Input polynomial
a0_ptr .req x1 // Input/Output polynomial with coefficients c0

count .req x3

Expand Down Expand Up @@ -76,10 +75,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
mov count, #(64/4)

poly_decompose_32_loop:
ldr q1, [a_ptr, #1*16]
ldr q2, [a_ptr, #2*16]
ldr q3, [a_ptr, #3*16]
ldr q0, [a_ptr], #4*16
ldr q0, [a0_ptr, #0*16]
ldr q1, [a0_ptr, #1*16]
ldr q2, [a0_ptr, #2*16]
ldr q3, [a0_ptr, #3*16]

decompose32 v5, v1, v24
decompose32 v6, v2, v24
Expand All @@ -102,7 +101,6 @@ poly_decompose_32_loop:

.unreq a1_ptr
.unreq a0_ptr
.unreq a_ptr
.unreq count
.unreq q
.unreq q_bound
Expand Down
12 changes: 5 additions & 7 deletions dev/aarch64_clean/src/poly_decompose_88_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@

/* Parameters */
a1_ptr .req x0 // Output polynomial with coefficients c1
a0_ptr .req x1 // Output polynomial with coefficients c0
a_ptr .req x2 // Input polynomial
a0_ptr .req x1 // Input/Output polynomial with coefficients c0

count .req x3

Expand Down Expand Up @@ -74,10 +73,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm)

mov count, #(64/4)
poly_decompose_88_loop:
ldr q1, [a_ptr, #1*16]
ldr q2, [a_ptr, #2*16]
ldr q3, [a_ptr, #3*16]
ldr q0, [a_ptr], #4*16
ldr q0, [a0_ptr, #0*16]
ldr q1, [a0_ptr, #1*16]
ldr q2, [a0_ptr, #2*16]
ldr q3, [a0_ptr, #3*16]

decompose88 v5, v1, v24
decompose88 v6, v2, v24
Expand All @@ -100,7 +99,6 @@ poly_decompose_88_loop:

.unreq a1_ptr
.unreq a0_ptr
.unreq a_ptr
.unreq count
.unreq q
.unreq q_bound
Expand Down
10 changes: 4 additions & 6 deletions dev/aarch64_opt/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,15 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
return (int)outlen;
}

static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)
{
mld_poly_decompose_32_asm(a1, a0, a);
mld_poly_decompose_32_asm(a1, a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)
{
mld_poly_decompose_88_asm(a1, a0, a);
mld_poly_decompose_88_asm(a1, a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

Expand Down
4 changes: 2 additions & 2 deletions dev/aarch64_opt/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf,
unsigned buflen, const uint8_t *table);

#define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm)
void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a);
void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0);

#define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm)
void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a);
void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0);

#define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm)
void mld_poly_caddq_asm(int32_t *a);
Expand Down
12 changes: 5 additions & 7 deletions dev/aarch64_opt/src/poly_decompose_32_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@

/* Parameters */
a1_ptr .req x0 // Output polynomial with coefficients c1
a0_ptr .req x1 // Output polynomial with coefficients c0
a_ptr .req x2 // Input polynomial
a0_ptr .req x1 // Input/Output polynomial with coefficients c0

count .req x3

Expand Down Expand Up @@ -76,10 +75,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
mov count, #(64/4)

poly_decompose_32_loop:
ldr q1, [a_ptr, #1*16]
ldr q2, [a_ptr, #2*16]
ldr q3, [a_ptr, #3*16]
ldr q0, [a_ptr], #4*16
ldr q0, [a0_ptr, #0*16]
ldr q1, [a0_ptr, #1*16]
ldr q2, [a0_ptr, #2*16]
ldr q3, [a0_ptr, #3*16]

decompose32 v5, v1, v24
decompose32 v6, v2, v24
Expand All @@ -102,7 +101,6 @@ poly_decompose_32_loop:

.unreq a1_ptr
.unreq a0_ptr
.unreq a_ptr
.unreq count
.unreq q
.unreq q_bound
Expand Down
12 changes: 5 additions & 7 deletions dev/aarch64_opt/src/poly_decompose_88_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@

/* Parameters */
a1_ptr .req x0 // Output polynomial with coefficients c1
a0_ptr .req x1 // Output polynomial with coefficients c0
a_ptr .req x2 // Input polynomial
a0_ptr .req x1 // Input/Output polynomial with coefficients c0

count .req x3

Expand Down Expand Up @@ -74,10 +73,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm)

mov count, #(64/4)
poly_decompose_88_loop:
ldr q1, [a_ptr, #1*16]
ldr q2, [a_ptr, #2*16]
ldr q3, [a_ptr, #3*16]
ldr q0, [a_ptr], #4*16
ldr q0, [a0_ptr, #0*16]
ldr q1, [a0_ptr, #1*16]
ldr q2, [a0_ptr, #2*16]
ldr q3, [a0_ptr, #3*16]

decompose88 v5, v1, v24
decompose88 v6, v2, v24
Expand All @@ -100,7 +99,6 @@ poly_decompose_88_loop:

.unreq a1_ptr
.unreq a0_ptr
.unreq a_ptr
.unreq count
.unreq q
.unreq q_bound
Expand Down
10 changes: 4 additions & 6 deletions dev/x86_64/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,25 +131,23 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
return (int)outlen;
}

static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)
{
if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
{
return MLD_NATIVE_FUNC_FALLBACK;
}
mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)
{
if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
{
return MLD_NATIVE_FUNC_FALLBACK;
}
mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

Expand Down
4 changes: 2 additions & 2 deletions dev/x86_64/src/arith_native_x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ unsigned mld_rej_uniform_eta4_avx2(
int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]);

#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2)
void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a);
void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0);

#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2)
void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a);
void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0);

#define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2)
void mld_poly_caddq_avx2(int32_t *r);
Expand Down
9 changes: 7 additions & 2 deletions dev/x86_64/src/poly_decompose_32_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,12 @@
#include "arith_native_x86_64.h"
#include "consts.h"

void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
/*
* Reference: The reference implementation has the input polynomial as a
* separate argument that may be aliased with either of the outputs.
* Removing the aliasing eases CBMC proofs.
*/
void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0)
{
unsigned int i;
__m256i f, f0, f1, t;
Expand All @@ -45,7 +50,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)

for (i = 0; i < MLDSA_N / 8; i++)
{
f = _mm256_load_si256(&a[i]);
f = _mm256_load_si256(&a0[i]);

/* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */
/*
Expand Down
10 changes: 8 additions & 2 deletions dev/x86_64/src/poly_decompose_88_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@
#include "arith_native_x86_64.h"
#include "consts.h"

void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
/*
* Reference: The reference implementation has the input polynomial as a
* separate argument that may be aliased with either of the outputs.
* Removing the aliasing eases CBMC proofs.
*/

void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0)
{
unsigned int i;
__m256i f, f0, f1, t;
Expand All @@ -45,7 +51,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)

for (i = 0; i < MLDSA_N / 8; i++)
{
f = _mm256_load_si256(&a[i]);
f = _mm256_load_si256(&a0[i]);

/* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */
/*
Expand Down
10 changes: 4 additions & 6 deletions mldsa/src/native/aarch64/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,15 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
return (int)outlen;
}

static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0)
{
mld_poly_decompose_32_asm(a1, a0, a);
mld_poly_decompose_32_asm(a1, a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
const int32_t *a)
static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0)
{
mld_poly_decompose_88_asm(a1, a0, a);
mld_poly_decompose_88_asm(a1, a0);
return MLD_NATIVE_FUNC_SUCCESS;
}

Expand Down
4 changes: 2 additions & 2 deletions mldsa/src/native/aarch64/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf,
unsigned buflen, const uint8_t *table);

#define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm)
void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a);
void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0);

#define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm)
void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a);
void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0);

#define mld_poly_caddq_asm MLD_NAMESPACE(poly_caddq_asm)
void mld_poly_caddq_asm(int32_t *a);
Expand Down
8 changes: 4 additions & 4 deletions mldsa/src/native/aarch64/src/poly_decompose_32_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
mov x3, #0x10 // =16

Lpoly_decompose_32_loop:
ldr q1, [x2, #0x10]
ldr q2, [x2, #0x20]
ldr q3, [x2, #0x30]
ldr q0, [x2], #0x40
ldr q0, [x1]
ldr q1, [x1, #0x10]
ldr q2, [x1, #0x20]
ldr q3, [x1, #0x30]
sqdmulh v5.4s, v1.4s, v23.4s
srshr v5.4s, v5.4s, #0x12
cmgt v24.4s, v1.4s, v21.4s
Expand Down
8 changes: 4 additions & 4 deletions mldsa/src/native/aarch64/src/poly_decompose_88_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_asm)
mov x3, #0x10 // =16

Lpoly_decompose_88_loop:
ldr q1, [x2, #0x10]
ldr q2, [x2, #0x20]
ldr q3, [x2, #0x30]
ldr q0, [x2], #0x40
ldr q0, [x1]
ldr q1, [x1, #0x10]
ldr q2, [x1, #0x20]
ldr q3, [x1, #0x30]
sqdmulh v5.4s, v1.4s, v23.4s
srshr v5.4s, v5.4s, #0x11
cmgt v24.4s, v1.4s, v21.4s
Expand Down
Loading