Skip to content

Commit 1578bd0

Browse files
committed
Fix formatting
1 parent 4f60463 commit 1578bd0

File tree

7 files changed

+496
-340
lines changed

7 files changed

+496
-340
lines changed

mldsa/fips202/keccakf1600.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,9 @@ void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0,
9191
unsigned length)
9292
{
9393
#if defined(MLD_USE_FIPS202_X4_XOR_NATIVE)
94-
mld_keccakf1600_extract_bytes_x4_native(state, data0,
95-
data1, data2,
96-
data3, offset,
97-
length);
98-
#else /* MLD_USE_FIPS202_X4_XOR_NATIVE */
94+
mld_keccakf1600_extract_bytes_x4_native(state, data0, data1, data2, data3,
95+
offset, length);
96+
#else /* MLD_USE_FIPS202_X4_XOR_NATIVE */
9997
mld_keccakf1600_extract_bytes(state + MLD_KECCAK_LANES * 0, data0, offset,
10098
length);
10199
mld_keccakf1600_extract_bytes(state + MLD_KECCAK_LANES * 1, data1, offset,
@@ -114,12 +112,9 @@ void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0,
114112
unsigned length)
115113
{
116114
#if defined(MLD_USE_FIPS202_X4_XOR_NATIVE)
117-
mld_keccakf1600_xor_bytes_x4_native(state, data0,
118-
data1,
119-
data2,
120-
data3, offset,
115+
mld_keccakf1600_xor_bytes_x4_native(state, data0, data1, data2, data3, offset,
121116
length);
122-
#else /* MLD_USE_FIPS202_X4_XOR_NATIVE */
117+
#else /* MLD_USE_FIPS202_X4_XOR_NATIVE */
123118
mld_keccakf1600_xor_bytes(state + MLD_KECCAK_LANES * 0, data0, offset,
124119
length);
125120
mld_keccakf1600_xor_bytes(state + MLD_KECCAK_LANES * 1, data1, offset,

mldsa/fips202/native/armv8.1-m/src/keccakf1600_adomnicai_m4_opt_m7.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3998,4 +3998,4 @@ KeccakF1600_StatePermute_RoundLoop:
39983998
add sp, #mSize
39993999
pop { r4 - r12, pc }
40004000

4001-
.size KeccakF1600_StatePermute_adomnicai_m4_opt_m7, .-KeccakF1600_StatePermute_adomnicai_m4_opt_m7
4001+
.size KeccakF1600_StatePermute_adomnicai_m4_opt_m7, .-KeccakF1600_StatePermute_adomnicai_m4_opt_m7

mldsa/fips202/native/armv8.1-m/src/mve-keccak-4x.c

Lines changed: 150 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -10,138 +10,166 @@
1010

1111
extern uint32x4x2_t to_bit_interleaving_4x(uint32x4_t, uint32x4_t);
1212
extern uint32x4x2_t from_bit_interleaving_4x(uint32x4_t, uint32x4_t);
13-
uint32x4x2_t KeccakF1600x4_LoadBytesInLane(uint32x4_t data_ptrs, uint32_t length, uint32_t offset);
14-
uint32x4_t KeccakF1600x4_StateXORBytes_aligned(uint32_t nvecs, uint8_t* state, uint32x4_t data_ptrs);
13+
uint32x4x2_t KeccakF1600x4_LoadBytesInLane(uint32x4_t data_ptrs,
14+
uint32_t length, uint32_t offset);
15+
uint32x4_t KeccakF1600x4_StateXORBytes_aligned(uint32_t nvecs, uint8_t *state,
16+
uint32x4_t data_ptrs);
1517

16-
void KeccakF1600x4_StateXORBytes(void* state,
17-
const uint8_t *data0, const uint8_t *data1,
18-
const uint8_t *data2, const uint8_t *data3,
19-
uint32_t offset, uint32_t length )
18+
void KeccakF1600x4_StateXORBytes(void *state, const uint8_t *data0,
19+
const uint8_t *data1, const uint8_t *data2,
20+
const uint8_t *data3, uint32_t offset,
21+
uint32_t length)
2022
{
21-
uintptr_t offset_in_lane = offset & 7;
22-
uintptr_t lane_offset = offset & ~7;
23-
uint32x4_t data_ptrs;
24-
__asm__ volatile (
25-
"vmov %q[o][2], %q[o][0], %[i0], %[i2]\n"
26-
"vmov %q[o][3], %q[o][1], %[i1], %[i3]\n"
27-
: [o] "=&w" (data_ptrs)
28-
: [i0] "r" (data0), [i1] "r" (data1), [i2] "r" (data2), [i3] "r" (data3)
29-
:);
30-
if (offset_in_lane) {
31-
uint32x4x2_t l;
32-
size_t nBytes = length < 8-offset_in_lane ? length : 8-offset_in_lane;
23+
uintptr_t offset_in_lane = offset & 7;
24+
uintptr_t lane_offset = offset & ~7;
25+
uint32x4_t data_ptrs;
26+
__asm__ volatile(
27+
"vmov %q[o][2], %q[o][0], %[i0], %[i2]\n"
28+
"vmov %q[o][3], %q[o][1], %[i1], %[i3]\n"
29+
: [o] "=&w"(data_ptrs)
30+
: [i0] "r"(data0), [i1] "r"(data1), [i2] "r"(data2), [i3] "r"(data3)
31+
:);
32+
if (offset_in_lane)
33+
{
34+
uint32x4x2_t l;
35+
size_t nBytes = length < 8 - offset_in_lane ? length : 8 - offset_in_lane;
3336

34-
l = KeccakF1600x4_LoadBytesInLane(data_ptrs, nBytes, offset_in_lane);
37+
l = KeccakF1600x4_LoadBytesInLane(data_ptrs, nBytes, offset_in_lane);
3538

36-
// Now convert to bit interleaving
37-
uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]);
38-
uint32x4_t s0 = vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4));
39-
uint32x4_t s1 = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4));
40-
s0 = veorq_u32(s0, bint.val[0]);
41-
s1 = veorq_u32(s1, bint.val[1]);
42-
vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4), s0);
43-
vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4), s1);
44-
length -= nBytes;
45-
lane_offset += 8;
46-
data_ptrs = vaddq_n_u32(data_ptrs, nBytes);
47-
}
48-
if(length >= 8) {
49-
uint8_t *sp0 = (uint8_t *)((uintptr_t)state + lane_offset/2 * 4 - 16);
50-
uint32_t bytes_left_in_frame = 25*8 - lane_offset;
51-
uint32_t nlanes = (bytes_left_in_frame < length ? bytes_left_in_frame : length)/8;
52-
53-
data_ptrs = KeccakF1600x4_StateXORBytes_aligned(nlanes, sp0, data_ptrs);
54-
length -= nlanes * 8;
55-
lane_offset += nlanes * 8;
56-
}
57-
if (length) {
58-
uint32x4x2_t l;
59-
l = KeccakF1600x4_LoadBytesInLane(data_ptrs, length, 0);
39+
// Now convert to bit interleaving
40+
uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]);
41+
uint32x4_t s0 =
42+
vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4));
43+
uint32x4_t s1 =
44+
vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4));
45+
s0 = veorq_u32(s0, bint.val[0]);
46+
s1 = veorq_u32(s1, bint.val[1]);
47+
vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4), s0);
48+
vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4), s1);
49+
length -= nBytes;
50+
lane_offset += 8;
51+
data_ptrs = vaddq_n_u32(data_ptrs, nBytes);
52+
}
53+
if (length >= 8)
54+
{
55+
uint8_t *sp0 = (uint8_t *)((uintptr_t)state + lane_offset / 2 * 4 - 16);
56+
uint32_t bytes_left_in_frame = 25 * 8 - lane_offset;
57+
uint32_t nlanes =
58+
(bytes_left_in_frame < length ? bytes_left_in_frame : length) / 8;
6059

61-
uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]);
62-
uint32x4_t s0 = vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4));
63-
uint32x4_t s1 = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4));
64-
s0 = veorq_u32(s0, bint.val[0]);
65-
s1 = veorq_u32(s1, bint.val[1]);
66-
vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset/2 * 4), s0);
67-
vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset/2 * 4), s1);
68-
}
60+
data_ptrs = KeccakF1600x4_StateXORBytes_aligned(nlanes, sp0, data_ptrs);
61+
length -= nlanes * 8;
62+
lane_offset += nlanes * 8;
63+
}
64+
if (length)
65+
{
66+
uint32x4x2_t l;
67+
l = KeccakF1600x4_LoadBytesInLane(data_ptrs, length, 0);
68+
69+
uint32x4x2_t bint = to_bit_interleaving_4x(l.val[0], l.val[1]);
70+
uint32x4_t s0 =
71+
vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4));
72+
uint32x4_t s1 =
73+
vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4));
74+
s0 = veorq_u32(s0, bint.val[0]);
75+
s1 = veorq_u32(s1, bint.val[1]);
76+
vstrwq_u32((uint32_t *)((uintptr_t)state + lane_offset / 2 * 4), s0);
77+
vstrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset / 2 * 4), s1);
78+
}
6979
}
7080

71-
static inline
72-
uint32_t extract_bytes_in_lane(void *state, unsigned char *data0,
73-
unsigned char *data1, unsigned char *data2,
74-
unsigned char *data3, unsigned offset,
75-
unsigned length ) {
76-
// For load, need full-lane offset
77-
uint32_t lane_offset = offset & ~7;
78-
// Load the first vector.
79-
uint32x4_t evens = vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset*2));
80-
uint32x4_t odds = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset*2));
81-
// Deinterleave
82-
uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds);
83-
// Transpose the two vectors into four half-vectors
84-
uint32x4_t out[4];
85-
for (size_t i = 0; i < 4; i++)
86-
{
87-
uint32_t l = vgetq_lane_u32(dint.val[0], i);
88-
uint32_t h = vgetq_lane_u32(dint.val[1], i);
89-
out[i] = vcreateq_u32(l | ((uint64_t) h << 32), 0);
90-
}
91-
// Use predication to write the partial vector
92-
// Make the predicate
93-
uint32_t offset_in_lane = offset & 7;
94-
uint32_t nBytes = 8-offset_in_lane < length? 8-offset_in_lane : length;
95-
mve_pred16_t wr_pred = ((1 << nBytes)-1) << offset_in_lane;
96-
uint8x16_t ov = vidupq_n_u8(0, 1);
97-
vstrbq_scatter_offset_p_u8(data0 - offset_in_lane, ov, (uint8x16_t)out[0], wr_pred);
98-
vstrbq_scatter_offset_p_u8(data1 - offset_in_lane, ov, (uint8x16_t)out[1], wr_pred);
99-
vstrbq_scatter_offset_p_u8(data2 - offset_in_lane, ov, (uint8x16_t)out[2], wr_pred);
100-
vstrbq_scatter_offset_p_u8(data3 - offset_in_lane, ov, (uint8x16_t)out[3], wr_pred);
101-
return nBytes;
81+
// Extract the part of one 8-byte lane that starts at byte `offset` of the
// 4-way state and write it to the four output buffers.
//
// state   - base address of the state; two vectors are loaded from it, one at
//           byte offset (offset & ~7) * 2 and one 400 bytes after that.
// data0..data3 - destination for each of the four outputs; the first byte
//           written lands at dataN[0].
// offset  - byte offset into the (per-instance) state; only the bytes from
//           `offset` up to the end of its 8-byte lane are handled here.
// length  - maximum number of bytes to write to each output.
//
// Returns min(8 - (offset & 7), length), i.e. the number of bytes enabled in
// the store predicate for each output buffer.
static inline uint32_t extract_bytes_in_lane(void *state, unsigned char *data0,
82+
unsigned char *data1,
83+
unsigned char *data2,
84+
unsigned char *data3,
85+
unsigned offset, unsigned length)
86+
{
87+
// For load, need full-lane offset
88+
uint32_t lane_offset = offset & ~7;
89+
// Load the first vector.
90+
uint32x4_t evens =
91+
vldrwq_u32((uint32_t *)((uintptr_t)state + lane_offset * 2));
92+
// Second vector lives 400 bytes further into the state buffer.
uint32x4_t odds =
93+
vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + lane_offset * 2));
94+
// Deinterleave
// NOTE(review): from_bit_interleaving_4x is an extern helper; presumably
// val[0] holds the low 32 bits and val[1] the high 32 bits of each of the
// four 64-bit lanes -- confirm against its definition.
95+
uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds);
96+
// Transpose the two vectors into four half-vectors
// out[i] gets element i of val[0] as its low word and element i of val[1]
// as the next word; the upper 64 bits of out[i] are zero.
97+
uint32x4_t out[4];
98+
for (size_t i = 0; i < 4; i++)
99+
{
100+
uint32_t l = vgetq_lane_u32(dint.val[0], i);
101+
uint32_t h = vgetq_lane_u32(dint.val[1], i);
102+
out[i] = vcreateq_u32(l | ((uint64_t)h << 32), 0);
103+
}
104+
// Use predication to write the partial vector
105+
// Make the predicate
106+
uint32_t offset_in_lane = offset & 7;
107+
// Never go past the end of the current 8-byte lane, nor past `length`.
uint32_t nBytes = 8 - offset_in_lane < length ? 8 - offset_in_lane : length;
108+
// nBytes consecutive predicate bits, starting at bit offset_in_lane.
// nBytes is at most 8, so the shifts stay well inside the width of int.
mve_pred16_t wr_pred = ((1 << nBytes) - 1) << offset_in_lane;
109+
// NOTE(review): presumably ov = {0, 1, 2, ...} (start 0, step 1), used as
// per-byte store offsets -- confirm against the MVE intrinsics reference.
uint8x16_t ov = vidupq_n_u8(0, 1);
110+
// The base pointer is moved back by offset_in_lane so that the first enabled
// byte (index offset_in_lane of out[N]) is stored at dataN[0].
vstrbq_scatter_offset_p_u8(data0 - offset_in_lane, ov, (uint8x16_t)out[0],
111+
wr_pred);
112+
vstrbq_scatter_offset_p_u8(data1 - offset_in_lane, ov, (uint8x16_t)out[1],
113+
wr_pred);
114+
vstrbq_scatter_offset_p_u8(data2 - offset_in_lane, ov, (uint8x16_t)out[2],
115+
wr_pred);
116+
vstrbq_scatter_offset_p_u8(data3 - offset_in_lane, ov, (uint8x16_t)out[3],
117+
wr_pred);
118+
return nBytes;
102119
}
103120

104121
void KeccakF1600x4_StateExtract_bytes(void *state, unsigned char *data0,
105-
unsigned char *data1, unsigned char *data2,
106-
unsigned char *data3, unsigned offset,
107-
unsigned length)
122+
unsigned char *data1,
123+
unsigned char *data2,
124+
unsigned char *data3, unsigned offset,
125+
unsigned length)
108126
{
109-
// Make a data pointer vector
110-
uint32x4_t data_addrs = vcreateq_u32(
111-
(uint64_t)(uintptr_t)data0 | ((uint64_t)(uintptr_t)data1 << 32),
112-
(uint64_t)(uintptr_t)data2 | ((uint64_t)(uintptr_t)data3 << 32)
113-
);
114-
// Only load full 64-bit values from state
115-
if (offset & 7){
116-
117-
uint32_t nBytes = extract_bytes_in_lane(state, data0, data1, data2, data3, offset, length);
118-
data_addrs = vaddq_n_u32(data_addrs, nBytes);
119-
length -= nBytes;
120-
offset += nBytes;
121-
}
122-
// For each full vector
123-
if (length >=8 ) {
124-
data_addrs = vsubq_n_u32(data_addrs,4);
125-
for (; length >= 8; length -=8) {
126-
// Load the vector & increment read pointer
127-
uint32x4_t evens = vldrwq_u32((uint32_t *)((uintptr_t)state + offset * 2));
128-
uint32x4_t odds = vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + offset * 2));
129-
offset += 8;
130-
// Deinterleave
131-
uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds);
132-
// Write out & increment the write pointer
133-
__asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!" : [a]"+w"(data_addrs) : [d]"w"(dint.val[0]) : "memory");
134-
__asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!" : [a]"+w"(data_addrs) : [d]"w"(dint.val[1]) : "memory");
135-
}
136-
data_addrs = vaddq_n_u32(data_addrs, 4);
137-
}
138-
if (length) {
139-
data0 = (uint8_t *) vgetq_lane_u32(data_addrs, 0);
140-
data1 = (uint8_t *) vgetq_lane_u32(data_addrs, 1);
141-
data2 = (uint8_t *) vgetq_lane_u32(data_addrs, 2);
142-
data3 = (uint8_t *) vgetq_lane_u32(data_addrs, 3);
143-
// printf("Remaining length: %u; current offset: %u\r\n", length, offset);
144-
extract_bytes_in_lane(state, data0, data1, data2, data3, offset, length);
127+
// Make a data pointer vector
128+
uint32x4_t data_addrs = vcreateq_u32(
129+
(uint64_t)(uintptr_t)data0 | ((uint64_t)(uintptr_t)data1 << 32),
130+
(uint64_t)(uintptr_t)data2 | ((uint64_t)(uintptr_t)data3 << 32));
131+
// Only load full 64-bit values from state
132+
if (offset & 7)
133+
{
134+
uint32_t nBytes = extract_bytes_in_lane(state, data0, data1, data2, data3,
135+
offset, length);
136+
data_addrs = vaddq_n_u32(data_addrs, nBytes);
137+
length -= nBytes;
138+
offset += nBytes;
139+
}
140+
// For each full vector
141+
if (length >= 8)
142+
{
143+
data_addrs = vsubq_n_u32(data_addrs, 4);
144+
for (; length >= 8; length -= 8)
145+
{
146+
// Load the vector & increment read pointer
147+
uint32x4_t evens =
148+
vldrwq_u32((uint32_t *)((uintptr_t)state + offset * 2));
149+
uint32x4_t odds =
150+
vldrwq_u32((uint32_t *)((uintptr_t)state + 400 + offset * 2));
151+
offset += 8;
152+
// Deinterleave
153+
uint32x4x2_t dint = from_bit_interleaving_4x(evens, odds);
154+
// Write out & increment the write pointer
155+
__asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!"
156+
: [a] "+w"(data_addrs)
157+
: [d] "w"(dint.val[0])
158+
: "memory");
159+
__asm__ volatile("vstrw.u32 %q[d], [%q[a], #4]!"
160+
: [a] "+w"(data_addrs)
161+
: [d] "w"(dint.val[1])
162+
: "memory");
145163
}
164+
data_addrs = vaddq_n_u32(data_addrs, 4);
165+
}
166+
if (length)
167+
{
168+
data0 = (uint8_t *)vgetq_lane_u32(data_addrs, 0);
169+
data1 = (uint8_t *)vgetq_lane_u32(data_addrs, 1);
170+
data2 = (uint8_t *)vgetq_lane_u32(data_addrs, 2);
171+
data3 = (uint8_t *)vgetq_lane_u32(data_addrs, 3);
172+
// printf("Remaining length: %u; current offset: %u\r\n", length, offset);
173+
extract_bytes_in_lane(state, data0, data1, data2, data3, offset, length);
174+
}
146175
}
147-

mldsa/fips202/native/armv8.1-m/src/mve-keccak-4x_opt_m55.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1637,4 +1637,4 @@ roundend:
16371637
add sp, #8*16
16381638

16391639
vpop {d8-d15}
1640-
ldmia.w sp!, {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12, pc}
1640+
ldmia.w sp!, {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12, pc}

0 commit comments

Comments
 (0)