Skip to content

Commit 66b2e34

Browse files
chenxuqiangSvelar
authored andcommitted
erasure-code/isa/xor_op: add neon-based region_xor implementation
The load instruction of NEON can load 128 bits. Generally, the CPU has two load channels. Therefore, the 32-byte Region_xor can be implemented. According to the test by ceph_erasure_code_benchmark, the performance is improved by more than 20% ~ 50% on average. loop = 10000 (k, m, size) | base(s) | neon(s) ------------------------------------------ (4, 1, 16384) | 0.018 | 0.015 ------------------------------------------ (4, 1, 65536) | 0.043 | 0.037 ------------------------------------------ (4, 1, 102400) | 0.058 | 0.049 ------------------------------------------ (8, 1, 32768) | 0.034 | 0.029 ------------------------------------------ (8, 1, 65536) | 0.052 | 0.045 ------------------------------------------ (8, 1, 102400) | 0.068 | 0.061 ------------------------------------------ (8, 1, 524288) | 0.631 | 0.420 ------------------------------------------ (8, 1, 1048576) | 1.561 | 0.931 ------------------------------------------ (8, 1, 8388608) | 16.70 | 8.244 ------------------------------------------ Signed-off-by: chenxuqiang <[email protected]>
1 parent 3301618 commit 66b2e34

File tree

2 files changed

+64
-1
lines changed

2 files changed

+64
-1
lines changed

src/erasure-code/isa/xor_op.cc

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
#include <stdio.h>
1616
#include <string.h>
1717
#include "arch/intel.h"
18+
#include "arch/arm.h"
19+
20+
#if defined(__aarch64__) && defined(__ARM_NEON)
21+
#include <arm_neon.h>
22+
#endif
1823

1924
#include "include/ceph_assert.h"
2025

@@ -101,6 +106,16 @@ region_xor(unsigned char** src,
101106
// 64-byte region xor
102107
region_sse2_xor((char**) src, (char*) parity, src_size, region_size);
103108
} else
109+
#elif defined (__aarch64__) && defined(__ARM_NEON)
110+
if (ceph_arch_neon) {
111+
// -----------------------------
112+
// use NEON region xor function
113+
// -----------------------------
114+
unsigned region_size =
115+
(size / EC_ISA_VECTOR_NEON_WORDSIZE) * EC_ISA_VECTOR_NEON_WORDSIZE;
116+
size_left -= region_size;
117+
region_neon_xor((char**) src, (char *) parity, src_size, region_size);
118+
} else
104119
#endif
105120
{
106121
// --------------------------------------------
@@ -181,3 +196,42 @@ region_sse2_xor(char** src,
181196
#endif // __x86_64__
182197
return;
183198
}
199+
200+
void
201+
// -----------------------------------------------------------------------------
202+
region_neon_xor(char **src,
203+
char *parity,
204+
int src_size,
205+
unsigned size)
206+
// -----------------------------------------------------------------------------
207+
{
208+
#if defined(__aarch64__) && defined(__ARM_NEON)
209+
ceph_assert(!(size % EC_ISA_VECTOR_NEON_WORDSIZE));
210+
unsigned char *p = (unsigned char *)parity;
211+
unsigned char *vbuf[256] = { NULL };
212+
for (int v = 0; v < src_size; v++) {
213+
vbuf[v] = (unsigned char *)src[v];
214+
}
215+
216+
// ----------------------------------------------------------------------------------------
217+
// NEON load instructions can load 128bits of data each time, and there are 2 load channels
218+
// ----------------------------------------------------------------------------------------
219+
for (unsigned i = 0; i < size; i += EC_ISA_VECTOR_NEON_WORDSIZE) {
220+
uint64x2_t d0_1 = vld1q_u64((uint64_t *)(&(vbuf[0][i])));
221+
uint64x2_t d0_2 = vld1q_u64((uint64_t *)(&(vbuf[0][i + 16])));
222+
223+
for (int d = 1; d < src_size; d++) {
224+
uint64x2_t di_1 = vld1q_u64((uint64_t *)(&(vbuf[d][i])));
225+
uint64x2_t di_2 = vld1q_u64((uint64_t *)(&(vbuf[d][i + 16])));
226+
227+
d0_1 = veorq_u64(d0_1, di_1);
228+
d0_2 = veorq_u64(d0_2, di_2);
229+
}
230+
231+
vst1q_u64((uint64_t *)p, d0_1);
232+
vst1q_u64((uint64_t *)(p + 16), d0_2);
233+
p += EC_ISA_VECTOR_NEON_WORDSIZE;
234+
}
235+
#endif // __aarch64__ && __ARM_NEON
236+
return;
237+
}

src/erasure-code/isa/xor_op.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
#define EC_ISA_ADDRESS_ALIGNMENT 32u
2929
#define EC_ISA_VECTOR_SSE2_WORDSIZE 64u
30-
30+
#define EC_ISA_VECTOR_NEON_WORDSIZE 32u
3131
#if __GNUC__ > 4 || \
3232
( (__GNUC__ == 4) && (__GNUC_MINOR__ >= 4) ) ||\
3333
(__clang__ == 1 )
@@ -83,5 +83,14 @@ region_sse2_xor(char** src /* array of 64-byte aligned source pointer to xor */,
8383
int src_size /* size of the source pointer array */,
8484
unsigned size /* size of the region to xor */);
8585

86+
// -------------------------------------------------------------------------
87+
// compute region XOR like parity = src[0] ^ src[1] ... ^ src[src_size-1]
88+
// using NEON 32-byte operations
89+
// -------------------------------------------------------------------------
90+
void
91+
region_neon_xor(char** src /* array of 64-byte aligned source pointer to xor */,
92+
char* parity /* 32-byte aligned output pointer containing the parity */,
93+
int src_size /* size of the source pointer array */,
94+
unsigned size /* size of the region to xor */);
8695

8796
#endif // EC_ISA_XOR_OP_H

0 commit comments

Comments
 (0)