|
15 | 15 | #include <stdio.h> |
16 | 16 | #include <string.h> |
17 | 17 | #include "arch/intel.h" |
| 18 | +#include "arch/arm.h" |
| 19 | + |
| 20 | +#if defined(__aarch64__) && defined(__ARM_NEON) |
| 21 | + #include <arm_neon.h> |
| 22 | +#endif |
18 | 23 |
|
19 | 24 | #include "include/ceph_assert.h" |
20 | 25 |
|
@@ -101,6 +106,16 @@ region_xor(unsigned char** src, |
101 | 106 | // 64-byte region xor |
102 | 107 | region_sse2_xor((char**) src, (char*) parity, src_size, region_size); |
103 | 108 | } else |
| 109 | +#elif defined (__aarch64__) && defined(__ARM_NEON) |
| 110 | + if (ceph_arch_neon) { |
| 111 | + // ----------------------------- |
| 112 | + // use NEON region xor function |
| 113 | + // ----------------------------- |
| 114 | + unsigned region_size = |
| 115 | + (size / EC_ISA_VECTOR_NEON_WORDSIZE) * EC_ISA_VECTOR_NEON_WORDSIZE; |
| 116 | + size_left -= region_size; |
| 117 | + region_neon_xor((char**) src, (char *) parity, src_size, region_size); |
| 118 | + } else |
104 | 119 | #endif |
105 | 120 | { |
106 | 121 | // -------------------------------------------- |
@@ -181,3 +196,42 @@ region_sse2_xor(char** src, |
181 | 196 | #endif // __x86_64__ |
182 | 197 | return; |
183 | 198 | } |
| 199 | + |
#if defined(__aarch64__) && defined(__ARM_NEON)
// -----------------------------------------------------------------------------
// XOR <src_size> source regions of <size> bytes each and store the result in
// <parity>, using 128-bit NEON loads and stores.
//
//   src      : array of <src_size> pointers to the source regions
//   parity   : destination region receiving the XOR of all sources
//   src_size : number of source regions; must be >= 1 whenever size > 0
//   size     : number of bytes to process in every region; must be a
//              multiple of EC_ISA_VECTOR_NEON_WORDSIZE
//
// The source pointers are read straight from <src>, so the number of sources
// is not limited by any fixed-size local table.
// -----------------------------------------------------------------------------
void
// -----------------------------------------------------------------------------
region_neon_xor(char **src,
                char *parity,
                int src_size,
                unsigned size)
// -----------------------------------------------------------------------------
{
  ceph_assert(!(size % EC_ISA_VECTOR_NEON_WORDSIZE));
  // src[0] seeds the accumulators, so at least one source region is required
  // as soon as there is anything to process.
  ceph_assert(size == 0 || src_size > 0);

  uint8_t *out = (uint8_t *)parity;

  // ---------------------------------------------------------------------------
  // Every iteration handles two 16-byte vectors per source region.
  // NOTE(review): this assumes EC_ISA_VECTOR_NEON_WORDSIZE == 32 - confirm
  // against the arch header.
  //
  // XOR is a pure bitwise operation, so the byte-lane intrinsics produce the
  // same result as 64-bit lanes while avoiding a cast of the char buffers to
  // uint64_t pointers.
  // ---------------------------------------------------------------------------
  for (unsigned off = 0; off < size; off += EC_ISA_VECTOR_NEON_WORDSIZE) {
    const uint8_t *first = (const uint8_t *)src[0] + off;
    uint8x16_t acc_lo = vld1q_u8(first);
    uint8x16_t acc_hi = vld1q_u8(first + 16);

    for (int d = 1; d < src_size; d++) {
      const uint8_t *cur = (const uint8_t *)src[d] + off;

      acc_lo = veorq_u8(acc_lo, vld1q_u8(cur));
      acc_hi = veorq_u8(acc_hi, vld1q_u8(cur + 16));
    }

    vst1q_u8(out + off, acc_lo);
    vst1q_u8(out + off + 16, acc_hi);
  }
}
#endif // __aarch64__ && __ARM_NEON
0 commit comments