Skip to content

Commit a378a4c

Browse files
HouBingjian authored and
facebook-github-bot committed
arm64 crc prefetch optimise (facebook#5773)
Summary: Prefetch data for the following block to avoid cache misses during CRC calculation. I ran a performance test on a Kunpeng 920 server (arm-v8, [email protected]): ./db_bench --benchmarks=crc32c --block_size=500000000 Before optimisation: 587313.500 micros/op 1 ops/sec; 811.9 MB/s (500000000 per op). After optimisation: 289248.500 micros/op 3 ops/sec; 1648.5 MB/s (500000000 per op). Pull Request resolved: facebook#5773 Differential Revision: D17347339 fbshipit-source-id: bfcd74f0f0eb4b322b959be68019ddcaae1e3341
1 parent d35ffd5 commit a378a4c

File tree

2 files changed

+15
-1
lines changed

2 files changed

+15
-1
lines changed

util/crc32c_arm64.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
6464
*/
6565
uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
6666

67-
/* First 8 bytei for better pipelining */
67+
/* Prefetch data for following block to avoid cache miss */
68+
PREF1KL1((uint8_t *)buf64, 1024);
69+
70+
/* First 8 byte for better pipelining */
6871
crc0 = crc32c_u64(crc, *buf64++);
6972

7073
/* 3 blocks crc32c parallel computation

util/crc32c_arm64.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@
1717
#define crc32c_u16(crc, v) __crc32ch(crc, v)
1818
#define crc32c_u32(crc, v) __crc32cw(crc, v)
1919
#define crc32c_u64(crc, v) __crc32cd(crc, v)
20+
/* PREF4X64L1(buffer, PREF_OFFSET, ITR): emit four "PRFM PLDL1KEEP" prefetch
 * instructions for the addresses buffer + PREF_OFFSET + (ITR + k) * 64,
 * k = 0..3, i.e. four consecutive 64-byte strides.
 *   buffer      - base pointer, passed to the asm in a register ("r").
 *   PREF_OFFSET - byte offset added to every prefetch address.
 *   ITR         - index of the first 64-byte stride to prefetch.
 * The offset is passed with the "I" (immediate) constraint, so PREF_OFFSET
 * and ITR presumably must be compile-time constants that fit the
 * instruction's immediate range -- confirm against the GCC AArch64
 * constraint documentation.
 * NOTE(review): expands to four separate statements with a trailing ';' and
 * no do { } while (0) wrapper; use it only as a full statement inside braces,
 * never as the unbraced body of an if/else/loop.
 */
#define PREF4X64L1(buffer,PREF_OFFSET, ITR) \
21+
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
22+
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
23+
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
24+
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
25+
26+
/* PREF1KL1(buffer, PREF_OFFSET): prefetch a 1 KiB window starting at
 * buffer + PREF_OFFSET.
 * Expands to four PREF4X64L1 groups with ITR = 0, 4, 8 and 12, giving
 * 16 prefetches at 64-byte strides (16 * 64 = 1024 bytes).
 * NOTE(review): like PREF4X64L1, this expands to a bare sequence of
 * statements, so invoke it only as a statement inside a braced block.
 */
#define PREF1KL1(buffer,PREF_OFFSET) \
27+
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
28+
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
29+
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
30+
PREF4X64L1(buffer,(PREF_OFFSET), 12)
2031

2132
extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len);
2233
extern uint32_t crc32c_runtime_check(void);

0 commit comments

Comments
 (0)