Skip to content

Commit 633cb9c

Browse files
authored
Merge pull request google#213 from anthony-zy/add_rvv_optmized_memcopy64
feat(RISC-V): Add RVV-optimized implementation for memcopy64
2 parents 6f99459 + e92cb6a commit 633cb9c

File tree

4 files changed

+76
-7
lines changed

4 files changed

+76
-7
lines changed

CMakeLists.txt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,31 @@ int main() {
216216
return 0;
217217
}" SNAPPY_HAVE_NEON)
218218

219+
# Check for RVV 1.0, whose intrinsics require the __riscv_ prefix
220+
check_cxx_source_compiles("
221+
#include <riscv_vector.h>
222+
#include <stdint.h>
223+
#include <stddef.h>
224+
int main() {
225+
uint8_t val = 3, dup[8];
226+
size_t vl = __riscv_vsetvl_e8m1(8);
227+
vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl);
228+
return 0;
229+
}" SNAPPY_RVV_1)
230+
231+
232+
# Check for RVV 0.7.1, whose intrinsics do not use the __riscv_ prefix
233+
check_cxx_source_compiles("
234+
#include <riscv_vector.h>
235+
#include <stdint.h>
236+
#include <stddef.h>
237+
int main() {
238+
uint8_t val = 3, dup[8];
239+
size_t vl = vsetvl_e8m1(8);
240+
vuint8m1_t v = vmv_v_x_u8m1(val, vl);
241+
return 0;
242+
}" SNAPPY_RVV_0_7)
243+
219244
include(CheckSymbolExists)
220245
check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
221246
check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)

cmake/config.h.in

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@
5858
/* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
5959
#cmakedefine01 SNAPPY_HAVE_NEON
6060

61+
/* Define to 1 if you target processors with RVV 1.0 and have <riscv_vector.h>. */
62+
#cmakedefine01 SNAPPY_RVV_1
63+
64+
/* Define to 1 if you target processors with RVV 0.7 and have <riscv_vector.h>. */
65+
#cmakedefine01 SNAPPY_RVV_0_7
66+
6167
/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
6268
compression speed by using __crc32cw from <arm_acle.h>. */
6369
#cmakedefine01 SNAPPY_HAVE_NEON_CRC32

snappy-internal.h

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,24 @@
4646
#include <arm_neon.h>
4747
#endif
4848

49-
#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
49+
#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7
50+
#define SNAPPY_HAVE_RVV 1
51+
#include <riscv_vector.h>
52+
#else
53+
#define SNAPPY_HAVE_RVV 0
54+
#endif
55+
56+
#if SNAPPY_RVV_1
57+
#define VSETVL_E8M2 __riscv_vsetvl_e8m2
58+
#define VLE8_V_U8M2 __riscv_vle8_v_u8m2
59+
#define VSE8_V_U8M2 __riscv_vse8_v_u8m2
60+
#elif SNAPPY_RVV_0_7
61+
#define VSETVL_E8M2 vsetvl_e8m2
62+
#define VLE8_V_U8M2 vle8_v_u8m2
63+
#define VSE8_V_U8M2 vse8_v_u8m2
64+
#endif
65+
66+
#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
5067
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
5168
#else
5269
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -61,7 +78,7 @@ using V128 = __m128i;
6178
#elif SNAPPY_HAVE_NEON
6279
using V128 = uint8x16_t;
6380
#endif
64-
81+
6582
// Load 128 bits of integer data. `src` must be 16-byte aligned.
6683
inline V128 V128_Load(const V128* src);
6784

@@ -110,6 +127,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
110127
}
111128

112129
inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
130+
131+
113132
#endif
114133
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
115134

@@ -172,6 +191,7 @@ char* CompressFragment(const char* input,
172191
// loading from s2 + n.
173192
//
174193
// Separate implementation for 64-bit, little-endian cpus.
194+
// Little-endian RISC-V CPUs can also use this faster routine.
175195
#if !SNAPPY_IS_BIG_ENDIAN && \
176196
(defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
177197
defined(ARCH_ARM) || defined(__riscv))

snappy.cc

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ inline char* IncrementalCopySlow(const char* src, char* op,
281281
// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
282282
// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
283283
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
284+
285+
284286
template <size_t... indexes>
285287
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
286288
int index_offset, int pattern_size, index_sequence<indexes...>) {
@@ -298,7 +300,6 @@ MakePatternMaskBytesTable(int index_offset,
298300
MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
299301
make_index_sequence</*indexes=*/sizeof(V128)>())...};
300302
}
301-
302303
// This is an array of shuffle control masks that can be used as the source
303304
// operand for PSHUFB to permute the contents of the destination XMM register
304305
// into a repeating byte pattern.
@@ -329,7 +330,6 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
329330
return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
330331
generation_mask);
331332
}
332-
333333
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
334334
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
335335
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -345,7 +345,6 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
345345
pattern_reshuffle_masks[pattern_size - 1].data()));
346346
return {pattern, reshuffle_mask};
347347
}
348-
349348
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
350349

351350
// Fallback for when we need to copy while extending the pattern, for example
@@ -494,7 +493,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
494493
LoadPatternAndReshuffleMask(src, pattern_size);
495494
V128 pattern = pattern_and_reshuffle_mask.first;
496495
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
497-
498496
// There is at least one, and at most four 16-byte blocks. Writing four
499497
// conditionals instead of a loop allows FDO to layout the code with
500498
// respect to the actual probabilities of each length.
@@ -521,7 +519,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
521519
LoadPatternAndReshuffleMask(src, pattern_size);
522520
V128 pattern = pattern_and_reshuffle_mask.first;
523521
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
524-
525522
// This code path is relatively cold however so we save code size
526523
// by avoiding unrolling and vectorizing.
527524
//
@@ -1246,6 +1243,27 @@ void MemCopy64(char* dst, const void* src, size_t size) {
12461243
data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
12471244
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
12481245
}
1246+
// RVV acceleration available on RISC-V when compiled with -march=rv64gcv
1247+
#elif defined(__riscv) && SNAPPY_HAVE_RVV
1248+
// Cast pointers to the type we will operate on.
1249+
unsigned char* dst_ptr = reinterpret_cast<unsigned char*>(dst);
1250+
const unsigned char* src_ptr = reinterpret_cast<const unsigned char*>(src);
1251+
size_t remaining_bytes = size;
1252+
// Loop as long as there are bytes remaining to be copied.
1253+
while (remaining_bytes > 0) {
1254+
// Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
1255+
// Use e8m2 configuration to maximize throughput.
1256+
size_t vl = VSETVL_E8M2(remaining_bytes);
1257+
// Load data from the current source pointer.
1258+
vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
1259+
// Store data to the current destination pointer.
1260+
VSE8_V_U8M2(dst_ptr, vec, vl);
1261+
// Update pointers and the remaining count.
1262+
src_ptr += vl;
1263+
dst_ptr += vl;
1264+
remaining_bytes -= vl;
1265+
}
1266+
12491267
#else
12501268
std::memmove(dst, src, kShortMemCopy);
12511269
// Profiling shows that nearly all copies are short.

0 commit comments

Comments
 (0)