@@ -281,6 +281,8 @@ inline char* IncrementalCopySlow(const char* src, char* op,
281281// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
282282// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
283283// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
284+
285+
284286template <size_t ... indexes>
285287inline constexpr std::array<char , sizeof ...(indexes)> MakePatternMaskBytes (
286288 int index_offset, int pattern_size, index_sequence<indexes...>) {
@@ -298,7 +300,6 @@ MakePatternMaskBytesTable(int index_offset,
298300 MakePatternMaskBytes (index_offset, pattern_sizes_minus_one + 1 ,
299301 make_index_sequence</* indexes=*/ sizeof (V128)>())...};
300302}
301-
302303// This is an array of shuffle control masks that can be used as the source
303304// operand for PSHUFB to permute the contents of the destination XMM register
304305// into a repeating byte pattern.
@@ -329,7 +330,6 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
329330 return V128_Shuffle (V128_LoadU (reinterpret_cast <const V128*>(src)),
330331 generation_mask);
331332}
332-
333333SNAPPY_ATTRIBUTE_ALWAYS_INLINE
334334static inline std::pair<V128 /* pattern */ , V128 /* reshuffle_mask */ >
335335LoadPatternAndReshuffleMask (const char * src, const size_t pattern_size) {
@@ -345,7 +345,6 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
345345 pattern_reshuffle_masks[pattern_size - 1 ].data ()));
346346 return {pattern, reshuffle_mask};
347347}
348-
349348#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
350349
351350// Fallback for when we need to copy while extending the pattern, for example
@@ -494,7 +493,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
494493 LoadPatternAndReshuffleMask (src, pattern_size);
495494 V128 pattern = pattern_and_reshuffle_mask.first ;
496495 V128 reshuffle_mask = pattern_and_reshuffle_mask.second ;
497-
498496 // There is at least one, and at most four 16-byte blocks. Writing four
499497 // conditionals instead of a loop allows FDO to layout the code with
500498 // respect to the actual probabilities of each length.
@@ -521,7 +519,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
521519 LoadPatternAndReshuffleMask (src, pattern_size);
522520 V128 pattern = pattern_and_reshuffle_mask.first ;
523521 V128 reshuffle_mask = pattern_and_reshuffle_mask.second ;
524-
525522 // This code path is relatively cold however so we save code size
526523 // by avoiding unrolling and vectorizing.
527524 //
@@ -1246,6 +1243,27 @@ void MemCopy64(char* dst, const void* src, size_t size) {
12461243 data = _mm256_lddqu_si256 (static_cast <const __m256i *>(src) + 1 );
12471244 _mm256_storeu_si256 (reinterpret_cast <__m256i *>(dst) + 1 , data);
12481245 }
1246+ // RVV acceleration is available on RISC-V when compiled with -march=rv64gcv
1247+ #elif defined(__riscv) && SNAPPY_HAVE_RVV
1248+ // Cast pointers to the type we will operate on.
1249+ unsigned char * dst_ptr = reinterpret_cast <unsigned char *>(dst);
1250+ const unsigned char * src_ptr = reinterpret_cast <const unsigned char *>(src);
1251+ size_t remaining_bytes = size;
1252+ // Loop as long as there are bytes remaining to be copied.
1253+ while (remaining_bytes > 0 ) {
1254+ // Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
1255+ // Use e8m2 configuration to maximize throughput.
1256+ size_t vl = VSETVL_E8M2 (remaining_bytes);
1257+ // Load data from the current source pointer.
1258+ vuint8m2_t vec = VLE8_V_U8M2 (src_ptr, vl);
1259+ // Store data to the current destination pointer.
1260+ VSE8_V_U8M2 (dst_ptr, vec, vl);
1261+ // Update pointers and the remaining count.
1262+ src_ptr += vl;
1263+ dst_ptr += vl;
1264+ remaining_bytes -= vl;
1265+ }
1266+
12491267#else
12501268 std::memmove (dst, src, kShortMemCopy );
12511269 // Profiling shows that nearly all copies are short.