Skip to content

Commit 633cb9c

Browse files
authored
Merge pull request google#213 from anthony-zy/add_rvv_optmized_memcopy64
feat(RISC-V): Add RVV-optimized implementation for memcopy64
2 parents 6f99459 + e92cb6a commit 633cb9c

File tree

4 files changed

+76
-7
lines changed

4 files changed

+76
-7
lines changed

CMakeLists.txt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,31 @@ int main() {
216216
return 0;
217217
}" SNAPPY_HAVE_NEON)
218218

219+
# Check for RVV 1.0, whose intrinsics require the __riscv_ prefix
220+
check_cxx_source_compiles("
221+
#include <riscv_vector.h>
222+
#include <stdint.h>
223+
#include <stddef.h>
224+
int main() {
225+
uint8_t val = 3, dup[8];
226+
size_t vl = __riscv_vsetvl_e8m1(8);
227+
vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl);
228+
return 0;
229+
}" SNAPPY_RVV_1)
230+
231+
232+
# Check for RVV 0.7.1, whose intrinsics do not use the __riscv_ prefix
233+
check_cxx_source_compiles("
234+
#include <riscv_vector.h>
235+
#include <stdint.h>
236+
#include <stddef.h>
237+
int main() {
238+
uint8_t val = 3, dup[8];
239+
size_t vl = vsetvl_e8m1(8);
240+
vuint8m1_t v = vmv_v_x_u8m1(val, vl);
241+
return 0;
242+
}" SNAPPY_RVV_0_7)
243+
219244
include(CheckSymbolExists)
220245
check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
221246
check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)

cmake/config.h.in

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@
5858
/* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
5959
#cmakedefine01 SNAPPY_HAVE_NEON
6060

61+
/* Define to 1 if you target processors with RVV 1.0 and have <riscv_vector.h>. */
62+
#cmakedefine01 SNAPPY_RVV_1
63+
64+
/* Define to 1 if you target processors with RVV 0.7 and have <riscv_vector.h>. */
65+
#cmakedefine01 SNAPPY_RVV_0_7
66+
6167
/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
6268
compression speed by using __crc32cw from <arm_acle.h>. */
6369
#cmakedefine01 SNAPPY_HAVE_NEON_CRC32

snappy-internal.h

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,24 @@
4646
#include <arm_neon.h>
4747
#endif
4848

49-
#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
49+
#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7
50+
#define SNAPPY_HAVE_RVV 1
51+
#include <riscv_vector.h>
52+
#else
53+
#define SNAPPY_HAVE_RVV 0
54+
#endif
55+
56+
#if SNAPPY_RVV_1
57+
#define VSETVL_E8M2 __riscv_vsetvl_e8m2
58+
#define VLE8_V_U8M2 __riscv_vle8_v_u8m2
59+
#define VSE8_V_U8M2 __riscv_vse8_v_u8m2
60+
#elif SNAPPY_RVV_0_7
61+
#define VSETVL_E8M2 vsetvl_e8m2
62+
#define VLE8_V_U8M2 vle8_v_u8m2
63+
#define VSE8_V_U8M2 vse8_v_u8m2
64+
#endif
65+
66+
#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
5067
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
5168
#else
5269
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -61,7 +78,7 @@ using V128 = __m128i;
6178
#elif SNAPPY_HAVE_NEON
6279
using V128 = uint8x16_t;
6380
#endif
64-
81+
6582
// Load 128 bits of integer data. `src` must be 16-byte aligned.
6683
inline V128 V128_Load(const V128* src);
6784

@@ -110,6 +127,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
110127
}
111128

112129
inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
130+
131+
113132
#endif
114133
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
115134

@@ -172,6 +191,7 @@ char* CompressFragment(const char* input,
172191
// loading from s2 + n.
173192
//
174193
// Separate implementation for 64-bit, little-endian cpus.
194+
// Little-endian RISC-V CPUs can also use this faster routine.
175195
#if !SNAPPY_IS_BIG_ENDIAN && \
176196
(defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
177197
defined(ARCH_ARM) || defined(__riscv))

snappy.cc

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ inline char* IncrementalCopySlow(const char* src, char* op,
281281
// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
282282
// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
283283
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
284+
285+
284286
template <size_t... indexes>
285287
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
286288
int index_offset, int pattern_size, index_sequence<indexes...>) {
@@ -298,7 +300,6 @@ MakePatternMaskBytesTable(int index_offset,
298300
MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
299301
make_index_sequence</*indexes=*/sizeof(V128)>())...};
300302
}
301-
302303
// This is an array of shuffle control masks that can be used as the source
303304
// operand for PSHUFB to permute the contents of the destination XMM register
304305
// into a repeating byte pattern.
@@ -329,7 +330,6 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
329330
return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
330331
generation_mask);
331332
}
332-
333333
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
334334
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
335335
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -345,7 +345,6 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
345345
pattern_reshuffle_masks[pattern_size - 1].data()));
346346
return {pattern, reshuffle_mask};
347347
}
348-
349348
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
350349

351350
// Fallback for when we need to copy while extending the pattern, for example
@@ -494,7 +493,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
494493
LoadPatternAndReshuffleMask(src, pattern_size);
495494
V128 pattern = pattern_and_reshuffle_mask.first;
496495
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
497-
498496
// There is at least one, and at most four 16-byte blocks. Writing four
499497
// conditionals instead of a loop allows FDO to layout the code with
500498
// respect to the actual probabilities of each length.
@@ -521,7 +519,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
521519
LoadPatternAndReshuffleMask(src, pattern_size);
522520
V128 pattern = pattern_and_reshuffle_mask.first;
523521
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
524-
525522
// This code path is relatively cold however so we save code size
526523
// by avoiding unrolling and vectorizing.
527524
//
@@ -1246,6 +1243,27 @@ void MemCopy64(char* dst, const void* src, size_t size) {
12461243
data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
12471244
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
12481245
}
1246+
// RVV acceleration available on RISC-V when compiled with -march=rv64gcv
1247+
#elif defined(__riscv) && SNAPPY_HAVE_RVV
1248+
// Cast pointers to the type we will operate on.
1249+
unsigned char* dst_ptr = reinterpret_cast<unsigned char*>(dst);
1250+
const unsigned char* src_ptr = reinterpret_cast<const unsigned char*>(src);
1251+
size_t remaining_bytes = size;
1252+
// Loop as long as there are bytes remaining to be copied.
1253+
while (remaining_bytes > 0) {
1254+
// Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
1255+
// Use e8m2 configuration to maximize throughput.
1256+
size_t vl = VSETVL_E8M2(remaining_bytes);
1257+
// Load data from the current source pointer.
1258+
vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
1259+
// Store data to the current destination pointer.
1260+
VSE8_V_U8M2(dst_ptr, vec, vl);
1261+
// Update pointers and the remaining count.
1262+
src_ptr += vl;
1263+
dst_ptr += vl;
1264+
remaining_bytes -= vl;
1265+
}
1266+
12491267
#else
12501268
std::memmove(dst, src, kShortMemCopy);
12511269
// Profiling shows that nearly all copies are short.

0 commit comments

Comments
 (0)