Skip to content

Commit 55cace7

Browse files
w1m024ChenMiaoigong-flying
committed
Add support for RISC-V Vector Extension
- Add compile options and detection logic for RISC-V Vector Extension (RVV) - Implement RVV-optimized memory copy in the IncrementalCopy function - Add RISC-V toolchain configuration file Signed-off-by: w1m024 <iwangyiming@qq.com> Co-authored-by: chenmiaoi <chenmiao.ku@gmail.com> Co-authored-by: gong-flying <gongxiaofei24@iscas.ac.cn>
1 parent 537f4ad commit 55cace7

File tree

4 files changed

+89
-0
lines changed

4 files changed

+89
-0
lines changed

CMakeLists.txt

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ option(SNAPPY_REQUIRE_AVX "Target processors with AVX support." OFF)
4949

5050
option(SNAPPY_REQUIRE_AVX2 "Target processors with AVX2 support." OFF)
5151

52+
option(SNAPPY_REQUIRE_RVV "Target processors with RVV support." OFF)
53+
5254
option(SNAPPY_INSTALL "Install Snappy's header and library" ON)
5355

5456
include(TestBigEndian)
@@ -63,6 +65,8 @@ check_include_file("sys/time.h" HAVE_SYS_TIME_H)
6365
check_include_file("sys/uio.h" HAVE_SYS_UIO_H)
6466
check_include_file("unistd.h" HAVE_UNISTD_H)
6567
check_include_file("windows.h" HAVE_WINDOWS_H)
68+
check_include_file("sse2rvv.h" HAVE_SSE2RISCV_INSTRINSIC_H)
69+
check_include_file("riscv_vector.h" HAVE_RISCV_INSTRINSIC_H)
6670

6771
include(CheckLibraryExists)
6872
check_library_exists(z zlibVersion "" HAVE_LIBZ)
@@ -73,6 +77,7 @@ CHECK_CXX_COMPILER_FLAG("/arch:AVX" HAVE_VISUAL_STUDIO_ARCH_AVX)
7377
CHECK_CXX_COMPILER_FLAG("/arch:AVX2" HAVE_VISUAL_STUDIO_ARCH_AVX2)
7478
CHECK_CXX_COMPILER_FLAG("-mavx" HAVE_CLANG_MAVX)
7579
CHECK_CXX_COMPILER_FLAG("-mbmi2" HAVE_CLANG_MBMI2)
80+
CHECK_CXX_COMPILER_FLAG("-march=rv64gcv" HAVE_CLANG_RVV)
7681
if(SNAPPY_REQUIRE_AVX2)
7782
if(HAVE_VISUAL_STUDIO_ARCH_AVX2)
7883
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
@@ -90,6 +95,10 @@ elseif (SNAPPY_REQUIRE_AVX)
9095
if(HAVE_CLANG_MAVX)
9196
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
9297
endif(HAVE_CLANG_MAVX)
98+
elseif (SNAPPY_REQUIRE_RVV)
99+
if(HAVE_CLANG_RVV)
100+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv")
101+
endif(HAVE_CLANG_RVV)
93102
endif(SNAPPY_REQUIRE_AVX2)
94103

95104
include(CheckCXXSourceCompiles)
@@ -115,6 +124,42 @@ int main() {
115124
return 0;
116125
}" SNAPPY_HAVE_SSSE3)
117126

127+
check_cxx_source_compiles("
128+
#include <riscv_vector.h>
129+
130+
#define vreinterpretq_f64_m128i(x) \
131+
__riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(x))
132+
#define vreinterpretq_m128i_i64(x) __riscv_vreinterpret_v_i32m1_i64m1(x)
133+
#define vreinterpretq_i64_m128i(x) __riscv_vreinterpret_v_i64m1_i32m1(x)
134+
#define vreinterpretq_m128i_i8(x) __riscv_vreinterpret_v_i32m1_i8m1(x)
135+
#define vreinterpretq_i8_m128i(x) __riscv_vreinterpret_v_i8m1_i32m1(x)
136+
137+
int main() {
138+
const vint32m1_t *src = 0;
139+
vint32m1_t dest;
140+
const vint32m1_t shuffle_mask = vreinterpretq_f64_m128i(
141+
__riscv_vle64_v_f64m1((double const *)src, 2));
142+
143+
vint64m1_t addr = vreinterpretq_m128i_i64(*src);
144+
vint64m1_t zeros = __riscv_vmv_v_x_i64m1(0, 2);
145+
146+
vint32m1_t a = vreinterpretq_i64_m128i(
147+
__riscv_vslideup_vx_i64m1_tu(addr, zeros, 1, 2));
148+
149+
vint8m1_t _a = vreinterpretq_m128i_i8(a);
150+
vint8m1_t _b = vreinterpretq_m128i_i8(shuffle_mask);
151+
vbool8_t mask_lt_zero = __riscv_vmslt_vx_i8m1_b8(_b, 0, 16);
152+
vuint8m1_t idxs =
153+
__riscv_vreinterpret_v_i8m1_u8m1(__riscv_vand_vx_i8m1(_b, 0xf, 16));
154+
vint8m1_t shuffle = __riscv_vrgather_vv_i8m1(_a, idxs, 16);
155+
156+
const vint32m1_t pattern = vreinterpretq_i8_m128i(
157+
__riscv_vmerge_vxm_i8m1(shuffle, 0, mask_lt_zero, 16));
158+
159+
dest = pattern;
160+
return 0;
161+
}" SNAPPY_HAVE_RVV)
162+
118163
check_cxx_source_compiles("
119164
#include <immintrin.h>
120165
int main() {

cmake/config.h.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@
5252
/* Define to 1 if you target processors with SSSE3+ and have <tmmintrin.h>. */
5353
#cmakedefine01 SNAPPY_HAVE_SSSE3
5454

55+
/* Define to 1 if you target processors with RVV and have <riscv_vector.h>. */
56+
#cmakedefine01 SNAPPY_HAVE_RVV
57+
5558
/* Define to 1 if you target processors with BMI2+ and have <bmi2intrin.h>. */
5659
#cmakedefine01 SNAPPY_HAVE_BMI2
5760

cmake/toolchain-riscv.cmake

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=path/to/toolchain-riscv.cmake ..
2+
3+
set(CMAKE_SYSTEM_NAME Linux)
4+
set(CMAKE_SYSTEM_PROCESSOR riscv64)
5+
6+
# Specify the cross compiler
7+
set(CMAKE_C_COMPILER "riscv64-unknown-linux-gnu-gcc")
8+
set(CMAKE_CXX_COMPILER "riscv64-unknown-linux-gnu-g++")
9+
10+
set(CMAKE_C_FLAGS "-march=rv64gcv -mabi=lp64d" CACHE INTERNAL "C compiler flags")
11+
set(CMAKE_CXX_FLAGS "-march=rv64gcv -mabi=lp64d -static-libstdc++" CACHE INTERNAL "C++ compiler flags")
12+
13+
include_directories(/usr/lib/gcc-cross/riscv64-linux-gnu/14/include)

snappy.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@
6262
#include <tmmintrin.h>
6363
#endif
6464

65+
#if SNAPPY_HAVE_RVV
66+
#include <riscv_vector.h>
67+
#endif
68+
6569
#if SNAPPY_HAVE_BMI2
6670
// Please do not replace with <x86intrin.h>. or with headers that assume more
6771
// advanced SSE versions without checking with all the OWNERS.
@@ -252,6 +256,30 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
252256
if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
253257
}
254258
return IncrementalCopySlow(src, op, op_limit);
259+
#elif defined (SNAPPY_HAVE_RVV)
260+
size_t bytes_to_copy = op_limit - op;
261+
while (bytes_to_copy > 0) {
262+
263+
size_t vl = __riscv_vsetvl_e8m1(bytes_to_copy);
264+
265+
vuint8m1_t pattern_source = __riscv_vle8_v_u8m1(
266+
reinterpret_cast<const uint8_t*>(src), pattern_size);
267+
vuint8m1_t indices_sequential = __riscv_vid_v_u8m1(vl);
268+
vuint8m1_t indices_repeating = __riscv_vremu_vx_u8m1(
269+
indices_sequential, pattern_size, vl);
270+
271+
vuint8m1_t pattern_to_write = __riscv_vrgather_vv_u8m1(
272+
pattern_source, indices_repeating, vl);
273+
274+
__riscv_vse8_v_u8m1(reinterpret_cast<uint8_t*>(op), pattern_to_write, vl);
275+
276+
op += vl;
277+
bytes_to_copy -= vl;
278+
}
279+
280+
return op_limit;
281+
282+
255283
#else // !SNAPPY_HAVE_SSSE3
256284
// If plenty of buffer space remains, expand the pattern to at least 8
257285
// bytes. The way the following loop is written, we need 8 bytes of buffer

0 commit comments

Comments
 (0)