Commit 1cf123a

ggml-cpu : add basic RVV support for vector f32 ops (#15057)
* ggml-cpu : add basic RVV support for vector f32 ops
* ggml-cpu : add RVV support for f32 softmax
1 parent fcca218 commit 1cf123a

File tree

5 files changed: +168 −19 lines changed


ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -435,7 +435,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         )
         if (GGML_RVV)
             if (GGML_XTHEADVECTOR)
-                list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
+                list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
             elseif (GGML_RV_ZFH)
                 list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
             else()
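These -march strings decide which compile-time feature macros the rest of the commit keys on. A tiny probe like the one below (hypothetical, not part of the commit) can confirm what a given toolchain and flag combination actually enables:

```c
/* Hypothetical probe, not part of this commit: build it with the same
 * -march flags the CMake logic above selects to see which of the macros
 * used throughout this diff end up defined. */
#include <stdio.h>

int main(void) {
#if defined(__riscv_v_intrinsic)
    printf("__riscv_v_intrinsic: RVV intrinsics available\n");
#endif
#if defined(__riscv_zfhmin)
    printf("__riscv_zfhmin: scalar fp16<->fp32 conversion in hardware\n");
#endif
#if defined(__riscv_xtheadvector)
    printf("__riscv_xtheadvector: T-Head vector workarounds apply\n");
#endif
    return 0;
}
```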

ggml/src/ggml-cpu/ops.cpp

Lines changed: 5 additions & 2 deletions
@@ -9072,6 +9072,9 @@ static void ggml_compute_forward_ssm_scan_f32(
     }

     sumf = GGML_F32xt_REDUCE_ONE(sum);
+#elif defined(__riscv_v_intrinsic)
+    // todo: RVV implementation
+    const int np = 0;
 #else
     const int np = (nc & ~(GGML_F32_STEP - 1));

@@ -10023,8 +10026,8 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
     int64_t h_stride_2d = head_size * head_size;

 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        // scalar Route to scalar implementation //TODO: Write SVE code
+    #if defined(__ARM_FEATURE_SVE) || defined(__riscv_v_intrinsic)
+        // scalar Route to scalar implementation //TODO: Write SVE code and RVV code
         for (int64_t t = 0; t < T; t++) {
             int64_t t_offset = t * t_stride;
             int64_t state_offset = head_size * C * (t / (T / n_seqs));

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 39 additions & 14 deletions
@@ -18,6 +18,10 @@
 #include <immintrin.h>
 #endif

+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,24 +98,15 @@ extern "C" {
 }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
     static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
-        float f;
-        __asm__(
-            "fmv.h.x %[f], %[h]\n\t"
-            "fcvt.s.h %[f], %[f]"
-            : [f] "=&f" (f)
-            : [h] "r" (h)
-        );
-        return f;
+        _Float16 hf;
+        memcpy(&hf, &h, sizeof(ggml_fp16_t));
+        return hf;
     }

     static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
         ggml_fp16_t res;
-        __asm__(
-            "fcvt.h.s %[f], %[f]\n\t"
-            "fmv.x.h %[h], %[f]"
-            : [h] "=&r" (res)
-            : [f] "f" (f)
-        );
+        _Float16 hf = (_Float16)f;
+        memcpy(&res, &hf, sizeof(ggml_fp16_t));
         return res;
     }
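The hand-written inline assembly is replaced with plain _Float16 casts, letting the compiler emit fcvt.s.h/fcvt.h.s itself. A self-contained sketch of the same pattern (assuming a toolchain where _Float16 is available, e.g. targeting rv64gc_zfhmin):

```c
// Sketch of the memcpy-based fp16 bit cast used above; assumes the
// compiler supports _Float16 (e.g. gcc/clang targeting rv64gc_zfhmin).
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t ggml_fp16_t;  // same storage type ggml uses

static inline float fp16_to_fp32(ggml_fp16_t h) {
    _Float16 hf;
    memcpy(&hf, &h, sizeof(h));   // reinterpret the raw 16 bits as _Float16
    return (float)hf;             // hardware fcvt.s.h when zfhmin is enabled
}

static inline ggml_fp16_t fp32_to_fp16(float f) {
    _Float16 hf = (_Float16)f;    // hardware fcvt.h.s
    ggml_fp16_t res;
    memcpy(&res, &hf, sizeof(res));
    return res;
}

int main(void) {
    printf("%f\n", fp16_to_fp32(fp32_to_fp16(1.5f)));  // prints 1.500000
    return 0;
}
```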

@@ -1170,6 +1165,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
 #define GGML_F16_VEC_MUL    GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE

+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define GGML_SIMD
+
+// F32
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vfloat32m1_t
+#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
 #endif

 // GGML_F32_ARR / GGML_F16_ARR
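This mapping pins every macro to a fixed vl of GGML_F32_EPR (4 lanes at LMUL=1), which is why the comment requires vlen >= 128: four 32-bit elements must fit in a single vector register. A minimal sketch (hypothetical, modeled on how the generic loops in vec.h consume these macros) of a loop written against that contract:

```c
// Hypothetical illustration of the macro contract: with the RVV mapping,
// each GGML_F32_VEC op processes GGML_F32_EPR (= 4) floats at LMUL=1.
#include <riscv_vector.h>

#define GGML_F32_EPR 4
#define GGML_F32_VEC             vfloat32m1_t
#define GGML_F32_VEC_LOAD(x)     __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
#define GGML_F32_VEC_ADD(a, b)   __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32_VEC_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)

// y[i] += x[i], in the style of ggml's generic f32 loops
static void vec_add_f32(int n, float *y, const float *x) {
    int np = n & ~(GGML_F32_EPR - 1);          // round down to a full vector
    for (int i = 0; i < np; i += GGML_F32_EPR) {
        GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i);
        GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i);
        GGML_F32_VEC_STORE(y + i, GGML_F32_VEC_ADD(ax, ay));
    }
    for (int i = np; i < n; ++i) y[i] += x[i]; // scalar leftovers
}
```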

ggml/src/ggml-cpu/vec.cpp

Lines changed: 20 additions & 1 deletion
@@ -84,6 +84,16 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     }
     // reduce sum1,sum2 to sum1
     GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+#elif defined(__riscv_v_intrinsic)
+    vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
+        vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
+    }
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
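The loop above is the standard RVV strip-mining idiom: each pass asks __riscv_vsetvl_e32m8 how many elements it may process (at LMUL=8, grouping eight vector registers), so arbitrary n is handled without a scalar tail loop. A self-contained sketch of the same dot product, for experimentation outside ggml:

```c
// Strip-mined f32 dot product in the style of the hunk above
// (a sketch for experimentation, not the ggml function itself).
#include <riscv_vector.h>

float dot_f32_rvv(int n, const float *x, const float *y) {
    // running sum lives in element 0 of a single m1 register
    vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);          // lanes granted this pass
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
        // unordered reduction: vsum[0] += sum(prod[0..avl-1])
        vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
    }
    return __riscv_vfmv_f_s_f32m1_f32(vsum);
}
```

One design note: reducing inside the loop keeps the code simple but serializes each pass on the reduction; an alternative is to accumulate with vfmacc across passes and reduce once after the loop.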

@@ -197,7 +207,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G

     ggml_float sumf = 0.0;

-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -325,6 +335,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
         vst1q_f32(y + i, val);
         sum += (ggml_float)vaddvq_f32(val);
     }
+#elif defined(__riscv_v_intrinsic)
+    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+    for (int avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
+        __riscv_vse32_v_f32m2(&y[i], val, avl);
+        vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
+    }
+    return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
 #endif
     for (; i < n; ++i) {
         float val = expf(x[i] - max);
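Note that the reduction here widens to f64 (vfwredusum), matching the ggml_float accumulator type returned by the function. A minimal sketch of that widening-sum building block on its own:

```c
// Sketch: sum f32 elements into a single f64 accumulator, as the
// softmax hunk above does (ggml_float is double on this path).
#include <riscv_vector.h>

double sum_f32_widened(int n, const float *x) {
    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0.0, 1);
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m2(n - i);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(&x[i], avl);
        // each f32 lane is widened to f64 before being added to vsum[0]
        vsum = __riscv_vfwredusum_vs_f32m2_f64m1(v, vsum, avl);
    }
    return __riscv_vfmv_f_s_f64m1_f64(vsum);
}
```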

ggml/src/ggml-cpu/vec.h

Lines changed: 103 additions & 1 deletion
@@ -119,6 +119,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }

 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -149,6 +157,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+#endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -243,6 +252,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

         svst1_f32(pg, y + np2, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
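A subtlety in the operand order: __riscv_vfmadd_vf_f32m8(ax, v, ay, avl) computes ax*v + ay, which is exactly the axpy update y += x*v. A hedged, self-contained sketch with expected output:

```c
// RVV axpy (y += x*v) in the style of the ggml_vec_mad_f32 hunk above;
// a sketch for illustration, assuming an RVV 1.0 toolchain.
#include <riscv_vector.h>
#include <stdio.h>

static void axpy_rvv(int n, float *y, const float *x, float v) {
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        // vfmadd.vf: ax*v + ay
        __riscv_vse32_v_f32m8(&y[i], __riscv_vfmadd_vf_f32m8(ax, v, ay, avl), avl);
    }
}

int main(void) {
    float x[5] = {1, 2, 3, 4, 5}, y[5] = {10, 10, 10, 10, 10};
    axpy_rvv(5, y, x, 0.5f);
    for (int i = 0; i < 5; i++) printf("%g ", y[i]);  // 10.5 11 11.5 12 12.5
    printf("\n");
    return 0;
}
```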

@@ -276,6 +293,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -297,6 +321,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -324,6 +349,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
             y[i] += x[k][i]*v[k][0];
         }
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+        }
+        __riscv_vse32_v_f32m8(&y[i], ay, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -375,6 +410,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < n; ++i) {
         y[i] = x[i]*s + b;
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -436,6 +479,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
         ay1 = svmul_f32_m(pg, ay1, vx);
         svst1_f32(pg, y + np, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -467,6 +517,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {

 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -486,6 +543,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -928,7 +986,51 @@ inline static __m128 ggml_v_silu(__m128 x) {
     return _mm_div_ps(x, one_plus_exp_neg_x);
 }

-#endif // __ARM_NEON / __AVX2__ / __SSE2__
+#elif defined(__riscv_v_intrinsic)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
+#ifdef __riscv_xtheadvector
+    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
+    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
+    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
+#else
+    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
+#endif
+    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
+    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
+                                                    0x1.7f7d1cp-20f, n, vl);
+    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
+    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
+    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
+    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
+    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
+        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
+        __riscv_vfmacc_vv_f32m2(
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
+            u, vl), u, vl);
+    if (!__riscv_vcpop_m_b16(c, vl))
+        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
+    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
+    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
+    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
+    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
+    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
+        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
+        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
+        c, vl);
+    return __riscv_vmerge_vvm_f32m2(
+        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
+        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
+        vl);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
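For readers new to this family of exp kernels (the NEON/AVX variants above use the same constants): the routine splits x = n·ln2 + b, forms 2^n by writing n directly into the float exponent field, and evaluates a short polynomial for e^b; the masked paths at the end only handle |n| large enough to break the exponent trick. A scalar sketch of the happy path, using the same constants but omitting the overflow/underflow handling:

```c
// Scalar sketch of the decomposition used by ggml_v_expf_m2 above;
// illustration only: no handling of |x| large enough to over/underflow.
#include <stdint.h>
#include <string.h>

static float expf_sketch(float x) {
    const float r = 0x1.8p23f;               // shifter: adding it rounds to an integer
    float z = r + x * 0x1.715476p+0f;        // z = r + x/ln2
    float n = z - r;                         // n = round(x/ln2)
    // b = x - n*ln2, with ln2 split hi/lo for accuracy
    float b = (x - n * 0x1.62e4p-1f) - n * 0x1.7f7d1cp-20f;
    uint32_t zbits; memcpy(&zbits, &z, sizeof zbits);
    uint32_t kbits = (zbits << 23) + 0x3f800000u;  // k = 2^n via the exponent field
    float k; memcpy(&k, &kbits, sizeof k);
    float u = b * b;                         // polynomial j ~= e^b - 1
    float j = b * 0x1.ffffecp-1f +
              ((0x1.fffdb6p-2f + 0x1.555e66p-3f * b) +
               (0x1.573e2ep-5f + 0x1.0e4020p-7f * b) * u) * u;
    return k + k * j;                        // e^x = k*(1 + j)
}
```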
