Commit fcfce04

ggml : LoongArch fixes (#16958)
* Fix test-quantize-fns f16 and q4_0 failures when using LSX
* Fix the LoongArch float-splat (SET1) intrinsics when using LSX/LASX
1 parent ee3a5a1 commit fcfce04

File tree

3 files changed (+32, -31 lines)

ggml/src/ggml-cpu/arch/loongarch/quants.c

Lines changed: 4 additions & 5 deletions

@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     for (; ib + 1 < nb; ib += 2) {

         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+        const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};

         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -714,11 +715,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         bx_1 = __lsx_vsub_b(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+        const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+        const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};

         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
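Why the old splat corrupted the q4_0 path: __lsx_vreplgr2vr_w replicates a 32-bit *integer* from a general-purpose register, so the FP16-derived block scale was implicitly converted float-to-int (a scale below 1.0 truncates to 0) before being broadcast, and the (__m128) cast afterwards only reinterprets those integer bits. The replacement builds the vector from a v4f32 compound literal, so the value stays a float throughout. A minimal sketch of the failure mode, using GNU C vector extensions as stand-ins for the LSX intrinsics (compiles with gcc/clang on any target):

#include <stdio.h>

typedef int   v4i32 __attribute__((vector_size(16)));
typedef float v4f32 __attribute__((vector_size(16)));

/* Stand-in for __lsx_vreplgr2vr_w: replicates one 32-bit integer lane. */
static v4i32 repl_int_w(int v) { return (v4i32){v, v, v, v}; }

int main(void) {
    const float scale = 0.25f;                 /* a typical combined block scale */
    v4i32 broken = repl_int_w(scale);          /* implicit float->int: 0.25f -> 0 */
    v4f32 fixed  = (v4f32){scale, scale, scale, scale};  /* the commit's approach */
    printf("broken lane 0: %d\n", broken[0]);            /* 0 -- scale destroyed */
    printf("fixed  lane 0: %f\n", (double)fixed[0]);     /* 0.250000 */
    return 0;
}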

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 3 additions & 1 deletion

@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

 #endif

-#if defined(__loongarch_asx)
+#if defined(__loongarch_sx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {
     v4f32 res = {val, val, val, val};
     return (__m128)res;
 }
+#endif

+#if defined(__loongarch_asx)
 static __m256 __lasx_xvreplfr2vr_s(const float val) {
     v8f32 res = {val, val, val, val, val, val, val, val};
     return (__m256)res;
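The point of this hunk: the 128-bit helper __lsx_vreplfr2vr_s was previously visible only under the LASX guard, so an LSX-only build (where __loongarch_sx is defined but __loongarch_asx is not) could not reach it even though it needs nothing beyond LSX types. Condensed from the diff, the resulting guard layout looks like this (the sketch assumes the LoongArch headers that define __m128/v4f32 and __m256/v8f32):

#if defined(__loongarch_sx)   /* LSX: 128-bit vectors */
static __m128 __lsx_vreplfr2vr_s(const float val) {
    v4f32 res = {val, val, val, val};   /* broadcast one float to all 4 lanes */
    return (__m128)res;
}
#endif

#if defined(__loongarch_asx)  /* LASX: 256-bit vectors */
static __m256 __lasx_xvreplfr2vr_s(const float val) {
    v8f32 res = {val, val, val, val, val, val, val, val};  /* 8-lane broadcast */
    return (__m256)res;
}
#endif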

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 25 additions & 25 deletions
@@ -956,7 +956,7 @@ do { \

 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO     (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplfr2vr_s((x))

 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;

@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {

 #define GGML_F32x4         __m128
 #define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x) \
-{ \
-    int offset = GGML_F32_ARR >> 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128  t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+    __m128  t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+    res = (ggml_float) ((v4f32)t5)[0]; \
 }

 #define GGML_F32_VEC        GGML_F32x4

@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {

 #define GGML_F32Cx4          __m128
 #define GGML_F32Cx4_ZERO     (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32Cx4_LOAD(x)  (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA      GGML_F32x4_FMA
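Two distinct fixes in this file. First, the SET1 macros went through integer intrinsics: __lasx_xvreplgr2vr_w and __lsx_vinsgr2vr_w both take an int, so a float argument was truncated on the way in, and __lsx_vinsgr2vr_w(__lsx_vldi(0), (x), 0) additionally fills only lane 0 rather than all lanes, which is not SET1 semantics; both now route through the float-splat helpers added in ggml-cpu-impl.h. Second, the tail of GGML_F32x4_REDUCE changes how the result leaves the vector: the old (ggml_float) __lsx_vpickve2gr_w(...) extracted the lane as an integer and converted that integer numerically, while ((v4f32)t5)[0] reads the lane as a float. The new horizontal sum pairs even and odd lanes twice with vpickev_w/vpickod_w; a portable sketch of the same even/odd pattern (GNU C vector extensions, clang or gcc >= 12; names here are illustrative, not from the ggml sources):

#include <stdio.h>

typedef float v4f32 __attribute__((vector_size(16)));

/* Horizontal sum of 4 float lanes via two even/odd pair-add rounds,
 * mirroring the vpickev_w/vpickod_w + vfadd_s sequence in the new macro. */
static float hsum_f32x4(v4f32 x) {
    v4f32 ev = __builtin_shufflevector(x, x, 0, 2, 0, 2);  /* even lanes */
    v4f32 od = __builtin_shufflevector(x, x, 1, 3, 1, 3);  /* odd lanes  */
    v4f32 s  = ev + od;       /* lanes: {x0+x1, x2+x3, x0+x1, x2+x3} */
    ev = __builtin_shufflevector(s, s, 0, 2, 0, 2);
    od = __builtin_shufflevector(s, s, 1, 3, 1, 3);
    s  = ev + od;             /* every lane now holds x0+x1+x2+x3 */
    return s[0];              /* read lane 0 as a float, not via an int extract */
}

int main(void) {
    v4f32 x = {1.0f, 2.0f, 3.0f, 4.0f};
    printf("%f\n", hsum_f32x4(x));  /* prints 10.000000 */
    return 0;
}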
