@@ -956,7 +956,7 @@ do { \
 
 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO     (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplfr2vr_s((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;
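
All three SET1 changes in this patch are the same fix. The old LASX macro broadcast the argument with __lasx_xvreplgr2vr_w, an integer replicate: passing a float to it goes through an implicit float-to-int conversion, so the lanes end up holding the truncated integer's bit pattern rather than the float value. The LSX variants below were worse still, since __lsx_vinsgr2vr_w(__lsx_vldi(0), (x), 0) also writes only lane 0 and leaves lanes 1-3 zero. The replacement float replicates (__lasx_xvreplfr2vr_s / __lsx_vreplfr2vr_s) copy the float into every lane unchanged. A minimal scalar model of the difference (plain C, no LSX needed; old_set1/new_set1 are illustrative names, not ggml APIs):

    #include <stdio.h>
    #include <string.h>

    typedef struct { float f[4]; } v4;

    /* Mimics vinsgr2vr_w(vldi(0), x, 0): the float argument is value-
     * converted to an int on the way into the GPR insert, the raw int
     * bits land in lane 0, and lanes 1-3 stay zero. */
    static v4 old_set1(float x) {
        v4 v = {{0, 0, 0, 0}};
        int g = (int) x;                 /* 1.5f becomes the integer 1 */
        memcpy(&v.f[0], &g, sizeof g);   /* lane 0 reinterprets the int bits */
        return v;
    }

    /* Mimics vreplfr2vr_s(x): a true float broadcast. */
    static v4 new_set1(float x) {
        v4 v;
        for (int i = 0; i < 4; ++i) v.f[i] = x;
        return v;
    }

    int main(void) {
        v4 a = old_set1(1.5f), b = new_set1(1.5f);
        printf("old: %g %g %g %g\n", a.f[0], a.f[1], a.f[2], a.f[3]); /* ~1.4e-45 0 0 0 */
        printf("new: %g %g %g %g\n", b.f[0], b.f[1], b.f[2], b.f[3]); /* 1.5 1.5 1.5 1.5 */
        return 0;
    }
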
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 
 #define GGML_F32x4         __m128
 #define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), (x), 0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)                                         \
-{                                                                         \
-    int offset = GGML_F32_ARR >> 1;                                       \
-    for (int i = 0; i < offset; ++i) {                                    \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
-    }                                                                     \
-    offset >>= 1;                                                         \
-    for (int i = 0; i < offset; ++i) {                                    \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
-    }                                                                     \
-    offset >>= 1;                                                         \
-    for (int i = 0; i < offset; ++i) {                                    \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
-    }                                                                     \
-    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32);                      \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    const __m128 t0 = (__m128) __lsx_vshuf4i_w(tmp, 0x88);                \
-    tmp = __lsx_vsrli_d((__m128i) t0, 32);                                \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+#define GGML_F32x4_REDUCE(res, x)                                         \
+{                                                                         \
+    int offset = GGML_F32_ARR >> 1;                                       \
+    for (int i = 0; i < offset; ++i) {                                    \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
+    }                                                                     \
+    offset >>= 1;                                                         \
+    for (int i = 0; i < offset; ++i) {                                    \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
+    }                                                                     \
+    offset >>= 1;                                                         \
+    for (int i = 0; i < offset; ++i) {                                    \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
+    }                                                                     \
+    __m128i t0 = __lsx_vpickev_w((__m128i) x[0], (__m128i) x[0]);         \
+    __m128i t1 = __lsx_vpickod_w((__m128i) x[0], (__m128i) x[0]);         \
+    __m128  t2 = __lsx_vfadd_s((__m128) t0, (__m128) t1);                 \
+    __m128i t3 = __lsx_vpickev_w((__m128i) t2, (__m128i) t2);             \
+    __m128i t4 = __lsx_vpickod_w((__m128i) t2, (__m128i) t2);             \
+    __m128  t5 = __lsx_vfadd_s((__m128) t3, (__m128) t4);                 \
+    res = (ggml_float) ((v4f32) t5)[0];                                   \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
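
The REDUCE rewrite replaces the old shift/shuffle tail, which had a real bug at the very end: res = (ggml_float) __lsx_vpickve2gr_w(...) extracts the 32-bit lane into a general register as an integer, so the cast converts the sum's bit pattern numerically instead of reinterpreting it as a float. The new tail keeps everything in float lanes: two rounds of vpickev_w/vpickod_w (pick the even- and odd-indexed lanes) plus vfadd_s form a pairwise-sum tree, and lane 0 is read out directly as a float through the v4f32 vector type. A scalar model of the new 4-lane horizontal add (plain C; pickev/pickod/reduce4 are illustrative helpers, not LSX intrinsics):

    #include <stdio.h>

    typedef struct { float f[4]; } v4;

    /* Model of __lsx_vpickev_w / __lsx_vpickod_w: the even- (resp. odd-)
     * indexed lanes of vk fill the low half, those of vj the high half. */
    static v4 pickev(v4 vj, v4 vk) { v4 r = {{vk.f[0], vk.f[2], vj.f[0], vj.f[2]}}; return r; }
    static v4 pickod(v4 vj, v4 vk) { v4 r = {{vk.f[1], vk.f[3], vj.f[1], vj.f[3]}}; return r; }
    static v4 vadd(v4 a, v4 b) { v4 r; for (int i = 0; i < 4; ++i) r.f[i] = a.f[i] + b.f[i]; return r; }

    static float reduce4(v4 x) {
        /* round 1: {x0+x1, x2+x3, x0+x1, x2+x3} */
        v4 t2 = vadd(pickev(x, x), pickod(x, x));
        /* round 2: every lane holds x0+x1+x2+x3 */
        v4 t5 = vadd(pickev(t2, t2), pickod(t2, t2));
        return t5.f[0];
    }

    int main(void) {
        v4 x = {{1.0f, 2.0f, 3.0f, 4.0f}};
        printf("%g\n", reduce4(x));   /* prints 10 */
        return 0;
    }
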
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
 #define GGML_F32Cx4          __m128
 #define GGML_F32Cx4_ZERO     (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), (x), 0)
+#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32Cx4_LOAD(x)  (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y)  __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA      GGML_F32x4_FMA