Skip to content

Commit ba31c8f

Browse files
authored
Merge pull request #2853 from Qiyu8/usimd-daxpy
Optimize the performance of daxpy by using universal intrinsics
2 parents e961d4d + 881c151 commit ba31c8f

File tree

6 files changed

+177
-19
lines changed

6 files changed

+177
-19
lines changed

getarch.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
492492
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
493493
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
494494
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
495-
"-DHAVE_AVX -DHAVE_FMA4"
495+
"-DHAVE_AVX"
496496
#define LIBNAME "bulldozer"
497497
#define CORENAME "BULLDOZER"
498498
#endif
@@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
508508
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
509509
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
510510
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
511-
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
511+
"-DHAVE_AVX -DHAVE_FMA3"
512512
#define LIBNAME "piledriver"
513513
#define CORENAME "PILEDRIVER"
514514
#endif
@@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
524524
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
525525
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
526526
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
527-
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
527+
"-DHAVE_AVX -DHAVE_FMA3"
528528
#define LIBNAME "steamroller"
529529
#define CORENAME "STEAMROLLER"
530530
#endif
@@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
540540
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
541541
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
542542
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
543-
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
543+
"-DHAVE_AVX -DHAVE_FMA3"
544544
#define LIBNAME "excavator"
545545
#define CORENAME "EXCAVATOR"
546546
#endif

kernel/simd/intrin.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#ifndef _INTRIN_H_
2+
#define _INTRIN_H_
3+
4+
#if defined(_MSC_VER)
5+
#define BLAS_INLINE __inline
6+
#elif defined(__GNUC__)
7+
#if defined(__STRICT_ANSI__)
8+
#define BLAS_INLINE __inline__
9+
#else
10+
#define BLAS_INLINE inline
11+
#endif
12+
#else
13+
#define BLAS_INLINE
14+
#endif
15+
16+
#ifdef _MSC_VER
17+
#define BLAS_FINLINE static __forceinline
18+
#elif defined(__GNUC__)
19+
#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
20+
#else
21+
#define BLAS_FINLINE static
22+
#endif
23+
24+
#ifdef __cplusplus
25+
extern "C" {
26+
#endif
27+
// include head
28+
/** SSE **/
29+
#ifdef HAVE_SSE
30+
#include <xmmintrin.h>
31+
#endif
32+
/** SSE2 **/
33+
#ifdef HAVE_SSE2
34+
#include <emmintrin.h>
35+
#endif
36+
/** SSE3 **/
37+
#ifdef HAVE_SSE3
38+
#include <pmmintrin.h>
39+
#endif
40+
/** SSSE3 **/
41+
#ifdef HAVE_SSSE3
42+
#include <tmmintrin.h>
43+
#endif
44+
/** SSE41 **/
45+
#ifdef HAVE_SSE4_1
46+
#include <smmintrin.h>
47+
#endif
48+
49+
/** AVX **/
50+
#ifdef HAVE_AVX
51+
#include <immintrin.h>
52+
#endif
53+
54+
// distribute
55+
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
56+
#include "intrin_avx512.h"
57+
#elif defined(HAVE_AVX2)
58+
#include "intrin_avx.h"
59+
#elif defined(HAVE_SSE2)
60+
#include "intrin_sse.h"
61+
#endif
62+
63+
#ifndef V_SIMD
64+
#define V_SIMD 0
65+
#define V_SIMD_F64 0
66+
#endif
67+
68+
#ifdef __cplusplus
69+
}
70+
#endif
71+
#endif // _INTRIN_H_

kernel/simd/intrin_avx.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#define V_SIMD 256
2+
#define V_SIMD_F64 1
3+
/*
4+
Data Type
5+
*/
6+
typedef __m256 v_f32;
7+
#define v_nlanes_f32 8
8+
/*
9+
arithmetic
10+
*/
11+
#define v_add_f32 _mm256_add_ps
12+
#define v_mul_f32 _mm256_mul_ps
13+
14+
#ifdef HAVE_FMA3
15+
// multiply and add, a*b + c
16+
#define v_muladd_f32 _mm256_fmadd_ps
17+
#else
18+
// multiply and add, a*b + c
19+
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
20+
{ return v_add_f32(v_mul_f32(a, b), c); }
21+
#endif // !HAVE_FMA3
22+
23+
/*
24+
memory
25+
*/
26+
// unaligned load
27+
#define v_loadu_f32 _mm256_loadu_ps
28+
#define v_storeu_f32 _mm256_storeu_ps
29+
#define v_setall_f32(VAL) _mm256_set1_ps(VAL)

kernel/simd/intrin_avx512.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#define V_SIMD 512
2+
#define V_SIMD_F64 1
3+
/*
4+
Data Type
5+
*/
6+
typedef __m512 v_f32;
7+
#define v_nlanes_f32 16
8+
/*
9+
arithmetic
10+
*/
11+
#define v_add_f32 _mm512_add_ps
12+
#define v_mul_f32 _mm512_mul_ps
13+
// multiply and add, a*b + c
14+
#define v_muladd_f32 _mm512_fmadd_ps
15+
/*
16+
memory
17+
*/
18+
// unaligned load
19+
#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
20+
#define v_storeu_f32 _mm512_storeu_ps
21+
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)

kernel/simd/intrin_sse.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#define V_SIMD 128
2+
#define V_SIMD_F64 1
3+
/*
4+
Data Type
5+
*/
6+
typedef __m128 v_f32;
7+
#define v_nlanes_f32 4
8+
/*
9+
arithmetic
10+
*/
11+
#define v_add_f32 _mm_add_ps
12+
#define v_mul_f32 _mm_mul_ps
13+
#ifdef HAVE_FMA3
14+
// multiply and add, a*b + c
15+
#define v_muladd_f32 _mm_fmadd_ps
16+
#elif defined(HAVE_FMA4)
17+
// multiply and add, a*b + c
18+
#define v_muladd_f32 _mm_macc_ps
19+
#else
20+
// multiply and add, a*b + c
21+
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
22+
{ return v_add_f32(v_mul_f32(a, b), c); }
23+
#endif // HAVE_FMA3
24+
/*
25+
memory
26+
*/
27+
// unaligned load
28+
#define v_loadu_f32 _mm_loadu_ps
29+
#define v_storeu_f32 _mm_storeu_ps
30+
#define v_setall_f32(VAL) _mm_set1_ps(VAL)

kernel/x86_64/daxpy.c

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4545
#include "daxpy_microk_sandy-2.c"
4646
#endif
4747

48-
4948
#ifndef HAVE_KERNEL_8
49+
#include"../simd/intrin.h"
5050

5151
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
5252
{
5353
BLASLONG register i = 0;
5454
FLOAT a = *alpha;
55-
55+
#if V_SIMD
56+
v_f32 __alpha, tmp;
57+
__alpha = v_setall_f32(*alpha);
58+
const int vstep = v_nlanes_f32;
59+
for (; i < n; i += vstep) {
60+
tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
61+
v_storeu_f32(y + i, tmp);
62+
}
63+
#else
5664
while(i < n)
57-
{
58-
y[i] += a * x[i];
59-
y[i+1] += a * x[i+1];
60-
y[i+2] += a * x[i+2];
61-
y[i+3] += a * x[i+3];
62-
y[i+4] += a * x[i+4];
63-
y[i+5] += a * x[i+5];
64-
y[i+6] += a * x[i+6];
65-
y[i+7] += a * x[i+7];
66-
i+=8 ;
67-
68-
}
69-
65+
{
66+
y[i] += a * x[i];
67+
y[i+1] += a * x[i+1];
68+
y[i+2] += a * x[i+2];
69+
y[i+3] += a * x[i+3];
70+
y[i+4] += a * x[i+4];
71+
y[i+5] += a * x[i+5];
72+
y[i+6] += a * x[i+6];
73+
y[i+7] += a * x[i+7];
74+
i+=8 ;
75+
}
76+
#endif
7077
}
7178

7279
#endif

0 commit comments

Comments
 (0)