Skip to content

Commit 325b539

Browse files
committed
Optimize the performance of daxpy by using universal intrinsics
1 parent 0f11207 commit 325b539

File tree

5 files changed

+131
-16
lines changed

5 files changed

+131
-16
lines changed

kernel/simd/intrin.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
/*
 * intrin.h - entry point of the "universal intrinsics" SIMD layer.
 *
 * Includes the compiler intrinsic headers that match the HAVE_* feature
 * macros, then selects exactly one wrapper header (AVX-512, AVX2 or SSE2)
 * that maps the generic v_* names onto that instruction set. If no wrapper
 * is selected, V_SIMD and V_SIMD_F64 are defined as 0 so that callers can
 * write `#if V_SIMD` and fall back to scalar code.
 */
#ifndef _INTRIN_H_
#define _INTRIN_H_

// include head
// System headers are included before the extern "C" block so that they are
// never wrapped in a foreign linkage specification.
/** SSE **/
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
/** SSE2 **/
#ifdef HAVE_SSE2
#include <emmintrin.h>
#endif
/** SSE3 **/
#ifdef HAVE_SSE3
#include <pmmintrin.h>
#endif
/** SSSE3 **/
#ifdef HAVE_SSSE3
#include <tmmintrin.h>
#endif
/** SSE41 **/
#ifdef HAVE_SSE4_1
#include <smmintrin.h>
#endif

/** AVX / AVX2 / AVX-512 **/
// The AVX2 and AVX-512 wrappers selected below use __m256/__m512 types and
// _mm256_*/_mm512_* intrinsics, so <immintrin.h> must be present whenever one
// of them can be chosen, not only when HAVE_AVX itself is defined.
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || \
    defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
#include <immintrin.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

// distribute
// Widest available instruction set wins; at most one wrapper is included.
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
#include "intrin_avx512.h"
#elif defined(HAVE_AVX2)
#include "intrin_avx.h"
#elif defined(HAVE_SSE2)
#include "intrin_sse.h"
#endif

// No wrapper selected: advertise "no SIMD" for both float and double.
#ifndef V_SIMD
#define V_SIMD 0
#define V_SIMD_F64 0
#endif

#ifdef __cplusplus
}
#endif
#endif // _INTRIN_H_

kernel/simd/intrin_avx.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
/*
 * Universal-intrinsics mapping for 256-bit AVX vectors.
 *
 * Maps the generic v_* names onto _mm256_* intrinsics for both single
 * precision (f32, 8 lanes) and double precision (f64, 4 lanes).
 * <immintrin.h> must already be included by the including file.
 */
#define V_SIMD 256
#define V_SIMD_F64 1
/*
Data Type
*/
typedef __m256  v_f32;
typedef __m256d v_f64;
#define v_nlanes_f32 8
#define v_nlanes_f64 4
/*
arithmetic
*/
#define v_add_f32 _mm256_add_ps
#define v_add_f64 _mm256_add_pd
#define v_mul_f32 _mm256_mul_ps
#define v_mul_f64 _mm256_mul_pd
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm256_loadu_ps
#define v_loadu_f64 _mm256_loadu_pd
// unaligned store: (pointer, vector)
#define v_storeu_f32 _mm256_storeu_ps
#define v_storeu_f64 _mm256_storeu_pd
// broadcast one scalar to every lane
#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
#define v_setall_f64(VAL) _mm256_set1_pd(VAL)

kernel/simd/intrin_avx512.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
/*
 * Universal-intrinsics mapping for 512-bit AVX-512 vectors.
 *
 * Maps the generic v_* names onto _mm512_* intrinsics for both single
 * precision (f32, 16 lanes) and double precision (f64, 8 lanes).
 * <immintrin.h> must already be included by the including file.
 */
#define V_SIMD 512
#define V_SIMD_F64 1
/*
Data Type
*/
typedef __m512  v_f32;
typedef __m512d v_f64;
#define v_nlanes_f32 16
#define v_nlanes_f64 8
/*
arithmetic
*/
#define v_add_f32 _mm512_add_ps
#define v_add_f64 _mm512_add_pd
#define v_mul_f32 _mm512_mul_ps
#define v_mul_f64 _mm512_mul_pd
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm512_loadu_ps
#define v_loadu_f64 _mm512_loadu_pd
// unaligned store: (pointer, vector). These are plain aliases so that both
// arguments reach the intrinsic; a one-parameter wrapper cannot be invoked
// as v_storeu_f32(ptr, vec).
#define v_storeu_f32 _mm512_storeu_ps
#define v_storeu_f64 _mm512_storeu_pd
// broadcast one scalar to every lane
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
#define v_setall_f64(VAL) _mm512_set1_pd(VAL)

kernel/simd/intrin_sse.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
/*
 * Universal-intrinsics mapping for 128-bit SSE/SSE2 vectors.
 *
 * Maps the generic v_* names onto _mm_* intrinsics for both single
 * precision (f32, 4 lanes, SSE) and double precision (f64, 2 lanes, SSE2).
 * <xmmintrin.h> and <emmintrin.h> must already be included by the
 * including file.
 */
#define V_SIMD 128
#define V_SIMD_F64 1
/*
Data Type
*/
typedef __m128  v_f32;
typedef __m128d v_f64;
#define v_nlanes_f32 4
#define v_nlanes_f64 2
/*
arithmetic
*/
#define v_add_f32 _mm_add_ps
#define v_add_f64 _mm_add_pd
#define v_mul_f32 _mm_mul_ps
#define v_mul_f64 _mm_mul_pd
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm_loadu_ps
#define v_loadu_f64 _mm_loadu_pd
// unaligned store: (pointer, vector)
#define v_storeu_f32 _mm_storeu_ps
#define v_storeu_f64 _mm_storeu_pd
// broadcast one scalar to every lane
#define v_setall_f32(VAL) _mm_set1_ps(VAL)
#define v_setall_f64(VAL) _mm_set1_pd(VAL)

kernel/x86_64/daxpy.c

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4545
#include "daxpy_microk_sandy-2.c"
4646
#endif
4747

48-
4948
#ifndef HAVE_KERNEL_8
49+
#include"../simd/intrin.h"
5050

51-
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
51+
void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
5252
{
5353
BLASLONG register i = 0;
5454
FLOAT a = *alpha;
55-
55+
#if V_SIMD
56+
v_f32 __alpha, tmp;
57+
__alpha = v_setall_f32(*alpha);
58+
const int vstep = v_nlanes_f32;
59+
for (; i < n; i += vstep) {
60+
tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i )));
61+
v_storeu_f32(y + i, tmp);
62+
}
63+
#else
5664
while(i < n)
57-
{
58-
y[i] += a * x[i];
59-
y[i+1] += a * x[i+1];
60-
y[i+2] += a * x[i+2];
61-
y[i+3] += a * x[i+3];
62-
y[i+4] += a * x[i+4];
63-
y[i+5] += a * x[i+5];
64-
y[i+6] += a * x[i+6];
65-
y[i+7] += a * x[i+7];
66-
i+=8 ;
67-
68-
}
69-
65+
{
66+
y[i] += a * x[i];
67+
y[i+1] += a * x[i+1];
68+
y[i+2] += a * x[i+2];
69+
y[i+3] += a * x[i+3];
70+
y[i+4] += a * x[i+4];
71+
y[i+5] += a * x[i+5];
72+
y[i+6] += a * x[i+6];
73+
y[i+7] += a * x[i+7];
74+
i+=8 ;
75+
}
76+
#endif
7077
}
7178

7279
#endif

0 commit comments

Comments
 (0)