Skip to content

Commit 14f7dad

Browse files
committed
performance improved
1 parent 325b539 commit 14f7dad

File tree

5 files changed

+46
-3
lines changed

5 files changed

+46
-3
lines changed

kernel/simd/intrin.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,26 @@
11
#ifndef _INTRIN_H_
22
#define _INTRIN_H_
33

4+
/*
 * BLAS_INLINE: compiler-specific spelling of the "inline" keyword.
 *   - MSVC                      -> __inline
 *   - GCC-compatible, strict    -> __inline__ (selected when __STRICT_ANSI__
 *                                  is defined)
 *   - GCC-compatible, otherwise -> inline
 *   - any other compiler        -> expands to nothing
 */
#if defined(_MSC_VER)
5+
#define BLAS_INLINE __inline
6+
#elif defined(__GNUC__)
7+
#if defined(__STRICT_ANSI__)
8+
#define BLAS_INLINE __inline__
9+
#else
10+
#define BLAS_INLINE inline
11+
#endif
12+
#else
13+
#define BLAS_INLINE
14+
#endif
15+
16+
/*
 * BLAS_FINLINE: storage class plus inlining request for small helper
 * functions defined in headers. Every branch includes `static`.
 *   - MSVC               -> static __forceinline
 *   - GCC-compatible     -> static BLAS_INLINE __attribute__((always_inline))
 *   - any other compiler -> plain static, with no inlining request
 */
#ifdef _MSC_VER
17+
#define BLAS_FINLINE static __forceinline
18+
#elif defined(__GNUC__)
19+
#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
20+
#else
21+
#define BLAS_FINLINE static
22+
#endif
23+
424
#ifdef __cplusplus
525
extern "C" {
626
#endif

kernel/simd/intrin_avx.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@ arithmetic
1010
*/
1111
#define v_add_f32 _mm256_add_ps
1212
#define v_mul_f32 _mm256_mul_ps
13+
14+
/*
 * v_muladd_f32(a, b, c): a*b + c.
 * With HAVE_FMA3 it is an alias for _mm256_fmadd_ps; otherwise it is a
 * function built from v_mul_f32 followed by v_add_f32.
 * NOTE(review): a fused multiply-add normally rounds once while the
 * fallback rounds after the multiply and again after the add, so the two
 * paths may differ in the last bit -- confirm callers do not need
 * bit-identical results across builds.
 */
#ifdef HAVE_FMA3
15+
// multiply and add, a*b + c
16+
#define v_muladd_f32 _mm256_fmadd_ps
17+
#else
18+
// multiply and add, a*b + c
19+
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
20+
{ return v_add_f32(v_mul_f32(a, b), c); }
21+
#endif // !HAVE_FMA3
22+
1323
/*
1424
memory
1525
*/

kernel/simd/intrin_avx512.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ arithmetic
1010
*/
1111
#define v_add_f32 _mm512_add_ps
1212
#define v_mul_f32 _mm512_mul_ps
13+
// multiply and add, a*b + c
// Defined unconditionally here (no HAVE_FMA3 guard, no fallback function).
14+
#define v_muladd_f32 _mm512_fmadd_ps
1315
/*
1416
memory
1517
*/
1618
// unaligned load
1719
#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
18-
#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR))
20+
#define v_storeu_f32 _mm512_storeu_ps
1921
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)

kernel/simd/intrin_sse.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,17 @@ arithmetic
1010
*/
1111
#define v_add_f32 _mm_add_ps
1212
#define v_mul_f32 _mm_mul_ps
13+
/*
 * v_muladd_f32(a, b, c): a*b + c, chosen in priority order:
 *   1. HAVE_FMA3 -> alias for _mm_fmadd_ps
 *   2. HAVE_FMA4 -> alias for _mm_macc_ps
 *   3. otherwise -> function built from v_mul_f32 followed by v_add_f32
 * NOTE(review): the fused variants normally round once while the fallback
 * rounds twice, so results may differ in the last bit -- confirm callers do
 * not need bit-identical results across builds.
 */
#ifdef HAVE_FMA3
14+
// multiply and add, a*b + c
15+
#define v_muladd_f32 _mm_fmadd_ps
16+
#elif defined(HAVE_FMA4)
17+
// multiply and add, a*b + c
18+
#define v_muladd_f32 _mm_macc_ps
19+
#else
20+
// multiply and add, a*b + c
21+
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
22+
{ return v_add_f32(v_mul_f32(a, b), c); }
23+
#endif // HAVE_FMA3 / HAVE_FMA4 / fallback
1324
/*
1425
memory
1526
*/

kernel/x86_64/daxpy.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4848
#ifndef HAVE_KERNEL_8
4949
#include"../simd/intrin.h"
5050

51-
void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
51+
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
5252
{
5353
BLASLONG register i = 0;
5454
FLOAT a = *alpha;
@@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
5757
__alpha = v_setall_f32(*alpha);
5858
const int vstep = v_nlanes_f32;
5959
for (; i < n; i += vstep) {
60-
tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i )));
60+
tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
6161
v_storeu_f32(y + i, tmp);
6262
}
6363
#else

0 commit comments

Comments
 (0)