3
3
/***************************
4
4
* Data Type
5
5
***************************/
6
- #ifdef DOUBLE
7
- typedef __m128d v_f32 ;
8
- #else
9
6
typedef __m128 v_f32 ;
10
- #endif
11
-
12
7
#define v_nlanes_f32 4
13
8
/***************************
14
9
* Arithmetic
15
10
***************************/
16
- #ifdef DOUBLE
17
- #define v_add_f32 _mm_add_pd
18
- #define v_mul_f32 _mm_mul_pd
19
- #else
20
11
#define v_add_f32 _mm_add_ps
21
12
#define v_mul_f32 _mm_mul_ps
22
- #endif
23
13
#ifdef HAVE_FMA3
24
14
// multiply and add, a*b + c
25
- #ifdef DOUBLE
26
- #define v_muladd_f32 _mm_fmadd_pd
27
- #else
28
- #define v_muladd_f32 _mm_fmadd_ps
29
- #endif
15
+ #define v_muladd_f32 _mm_fmadd_ps
30
16
#elif defined(HAVE_FMA4 )
31
17
// multiply and add, a*b + c
32
- #ifdef DOUBLE
33
- #define v_muladd_f32 _mm_macc_pd
34
- #else
35
- #define v_muladd_f32 _mm_macc_ps
36
- #endif
18
+ #define v_muladd_f32 _mm_macc_ps
37
19
#else
38
20
// multiply and add, a*b + c
39
21
BLAS_FINLINE v_f32 v_muladd_f32 (v_f32 a , v_f32 b , v_f32 c )
40
22
{ return v_add_f32 (v_mul_f32 (a , b ), c ); }
41
23
#endif // HAVE_FMA3
42
24
43
- // Horizontal add: Calculates the sum of all vector elements.
44
- #ifdef DOUBLE
45
- BLAS_FINLINE double v_sum_f32 (__m128d a )
46
- {
47
- #ifdef HAVE_SSE3
48
- __m128d sum_halves = _mm_hadd_pd (a , a );
49
- return _mm_cvtsd_f64 (_mm_hadd_pd (sum_halves , sum_halves ));
50
- #else
51
- __m128d t1 = _mm_movehl_pd (a , a );
52
- __m128d t2 = _mm_add_pd (a , t1 );
53
- __m128d t3 = _mm_shuffle_pd (t2 , t2 , 1 );
54
- __m128d t4 = _mm_add_ss (t2 , t3 );
55
- return _mm_cvtsd_f64 (t4 );
56
- #endif
57
- }
58
- #else
59
25
// Horizontal add: Calculates the sum of all vector elements.
60
26
BLAS_FINLINE float v_sum_f32 (__m128 a )
61
27
{
@@ -70,19 +36,11 @@ BLAS_FINLINE float v_sum_f32(__m128 a)
70
36
return _mm_cvtss_f32 (t4 );
71
37
#endif
72
38
}
73
- #endif
74
39
/***************************
75
40
* memory
76
41
***************************/
77
42
// unaligned load
78
- #ifdef DOUBLE
79
- #define v_loadu_f32 _mm_loadu_pd
80
- #define v_storeu_f32 _mm_storeu_pd
81
- #define v_setall_f32 (VAL ) _mm_set1_pd(VAL)
82
- #define v_zero_f32 _mm_setzero_pd
83
- #else
84
43
#define v_loadu_f32 _mm_loadu_ps
85
44
#define v_storeu_f32 _mm_storeu_ps
86
45
#define v_setall_f32 (VAL ) _mm_set1_ps(VAL)
87
- #define v_zero_f32 _mm_setzero_ps
88
- #endif
46
+ #define v_zero_f32 _mm_setzero_ps
0 commit comments