Skip to content

Commit cb3c190

Browse files
committed
Implementaion of dasum, sasum with AVX2 & AVX512 intrinsic
1 parent 75eeb26 commit cb3c190

File tree

7 files changed

+327
-0
lines changed

7 files changed

+327
-0
lines changed

kernel/x86_64/KERNEL.HASWELL

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
100100
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
101101
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c
102102

103+
SASUMKERNEL = sasum.c
104+
DASUMKERNEL = dasum.c

kernel/x86_64/dasum.c

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#include "common.h"
2+
#include <math.h>
3+
4+
#define ABS fabs
5+
6+
#if defined(SKYLAKEX)
7+
#include "dasum_microk_skylakex-2.c"
8+
#elif defined(HASWELL)
9+
#include "dasum_microk_haswell-2.c"
10+
#endif
11+
12+
#ifndef HAVE_KERNEL_16
13+
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
14+
{
15+
16+
BLASLONG i=0;
17+
FLOAT *x = x1;
18+
FLOAT temp0, temp1, temp2, temp3;
19+
FLOAT temp4, temp5, temp6, temp7;
20+
FLOAT sum0 = 0.0;
21+
FLOAT sum1 = 0.0;
22+
FLOAT sum2 = 0.0;
23+
FLOAT sum3 = 0.0;
24+
25+
while ( i< n )
26+
{
27+
28+
temp0 = ABS(x[0]);
29+
temp1 = ABS(x[1]);
30+
temp2 = ABS(x[2]);
31+
temp3 = ABS(x[3]);
32+
temp4 = ABS(x[4]);
33+
temp5 = ABS(x[5]);
34+
temp6 = ABS(x[6]);
35+
temp7 = ABS(x[7]);
36+
37+
sum0 += temp0;
38+
sum1 += temp1;
39+
sum2 += temp2;
40+
sum3 += temp3;
41+
42+
sum0 += temp4;
43+
sum1 += temp5;
44+
sum2 += temp6;
45+
sum3 += temp7;
46+
47+
x+=8;
48+
i+=8;
49+
50+
}
51+
52+
return sum0+sum1+sum2+sum3;
53+
}
54+
55+
#endif
56+
57+
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
58+
{
59+
BLASLONG i=0;
60+
FLOAT sumf = 0.0;
61+
BLASLONG n1;
62+
63+
if (n <= 0 || inc_x <= 0) return(sumf);
64+
65+
if ( inc_x == 1 )
66+
{
67+
68+
n1 = n & -16;
69+
if ( n1 > 0 )
70+
{
71+
72+
sumf = dasum_kernel_16(n1, x);
73+
i=n1;
74+
}
75+
76+
while(i < n)
77+
{
78+
sumf += ABS(x[i]);
79+
i++;
80+
}
81+
82+
}
83+
else
84+
{
85+
86+
n *= inc_x;
87+
while(i < n)
88+
{
89+
sumf += ABS(x[i]);
90+
i += inc_x;
91+
}
92+
93+
}
94+
return(sumf);
95+
}
96+
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
2+
3+
#define HAVE_KERNEL_16 1
4+
5+
#include <immintrin.h>
6+
7+
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
8+
{
9+
BLASLONG i = 0;
10+
__m256d accum_0, accum_1, accum_2, accum_3;
11+
12+
accum_0 = _mm256_setzero_pd();
13+
accum_1 = _mm256_setzero_pd();
14+
accum_2 = _mm256_setzero_pd();
15+
accum_3 = _mm256_setzero_pd();
16+
17+
__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
18+
for (; i < n; i += 16) {
19+
accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
20+
accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask);
21+
accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
22+
accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask);
23+
}
24+
25+
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
26+
27+
__m128d half_accum0;
28+
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
29+
30+
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
31+
32+
return half_accum0[0];
33+
34+
}
35+
#endif
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/* need a new enough GCC for avx512 support */
2+
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
3+
4+
#if defined(__AVX512CD__)
5+
#define HAVE_KERNEL_16 1
6+
7+
#include <immintrin.h>
8+
9+
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
10+
{
11+
BLASLONG i = 0;
12+
13+
__m512d accum_0, accum_1;
14+
15+
accum_0 = _mm512_setzero_pd();
16+
accum_1 = _mm512_setzero_pd();
17+
18+
for (; i < n; i += 16) {
19+
accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0]));
20+
accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8]));
21+
}
22+
23+
accum_0 += accum_1;
24+
return _mm512_reduce_add_pd(accum_0);
25+
}
26+
#endif
27+
#endif

kernel/x86_64/sasum.c

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#include "common.h"
2+
#include <math.h>
3+
4+
#if defined(DOUBLE)
5+
6+
#error supports float only
7+
8+
#else
9+
10+
#define ABS fabsf
11+
12+
#endif
13+
14+
#if defined(SKYLAKEX)
15+
#include "sasum_microk_skylakex-2.c"
16+
#elif defined(HASWELL)
17+
#include "sasum_microk_haswell-2.c"
18+
#endif
19+
20+
#ifndef HAVE_KERNEL_32
21+
22+
static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
23+
{
24+
25+
BLASLONG i=0;
26+
FLOAT *x = x1;
27+
FLOAT temp0, temp1, temp2, temp3;
28+
FLOAT temp4, temp5, temp6, temp7;
29+
FLOAT sum0 = 0.0;
30+
FLOAT sum1 = 0.0;
31+
FLOAT sum2 = 0.0;
32+
FLOAT sum3 = 0.0;
33+
34+
while ( i< n )
35+
{
36+
37+
temp0 = ABS(x[0]);
38+
temp1 = ABS(x[1]);
39+
temp2 = ABS(x[2]);
40+
temp3 = ABS(x[3]);
41+
temp4 = ABS(x[4]);
42+
temp5 = ABS(x[5]);
43+
temp6 = ABS(x[6]);
44+
temp7 = ABS(x[7]);
45+
46+
sum0 += temp0;
47+
sum1 += temp1;
48+
sum2 += temp2;
49+
sum3 += temp3;
50+
51+
sum0 += temp4;
52+
sum1 += temp5;
53+
sum2 += temp6;
54+
sum3 += temp7;
55+
56+
x+=8;
57+
i+=8;
58+
59+
}
60+
61+
return sum0+sum1+sum2+sum3;
62+
}
63+
64+
#endif
65+
66+
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
67+
{
68+
BLASLONG i=0;
69+
FLOAT sumf = 0.0;
70+
BLASLONG n1;
71+
72+
if (n <= 0 || inc_x <= 0) return(sumf);
73+
74+
if ( inc_x == 1 )
75+
{
76+
77+
n1 = n & -32;
78+
if ( n1 > 0 )
79+
{
80+
81+
sumf = sasum_kernel_32(n1, x);
82+
i=n1;
83+
}
84+
85+
while(i < n)
86+
{
87+
sumf += ABS(x[i]);
88+
i++;
89+
}
90+
91+
}
92+
else
93+
{
94+
95+
n *= inc_x;
96+
while(i < n)
97+
{
98+
sumf += ABS(x[i]);
99+
i += inc_x;
100+
}
101+
102+
}
103+
return(sumf);
104+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
2+
3+
#define HAVE_KERNEL_32 1
4+
5+
#include <immintrin.h>
6+
7+
static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
8+
{
9+
BLASLONG i = 0;
10+
__m256 accum_0, accum_1, accum_2, accum_3;
11+
12+
accum_0 = _mm256_setzero_ps();
13+
accum_1 = _mm256_setzero_ps();
14+
accum_2 = _mm256_setzero_ps();
15+
accum_3 = _mm256_setzero_ps();
16+
17+
__m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
18+
for (; i < n; i += 32) {
19+
accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
20+
accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
21+
accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask);
22+
accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask);
23+
}
24+
25+
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
26+
27+
__m128 half_accum0;
28+
half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));
29+
30+
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
31+
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
32+
33+
return half_accum0[0];
34+
35+
}
36+
#endif
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/* need a new enough GCC for avx512 support */
2+
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
3+
4+
#if defined(__AVX512CD__)
5+
#define HAVE_KERNEL_32 1
6+
7+
#include <immintrin.h>
8+
9+
static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
10+
{
11+
BLASLONG i = 0;
12+
13+
__m512 accum_0, accum_1;
14+
15+
accum_0 = _mm512_setzero_ps();
16+
accum_1 = _mm512_setzero_ps();
17+
18+
for (; i < n; i += 32) {
19+
accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0]));
20+
accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16]));
21+
}
22+
23+
accum_0 += accum_1;
24+
return _mm512_reduce_add_ps(accum_0);
25+
}
26+
#endif
27+
#endif

0 commit comments

Comments
 (0)