Skip to content

Commit 5c0d0ec

Browse files
committed
Merge pull request #430 from wernsaar/develop
added a better optimized sgemv_n kernel
2 parents c2fdeb6 + 8c05b81 commit 5c0d0ec

11 files changed

+654
-9
lines changed

kernel/x86_64/KERNEL

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ endif
373373
GEMVDEP = ../l2param.h
374374

375375
ifndef SGEMVNKERNEL
376-
SGEMVNKERNEL = ../arm/gemv_n.c
376+
SGEMVNKERNEL = sgemv_n.c
377377
endif
378378

379379
ifndef SGEMVTKERNEL

kernel/x86_64/KERNEL.BULLDOZER

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ifdef OS_WINDOWS
2-
SGEMVNKERNEL = ../arm/gemv_n.c
2+
SGEMVNKERNEL = sgemv_n.c
33
SGEMVTKERNEL = ../arm/gemv_t.c
44
else
5-
SGEMVNKERNEL = sgemv_n_avx.c
5+
SGEMVNKERNEL = sgemv_n.c
66
SGEMVTKERNEL = sgemv_t_avx.c
77
endif
88

kernel/x86_64/KERNEL.HASWELL

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ifdef OS_WINDOWS
2-
SGEMVNKERNEL = ../arm/gemv_n.c
2+
SGEMVNKERNEL = sgemv_n.c
33
SGEMVTKERNEL = ../arm/gemv_t.c
44
else
5-
SGEMVNKERNEL = sgemv_n_avx.c
5+
SGEMVNKERNEL = sgemv_n.c
66
SGEMVTKERNEL = sgemv_t_avx.c
77
endif
88

kernel/x86_64/KERNEL.NEHALEM

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
ifdef OS_WINDOWS
2+
SGEMVNKERNEL = sgemv_n.c
3+
SGEMVTKERNEL = ../arm/gemv_t.c
4+
else
5+
SGEMVNKERNEL = sgemv_n.c
6+
SGEMVTKERNEL = ../arm/gemv_t.c
7+
endif
8+
19

210
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
311
SGEMMINCOPY = gemm_ncopy_4.S

kernel/x86_64/KERNEL.PILEDRIVER

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ifdef OS_WINDOWS
2-
SGEMVNKERNEL = ../arm/gemv_n.c
2+
SGEMVNKERNEL = sgemv_n.c
33
SGEMVTKERNEL = ../arm/gemv_t.c
44
else
5-
SGEMVNKERNEL = sgemv_n_avx.c
5+
SGEMVNKERNEL = sgemv_n.c
66
SGEMVTKERNEL = sgemv_t_avx.c
77
endif
88

kernel/x86_64/KERNEL.SANDYBRIDGE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
ifdef OS_WINDOWS
2-
SGEMVNKERNEL = ../arm/gemv_n.c
2+
SGEMVNKERNEL = sgemv_n.c
33
SGEMVTKERNEL = ../arm/gemv_t.c
44
else
5-
SGEMVNKERNEL = sgemv_n_avx.c
5+
SGEMVNKERNEL = sgemv_n.c
66
SGEMVTKERNEL = sgemv_t_avx.c
77
endif
88

kernel/x86_64/sgemv_n.c

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/***************************************************************************
2+
Copyright (c) 2014, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
29+
#include "common.h"
30+
31+
32+
#if defined(BULLDOZER) || defined(PILEDRIVER)
33+
#include "sgemv_n_microk_bulldozer-2.c"
34+
#elif defined(HASWELL)
35+
#include "sgemv_n_microk_haswell-2.c"
36+
#elif defined(SANDYBRIDGE)
37+
#include "sgemv_n_microk_sandy-2.c"
38+
#elif defined(NEHALEM)
39+
#include "sgemv_n_microk_nehalem-2.c"
40+
#endif
41+
42+
43+
#define NBMAX 4096
44+
45+
#ifndef HAVE_KERNEL_16x4
46+
47+
/*
 * Portable fallback for the four-column kernel, compiled only when no
 * micro-kernel has defined HAVE_KERNEL_16x4.
 *
 * For every row r in [0, n):
 *     y[r] += ap[0][r]*x[0] + ap[1][r]*x[1] + ap[2][r]*x[2] + ap[3][r]*x[3]
 *
 * n   number of rows; rows are consumed four at a time, so n is expected
 *     to be a multiple of 4
 * ap  pointers to the four matrix columns
 * x   the four scalars, one per column
 * y   accumulator, updated in place
 */
static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];
	FLOAT *col2 = ap[2];
	FLOAT *col3 = ap[3];
	BLASLONG base;
	BLASLONG r;

	for (base = 0; base < n; base += 4) {
		/* one group of four consecutive rows */
		for (r = base; r < base + 4; r++) {
			y[r] += col0[r]*x[0] + col1[r]*x[1] + col2[r]*x[2] + col3[r]*x[3];
		}
	}
}
64+
65+
#endif
66+
67+
/*
 * Single-column kernel: y[r] += ap[r] * x[0] for every row r in [0, n).
 *
 * n   number of rows; rows are consumed four at a time, so n is expected
 *     to be a multiple of 4
 * ap  pointer to the matrix column
 * x   only x[0] is read
 * y   accumulator, updated in place
 */
static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	BLASLONG base;
	BLASLONG r;

	for (base = 0; base < n; base += 4) {
		/* one group of four consecutive rows */
		for (r = base; r < base + 4; r++) {
			y[r] += ap[r] * x[0];
		}
	}
}
81+
82+
83+
/* Set the first n elements of dest to zero. */
static void zero_y(BLASLONG n, FLOAT *dest)
{
	BLASLONG k;

	for (k = 0; k < n; k++) {
		dest[k] = 0.0;
	}
}
92+
93+
94+
95+
/*
 * Accumulate n contiguous values from src into dest, where consecutive
 * dest elements are inc_dest apart.
 *
 * The unit-stride path works on groups of four elements, so in that case
 * n is expected to be a multiple of 4; the strided path handles any n.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG k;
	BLASLONG lane;

	if (inc_dest != 1) {
		/* strided destination: one element at a time */
		for (k = 0; k < n; k++) {
			dest[k * inc_dest] += src[k];
		}
		return;
	}

	/* contiguous destination: four elements per outer step */
	for (k = 0; k < n; k += 4) {
		for (lane = 0; lane < 4; lane++) {
			dest[k + lane] += src[k + lane];
		}
	}
}
119+
120+
/*
 * Non-transposed single-precision GEMV driver: y += alpha * A * x.
 *
 * A is addressed column by column (column c starts at a + c*lda).  x and y
 * are stepped with inc_x / inc_y.  dummy1 is unused.
 *
 * Rows are processed in panels of at most NBMAX rows.  Each panel is
 * accumulated into `buffer` (which must therefore hold at least NBMAX
 * elements) and then added to y.  The m % 16 rows that do not fit the
 * 16-row kernels are handled by a scalar dot-product loop at the end.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	const BLASLONG col_quads  = n / 4;                    /* groups of four columns        */
	const BLASLONG col_rest   = n % 4;                    /* leftover single columns       */
	const BLASLONG row_tail   = m % 16;                   /* rows left for the scalar loop */
	const BLASLONG last_panel = (m % NBMAX) - row_tail;   /* height of the partial panel   */
	BLASLONG rows_left = m - row_tail;
	BLASLONG panel = NBMAX;
	FLOAT *acc = buffer;
	FLOAT *y_out = y;
	FLOAT *cols[4];
	FLOAT scaled_x[4];
	FLOAT *col_ptr;
	FLOAT *x_cur;
	BLASLONG c;
	BLASLONG k;
	BLASLONG r;

	do {
		rows_left -= panel;
		if (rows_left < 0) {
			/* fewer than NBMAX rows remain: run one shorter panel, or stop */
			if (last_panel == 0) {
				break;
			}
			panel = last_panel;
		}

		col_ptr = a;
		x_cur = x;
		zero_y(panel, acc);

		/* four columns at a time, with alpha folded into the x values */
		for (c = 0; c < col_quads; c++) {
			for (k = 0; k < 4; k++) {
				scaled_x[k] = alpha * x_cur[0];
				x_cur += inc_x;
			}
			cols[0] = col_ptr;
			cols[1] = col_ptr + lda;
			cols[2] = cols[1] + lda;
			cols[3] = cols[2] + lda;
			sgemv_kernel_16x4(panel, cols, scaled_x, acc);
			col_ptr += 4 * lda;
		}

		/* remaining columns, one at a time */
		for (c = 0; c < col_rest; c++) {
			scaled_x[0] = alpha * x_cur[0];
			x_cur += inc_x;
			sgemv_kernel_16x1(panel, col_ptr, scaled_x, acc);
			col_ptr += 1 * lda;
		}

		add_y(panel, acc, y_out, inc_y);
		a += panel;
		y_out += panel * inc_y;
	} while (panel == NBMAX);   /* a shortened panel is always the last one */

	/* scalar fallback for the trailing m % 16 rows */
	for (r = 0; r < row_tail; r++) {
		FLOAT dot = 0.0;

		col_ptr = a;
		x_cur = x;
		for (c = 0; c < n; c++) {
			dot += col_ptr[0] * x_cur[0];
			col_ptr += lda;
			x_cur += inc_x;
		}
		y_out[0] += alpha * dot;
		y_out += inc_y;
		a++;
	}

	return(0);
}
208+
209+
kernel/x86_64/sgemv_n_microk_bulldozer-2.c

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/***************************************************************************
2+
Copyright (c) 2014, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_16x4 1
29+
/*
 * Four-column SGEMV micro-kernel written with 4-operand vfmaddps (FMA4):
 *
 *     y[r] += ap[0][r]*x[0] + ap[1][r]*x[1] + ap[2][r]*x[2] + ap[3][r]*x[3]
 *
 * for r in [0, n).  Each loop iteration handles 16 rows.  The loop ends
 * only when the remaining count reaches exactly zero, so n must be a
 * positive multiple of 16.
 *
 * noinline keeps one copy of the asm body per translation unit.
 */
static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y) __attribute__ ((noinline));

static void sgemv_kernel_16x4( BLASLONG n, float **ap, float *x, float *y)
{

	BLASLONG i = 0;	/* row index, advanced by the asm loop */

	__asm__  __volatile__
	(
	/* broadcast the four x scalars into xmm12..xmm15 */
	"vbroadcastss    (%2), %%xmm12	 \n\t"	// x0
	"vbroadcastss   4(%2), %%xmm13	 \n\t"	// x1
	"vbroadcastss   8(%2), %%xmm14	 \n\t"	// x2
	"vbroadcastss  12(%2), %%xmm15	 \n\t"	// x3

	".align 16				 \n\t"
	".L01LOOP%=:				 \n\t"
	/* load 16 consecutive y values */
	"vmovups	(%3,%0,4), %%xmm4	 \n\t"	// 4 * y
	"vmovups      16(%3,%0,4), %%xmm5	 \n\t"	// 4 * y
	"vmovups      32(%3,%0,4), %%xmm6	 \n\t"	// 4 * y
	"vmovups      48(%3,%0,4), %%xmm7	 \n\t"	// 4 * y

	/* y += column0 * x0 */
	"prefetcht0	 192(%4,%0,4)		       \n\t"
	"vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
	"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
	/* y += column1 * x1 */
	"prefetcht0	 192(%5,%0,4)		       \n\t"
	"vfmaddps %%xmm4,   (%5,%0,4), %%xmm13, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
	"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
	/* y += column2 * x2 */
	"prefetcht0	 192(%6,%0,4)		       \n\t"
	"vfmaddps %%xmm4,   (%6,%0,4), %%xmm14, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
	/* y += column3 * x3 */
	"prefetcht0	 192(%7,%0,4)		       \n\t"
	"vfmaddps %%xmm4,   (%7,%0,4), %%xmm15, %%xmm4 \n\t"
	"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
	"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
	"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"

	/* store the 16 updated y values */
	"vmovups  %%xmm4,   (%3,%0,4)		      \n\t"	// 4 * y
	"vmovups  %%xmm5, 16(%3,%0,4)		      \n\t"	// 4 * y
	"vmovups  %%xmm6, 32(%3,%0,4)		      \n\t"	// 4 * y
	"vmovups  %%xmm7, 48(%3,%0,4)		      \n\t"	// 4 * y

	"addq		$16, %0	  	 	      \n\t"
	"subq	        $16, %1		      	      \n\t"
	"jnz		.L01LOOP%=		      \n\t"

	:
	  /*
	   * %0 and %1 are written by addq/subq above, so they must be
	   * read-write operands.  Declaring them as plain inputs would let
	   * the compiler assume the registers still hold the original values.
	   */
          "+r" (i),	// 0
	  "+r" (n)	// 1
	:
          "r" (x),      // 2
          "r" (y),      // 3
          "r" (ap[0]),  // 4
          "r" (ap[1]),  // 5
          "r" (ap[2]),  // 6
          "r" (ap[3])   // 7
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
98+
99+

0 commit comments

Comments
 (0)