Skip to content

Commit ab87ee6

Browse files
authored
Merge pull request #1329 from martin-frbg/dsdot
(Trivial) optimized dsdot implementation for HASWELL
2 parents b71f4fe + a07807c commit ab87ee6

File tree

3 files changed: 42 additions (+42) and 5 deletions (-5)

3 files changed: 42 additions (+42) and 5 deletions (-5)

kernel/x86_64/KERNEL.HASWELL

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@ DDOTKERNEL = ddot.c
2424
CDOTKERNEL = cdot.c
2525
ZDOTKERNEL = zdot.c
2626

27+
DSDOTKERNEL = sdot.c
28+
2729
SAXPYKERNEL = saxpy.c
2830
DAXPYKERNEL = daxpy.c
2931
CAXPYKERNEL = caxpy.c

kernel/x86_64/sdot.c

Lines changed: 38 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -68,13 +68,22 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
6868

6969
#endif
7070

71+
#if defined (DSDOT)
72+
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
73+
#else
7174
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
75+
#endif
7276
{
7377
BLASLONG i=0;
7478
BLASLONG ix=0,iy=0;
7579
double dot = 0.0 ;
7680

81+
#if defined (DSDOT)
82+
double mydot = 0.0;
83+
FLOAT asmdot = 0.0;
84+
#else
7785
FLOAT mydot=0.0;
86+
#endif
7887
BLASLONG n1;
7988

8089
if ( n <= 0 ) return(dot);
@@ -85,17 +94,35 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
8594
n1 = n & (BLASLONG)(-32);
8695

8796
if ( n1 )
97+
#if defined(DSDOT)
98+
{
99+
FLOAT *x1=x;
100+
FLOAT *y1=y;
101+
BLASLONG n2 = 32;
102+
while (i<n1) {
103+
sdot_kernel_16(n2, x1, y1 , &asmdot );
104+
mydot += (double)asmdot;
105+
asmdot=0.;
106+
x1+=32;
107+
y1+=32;
108+
i+=32;
109+
}
110+
}
111+
#else
88112
sdot_kernel_16(n1, x, y , &mydot );
89-
90-
113+
#endif
91114
i = n1;
92115
while(i < n)
93116
{
94-
117+
#if defined(DSDOT)
118+
dot += (double)y[i] * (double)x[i] ;
119+
#else
95120
dot += y[i] * x[i] ;
121+
#endif
96122
i++ ;
97123

98124
}
125+
99126
dot+=mydot;
100127
return(dot);
101128

@@ -106,8 +133,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
106133

107134
while(i < n1)
108135
{
109-
136+
#if defined (DSDOT)
137+
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
138+
#else
110139
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
140+
#endif
111141
ix += inc_x*2 ;
112142
iy += inc_y*2 ;
113143
i+=2 ;
@@ -116,8 +146,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
116146

117147
while(i < n)
118148
{
119-
149+
#if defined (DSDOT)
150+
dot += (double)y[iy] * (double)x[ix] ;
151+
#else
120152
dot += y[iy] * x[ix] ;
153+
#endif
121154
ix += inc_x ;
122155
iy += inc_y ;
123156
i++ ;

kernel/x86_64/sdot_microk_haswell-2.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,9 +53,11 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
5353
"vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t" // 2 * y
5454
"vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t" // 2 * y
5555

56+
#ifndef DSDOT
5657
"addq $32 , %0 \n\t"
5758
"subq $32 , %1 \n\t"
5859
"jnz 1b \n\t"
60+
#endif
5961

6062
"vextractf128 $1 , %%ymm4 , %%xmm12 \n\t"
6163
"vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"

0 commit comments

Comments
 (0)