Skip to content

Commit 7a4b3cf

Browse files
author
martin
committed
Add trivially optimized DSDOT for POWER8
1 parent 5056a04 commit 7a4b3cf

File tree

2 files changed

+47
-9
lines changed

2 files changed

+47
-9
lines changed

kernel/power/KERNEL.POWER8

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,7 @@ ZCOPYKERNEL = zcopy.c
122122
#
123123
SDOTKERNEL = sdot.c
124124
DDOTKERNEL = ddot.c
125+
DSDOTKERNEL = sdot.c
125126
#CDOTKERNEL = ../arm/zdot.c
126127
ZDOTKERNEL = zdot.c
127128
#

kernel/power/sdot.c

Lines changed: 46 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/***************************************************************************
2-
Copyright (c) 2013-2016, The OpenBLAS Project
2+
Copyright (c) 2013-2017, The OpenBLAS Project
33
All rights reserved.
44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions are
@@ -66,42 +66,76 @@ static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
6666

6767
#endif
6868

69+
#if defined (DSDOT)
70+
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
71+
#else
6972
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
73+
#endif
7074
{
7175
BLASLONG i=0;
7276
BLASLONG ix=0,iy=0;
77+
double dot = 0.0 ;
7378

74-
FLOAT dot = 0.0 ;
79+
#if defined (DSDOT)
80+
double mydot = 0.0;
81+
FLOAT asmdot = 0.0;
82+
#else
83+
FLOAT mydot=0.0;
84+
#endif
85+
BLASLONG n1;
7586

7687
if ( n <= 0 ) return(dot);
7788

7889
if ( (inc_x == 1) && (inc_y == 1) )
7990
{
8091

81-
BLASLONG n1 = n & -32;
92+
n1 = n & (BLASLONG)(-32);
8293

8394
if ( n1 )
84-
dot = sdot_kernel_16(n1, x, y);
85-
95+
#if defined(DSDOT)
96+
{
97+
FLOAT *x1=x;
98+
FLOAT *y1=y;
99+
BLASLONG n2 = 32;
100+
while (i<n1) {
101+
asmdot = sdot_kernel_16(n2, x1, y1);
102+
mydot += (double)asmdot;
103+
asmdot=0.;
104+
x1+=32;
105+
y1+=32;
106+
i+=32;
107+
}
108+
}
109+
#else
110+
mydot = sdot_kernel_16(n1, x, y);
111+
#endif
86112
i = n1;
87113
while(i < n)
88114
{
89-
115+
#if defined(DSDOT)
116+
dot += (double)y[i] * (double)x[i] ;
117+
#else
90118
dot += y[i] * x[i] ;
119+
#endif
91120
i++ ;
92121

93122
}
123+
124+
dot+=mydot;
94125
return(dot);
95126

96127

97128
}
98129

99-
BLASLONG n1 = n & -2;
130+
n1 = n & (BLASLONG)(-2);
100131

101132
while(i < n1)
102133
{
103-
134+
#if defined (DSDOT)
135+
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
136+
#else
104137
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
138+
#endif
105139
ix += inc_x*2 ;
106140
iy += inc_y*2 ;
107141
i+=2 ;
@@ -110,8 +144,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
110144

111145
while(i < n)
112146
{
113-
147+
#if defined (DSDOT)
148+
dot += (double)y[iy] * (double)x[ix] ;
149+
#else
114150
dot += y[iy] * x[ix] ;
151+
#endif
115152
ix += inc_x ;
116153
iy += inc_y ;
117154
i++ ;

0 commit comments

Comments
 (0)