Skip to content

Commit 840e010

Browse files
authored
Merge pull request #1491 from martin-frbg/ddot_mt
Add multithreading support for Haswell DDOT
2 parents 73c5ca7 + a55694d commit 840e010

File tree

1 file changed

+66
-1
lines changed

1 file changed

+66
-1
lines changed

kernel/x86_64/ddot.c

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343
#include "ddot_microk_sandy-2.c"
4444
#endif
4545

46+
/*
 * RETURN_TYPE is the type in which per-thread partial results are stored
 * and read back. DSDOT builds use double; every other build uses FLOAT.
 */
#if defined(DSDOT)
#define RETURN_TYPE double
#else
#define RETURN_TYPE FLOAT
#endif
51+
4652

4753
#ifndef HAVE_KERNEL_8
4854

@@ -71,7 +77,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
7177

7278
#endif
7379

74-
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
80+
static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
7581
{
7682
BLASLONG i=0;
7783
BLASLONG ix=0,iy=0;
@@ -139,4 +145,63 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
139145

140146
}
141147

148+
#if defined(SMP)
149+
/*
 * Worker run for one slice of the vectors when the dot product is split
 * across threads (it is handed to blas_level1_thread_with_return_value
 * by CNAME).
 *
 * n            number of elements in this slice
 * x, inc_x     first operand and its stride
 * y, inc_y     second operand and its stride
 * result       slot that receives this slice's partial dot product
 * dummy0..3    unused; present only to match the generic worker signature
 *
 * Always returns 0.
 */
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, RETURN_TYPE *result, BLASLONG dummy3)
{
	/* Compute the partial sum for this slice, then publish it. */
	RETURN_TYPE partial = dot_compute(n, x, inc_x, y, inc_y);

	result[0] = partial;

	return 0;
}
157+
158+
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
159+
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
160+
void *c, BLASLONG ldc, int (*function)(), int nthreads);
161+
#endif
142162

163+
/*
 * Dot product entry point: returns the sum over i of x[i*inc_x] * y[i*inc_y]
 * as computed by dot_compute().
 *
 * n            number of elements
 * x, inc_x     first operand and its stride
 * y, inc_y     second operand and its stride
 *
 * Without SMP the work is done by a single dot_compute() call. With SMP the
 * vectors are split across threads when the problem is large enough; each
 * thread stores a partial sum through dot_thread_function() and the partial
 * sums are added up here.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
	int nthreads;
	/*
	 * Placeholder for the alpha argument of the threading driver. The dot
	 * worker ignores it, but its address is handed to external code, so
	 * give it a defined value rather than passing indeterminate storage.
	 */
	FLOAT dummy_alpha = 0.0;
#endif
	FLOAT dot = 0.0;

#if defined(SMP)
	/*
	 * Decide on the single-threaded path first: a zero increment or a
	 * small vector (n <= 10000) never uses threads, so there is no point
	 * in querying the number of available CPUs in those cases.
	 */
	if (inc_x == 0 || inc_y == 0 || n <= 10000)
		nthreads = 1;
	else
		nthreads = num_cpu_avail(1);

	if (nthreads == 1) {
		dot = dot_compute(n, x, inc_x, y, inc_y);
	} else {
		int mode, i;
		/*
		 * One slot of sizeof(double) * 2 bytes per possible thread.
		 * NOTE(review): the slot size must match the layout used by
		 * blas_level1_thread_with_return_value, which is not visible
		 * here -- confirm against the driver.
		 */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		RETURN_TYPE *ptr;

#if !defined(DOUBLE)
		mode = BLAS_SINGLE | BLAS_REAL;
#else
		mode = BLAS_DOUBLE | BLAS_REAL;
#endif
		/*
		 * Cast the worker straight to the function-pointer type the
		 * prototype expects; going through void * would convert a
		 * function pointer to an object pointer, which ISO C does not
		 * define.
		 */
		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
			x, inc_x, y, inc_y, result, 0,
			(int (*)())dot_thread_function, nthreads);

		/* Accumulate the per-thread partial sums, one slot at a time. */
		ptr = (RETURN_TYPE *)result;
		for (i = 0; i < nthreads; i++) {
			dot = dot + (*ptr);
			ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
		}
	}
#else
	dot = dot_compute(n, x, inc_x, y, inc_y);
#endif

	return dot;
}

0 commit comments

Comments
 (0)