Skip to content

Commit ecbeb80

Browse files
authored
Merge pull request #1865 from martin-frbg/issue1844
Optimize gemv for small M, large N only if it can be done in a threadsafe manner
2 parents 2c5725c + 0427277 commit ecbeb80

File tree

1 file changed

+33
-4
lines changed

1 file changed

+33
-4
lines changed

driver/level2/gemv_thread.c

Lines changed: 33 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,9 +62,36 @@
6262
#endif
6363
#endif
6464

65-
#ifndef TRANSA
65+
#ifndef thread_local
66+
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
67+
# define thread_local _Thread_local
68+
# elif defined _WIN32 && ( \
69+
defined _MSC_VER || \
70+
defined __ICL || \
71+
defined __DMC__ || \
72+
defined __BORLANDC__ )
73+
# define thread_local __declspec(thread)
74+
/* note that ICC (linux) and Clang are covered by __GNUC__ */
75+
# elif defined __GNUC__ || \
76+
defined __SUNPRO_C || \
77+
defined __xlC__
78+
# define thread_local __thread
79+
# else
80+
# define UNSAFE
81+
#endif
82+
#endif
83+
#if defined USE_OPENMP
84+
#undef UNSAFE
85+
#endif
86+
87+
#if !defined(TRANSA) && !defined(UNSAFE)
6688
#define Y_DUMMY_NUM 1024
89+
#if defined(USE_OPENMP)
6790
static FLOAT y_dummy[Y_DUMMY_NUM];
91+
#pragma omp threadprivate(y_dummy)
92+
# else
93+
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
94+
# endif
6895
#endif
6996

7097
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
@@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
105132
#ifdef TRANSA
106133
y += n_from * incy * COMPSIZE;
107134
#else
135+
# ifndef UNSAFE
108136
//for split matrix row (n) direction and vector x of gemv_n
109137
x += n_from * incx * COMPSIZE;
110138
//store partial result for every thread
111139
y += (m_to - m_from) * 1 * COMPSIZE * pos;
140+
# endif
112141
#endif
113142
}
114143

@@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
136165

137166
BLASLONG width, i, num_cpu;
138167

139-
#ifndef TRANSA
168+
#if !defined(TRANSA) && !defined(UNSAFE) /* same guard as the code that reads split_x */
140169
int split_x=0;
141170
#endif
142171

@@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
212241
i -= width;
213242
}
214243

215-
#ifndef TRANSA
244+
#if !defined(TRANSA) && !defined(UNSAFE)
216245
//try to split matrix on row direction and x.
217246
//Then, reduction.
218247
if (num_cpu < nthreads) {
@@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
272301
exec_blas(num_cpu, queue);
273302
}
274303

275-
#ifndef TRANSA
304+
#if !defined(TRANSA) && !defined(UNSAFE)
276305
if(split_x==1){
277306
//reduction
278307
for(i=0; i<num_cpu; i++){

0 commit comments

Comments
 (0)