|
62 | 62 | #endif
|
63 | 63 | #endif
|
64 | 64 |
|
65 |
| -#ifndef TRANSA |
| 65 | +#ifndef thread_local |
| 66 | +# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__ |
| 67 | +# define thread_local _Thread_local |
| 68 | +# elif defined _WIN32 && ( \ |
| 69 | + defined _MSC_VER || \ |
| 70 | + defined __ICL || \ |
| 71 | + defined __DMC__ || \ |
| 72 | + defined __BORLANDC__ ) |
| 73 | +# define thread_local __declspec(thread) |
| 74 | +/* note that ICC (linux) and Clang are covered by __GNUC__ */ |
| 75 | +# elif defined __GNUC__ || \ |
| 76 | + defined __SUNPRO_C || \ |
| 77 | + defined __xlC__ |
| 78 | +# define thread_local __thread |
| 79 | +# else |
| 80 | +# define UNSAFE |
| 81 | +#endif |
| 82 | +#endif |
| 83 | +#if defined USE_OPENMP |
| 84 | +#undef UNSAFE |
| 85 | +#endif |
| 86 | + |
| 87 | +#if !defined(TRANSA) && !defined(UNSAFE) |
66 | 88 | #define Y_DUMMY_NUM 1024
|
| 89 | +#if defined(USE_OPENMP) |
67 | 90 | static FLOAT y_dummy[Y_DUMMY_NUM];
|
| 91 | +#pragma omp threadprivate(y_dummy) |
| 92 | +# else |
| 93 | +static thread_local FLOAT y_dummy[Y_DUMMY_NUM]; |
| 94 | +# endif |
68 | 95 | #endif
|
69 | 96 |
|
70 | 97 | static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
@@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
105 | 132 | #ifdef TRANSA
|
106 | 133 | y += n_from * incy * COMPSIZE;
|
107 | 134 | #else
|
| 135 | +# ifndef UNSAFE |
108 | 136 | //for split matrix row (n) direction and vector x of gemv_n
|
109 | 137 | x += n_from * incx * COMPSIZE;
|
110 | 138 | //store partial result for every thread
|
111 | 139 | y += (m_to - m_from) * 1 * COMPSIZE * pos;
|
| 140 | +# endif |
112 | 141 | #endif
|
113 | 142 | }
|
114 | 143 |
|
@@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
136 | 165 |
|
137 | 166 | BLASLONG width, i, num_cpu;
|
138 | 167 |
|
139 |
| -#ifndef TRANSA |
| 168 | +#if !defined(TRANSA) && !defined(UNSAFE) |
140 | 169 | int split_x=0;
|
141 | 170 | #endif
|
142 | 171 |
|
@@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
212 | 241 | i -= width;
|
213 | 242 | }
|
214 | 243 |
|
215 |
| -#ifndef TRANSA |
| 244 | +#if !defined(TRANSA) && !defined(UNSAFE) |
216 | 245 | //try to split matrix on row direction and x.
|
217 | 246 | //Then, reduction.
|
218 | 247 | if (num_cpu < nthreads) {
|
@@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
272 | 301 | exec_blas(num_cpu, queue);
|
273 | 302 | }
|
274 | 303 |
|
275 |
| -#ifndef TRANSA |
| 304 | +#if !defined(TRANSA) && !defined(UNSAFE) |
276 | 305 | if(split_x==1){
|
277 | 306 | //reduction
|
278 | 307 | for(i=0; i<num_cpu; i++){
|
|
0 commit comments