Skip to content

Commit bb9876d

Browse files
authored
Fix thread races and infinite looping on systems with many cpus
On systems with more than 64 cpus, blas_quickdivide will sometimes return zero which creates bogus workloads when used for the stride calculation. This then leads to threads spinning incessantly waiting for a status change that never happens, as seen in #1497. This patch also fixes several data races that were found by helgrind and/or tsan while debugging the issue.
1 parent 0ab5bf1 commit bb9876d

File tree

1 file changed

+88
-8
lines changed

1 file changed

+88
-8
lines changed

lapack/getrf/getrf_parallel.c

Lines changed: 88 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,26 @@ double sqrt(double);
6767
#undef GETRF_FACTOR
6868
#define GETRF_FACTOR 1.00
6969

70+
71+
#if defined(USE_PTHREAD_LOCK)
72+
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
73+
#elif defined(USE_PTHREAD_SPINLOCK)
74+
static pthread_spinlock_t getrf_lock = 0;
75+
#else
76+
static BLASULONG getrf_lock = 0UL;
77+
#endif
78+
79+
#if defined(USE_PTHREAD_LOCK)
80+
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
81+
#elif defined(USE_PTHREAD_SPINLOCK)
82+
static pthread_spinlock_t getrf_flag_lock = 0;
83+
#else
84+
static BLASULONG getrf_flag_lock = 0UL;
85+
#endif
86+
87+
88+
89+
7090
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
7191

7292
double m = (double)(M - IS - BK);
@@ -217,7 +237,10 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
217237

218238
blasint *ipiv = (blasint *)args -> c;
219239

220-
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
240+
//_Atomic
241+
BLASLONG jw;
242+
243+
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
221244

222245
if (args -> a == NULL) {
223246
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
@@ -245,8 +268,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
245268
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
246269

247270
for (i = 0; i < args -> nthreads; i++)
271+
#if 1
272+
{
273+
LOCK_COMMAND(&getrf_lock);
274+
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
275+
UNLOCK_COMMAND(&getrf_lock);
276+
do {
277+
LOCK_COMMAND(&getrf_lock);
278+
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
279+
UNLOCK_COMMAND(&getrf_lock);
280+
} while (jw);
281+
}
282+
#else
248283
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
249-
284+
#endif
250285
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
251286
min_jj = MIN(n_to, xxx + div_n) - jjs;
252287
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -283,18 +318,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
283318
b + (is + jjs * lda) * COMPSIZE, lda, is);
284319
}
285320
}
286-
287321
MB;
288-
for (i = 0; i < args -> nthreads; i++)
322+
for (i = 0; i < args -> nthreads; i++) {
323+
LOCK_COMMAND(&getrf_lock);
289324
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
290-
325+
UNLOCK_COMMAND(&getrf_lock);
326+
}
291327
}
292328

329+
LOCK_COMMAND(&getrf_flag_lock);
293330
flag[mypos * CACHE_LINE_SIZE] = 0;
331+
UNLOCK_COMMAND(&getrf_flag_lock);
294332

295333
if (m == 0) {
296334
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
335+
LOCK_COMMAND(&getrf_lock);
297336
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
337+
UNLOCK_COMMAND(&getrf_lock);
298338
}
299339
}
300340

@@ -318,7 +358,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
318358
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
319359

320360
if ((current != mypos) && (!is)) {
361+
#if 1
362+
LOCK_COMMAND(&getrf_lock);
363+
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
364+
UNLOCK_COMMAND(&getrf_lock);
365+
do {
366+
LOCK_COMMAND(&getrf_lock);
367+
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
368+
UNLOCK_COMMAND(&getrf_lock);
369+
} while (jw == 0);
370+
#else
321371
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
372+
#endif
322373
}
323374

324375
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
@@ -327,7 +378,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
327378

328379
MB;
329380
if (is + min_i >= m) {
381+
LOCK_COMMAND(&getrf_lock);
330382
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
383+
UNLOCK_COMMAND(&getrf_lock);
331384
}
332385
}
333386

@@ -339,7 +392,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
339392

340393
for (i = 0; i < args -> nthreads; i++) {
341394
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
395+
#if 1
396+
LOCK_COMMAND(&getrf_lock);
397+
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
398+
UNLOCK_COMMAND(&getrf_lock);
399+
do {
400+
LOCK_COMMAND(&getrf_lock);
401+
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
402+
UNLOCK_COMMAND(&getrf_lock);
403+
} while(jw != 0);
404+
#else
342405
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
406+
#endif
343407
}
344408
}
345409

@@ -374,6 +438,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
374438
BLASLONG i, j, k, is, bk;
375439

376440
BLASLONG num_cpu;
441+
BLASLONG f;
377442

378443
#ifdef _MSC_VER
379444
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
@@ -501,11 +566,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
501566
if (mm >= nn) {
502567

503568
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
569+
if (width == 0) width = nn;
504570
if (nn < width) width = nn;
505571
nn -= width;
506572
range_N[num_cpu + 1] = range_N[num_cpu] + width;
507573

508574
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
575+
if (width == 0) width = mm;
509576
if (mm < width) width = mm;
510577
if (nn <= 0) width = mm;
511578
mm -= width;
@@ -514,11 +581,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
514581
} else {
515582

516583
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
584+
if (width == 0) width = mm;
517585
if (mm < width) width = mm;
518586
mm -= width;
519587
range_M[num_cpu + 1] = range_M[num_cpu] + width;
520588

521589
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
590+
if (width == 0) width = nn;
522591
if (nn < width) width = nn;
523592
if (mm <= 0) width = nn;
524593
nn -= width;
@@ -561,7 +630,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
561630
range_n_new[1] = offset + is + bk;
562631

563632
if (num_cpu > 0) {
564-
565633
queue[num_cpu - 1].next = NULL;
566634

567635
exec_blas_async(0, &queue[0]);
@@ -572,8 +640,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
572640

573641
if (iinfo && !info) info = iinfo + is;
574642

575-
for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
576-
643+
for (i = 0; i < num_cpu; i ++) {
644+
#if 1
645+
LOCK_COMMAND(&getrf_flag_lock);
646+
f=flag[i*CACHE_LINE_SIZE];
647+
UNLOCK_COMMAND(&getrf_flag_lock);
648+
while (f!=0) {
649+
LOCK_COMMAND(&getrf_flag_lock);
650+
f=flag[i*CACHE_LINE_SIZE];
651+
UNLOCK_COMMAND(&getrf_flag_lock);
652+
};
653+
#else
654+
while (flag[i*CACHE_LINE_SIZE]) {};
655+
#endif
656+
}
577657
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
578658

579659
} else {

0 commit comments

Comments
 (0)