@@ -67,6 +67,26 @@ double sqrt(double);
67
67
#undef GETRF_FACTOR
68
68
#define GETRF_FACTOR 1.00
69
69
70
+
71
+ #if defined(USE_PTHREAD_LOCK ) /* getrf_lock: file-scope lock handed to LOCK_COMMAND/UNLOCK_COMMAND around every read and write of job[..].working[..][..] in the hunks below; its type follows the configured lock flavor. */
72
+ static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER ; /* pthread mutex variant, statically initialized */
73
+ #elif defined(USE_PTHREAD_SPINLOCK )
74
+ static pthread_spinlock_t getrf_lock = 0 ; /* NOTE(review): POSIX specifies pthread_spin_init() for spinlocks; confirm a literal 0 is a valid unlocked state on every supported target */
75
+ #else
76
+ static BLASULONG getrf_lock = 0UL ; /* fallback: plain unsigned word; presumably 0 means "unlocked" to LOCK_COMMAND (its definition is not visible here) -- TODO confirm */
77
+ #endif
78
+
79
+ #if defined(USE_PTHREAD_LOCK ) /* getrf_flag_lock: separate file-scope lock handed to LOCK_COMMAND/UNLOCK_COMMAND around the flag[.. * CACHE_LINE_SIZE] store in the worker and the flag[] polling loop in the driver; its type follows the configured lock flavor. */
80
+ static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER ; /* pthread mutex variant, statically initialized */
81
+ #elif defined(USE_PTHREAD_SPINLOCK )
82
+ static pthread_spinlock_t getrf_flag_lock = 0 ; /* NOTE(review): POSIX specifies pthread_spin_init() for spinlocks; confirm a literal 0 is a valid unlocked state on every supported target */
83
+ #else
84
+ static BLASULONG getrf_flag_lock = 0UL ; /* fallback: plain unsigned word; presumably 0 means "unlocked" to LOCK_COMMAND (its definition is not visible here) -- TODO confirm */
85
+ #endif
86
+
87
+
88
+
89
+
70
90
static __inline BLASLONG FORMULA1 (BLASLONG M , BLASLONG N , BLASLONG IS , BLASLONG BK , BLASLONG T ) {
71
91
72
92
double m = (double )(M - IS - BK );
@@ -217,7 +237,10 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
217
237
218
238
blasint * ipiv = (blasint * )args -> c ;
219
239
220
- volatile BLASLONG * flag = (volatile BLASLONG * )args -> d ;
240
+ //_Atomic
241
+ BLASLONG jw ;
242
+
243
+ _Atomic BLASLONG * flag = (_Atomic BLASLONG * )args -> d ;
221
244
222
245
if (args -> a == NULL ) {
223
246
TRSM_ILTCOPY (k , k , (FLOAT * )args -> b , lda , 0 , sb );
@@ -245,8 +268,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
245
268
for (xxx = n_from , bufferside = 0 ; xxx < n_to ; xxx += div_n , bufferside ++ ) {
246
269
247
270
for (i = 0 ; i < args -> nthreads ; i ++ )
271
+ #if 1
272
+ {
273
+ LOCK_COMMAND (& getrf_lock );
274
+ jw = job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ];
275
+ UNLOCK_COMMAND (& getrf_lock );
276
+ do {
277
+ LOCK_COMMAND (& getrf_lock );
278
+ jw = job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ];
279
+ UNLOCK_COMMAND (& getrf_lock );
280
+ } while (jw );
281
+ }
282
+ #else
248
283
while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {};
249
-
284
+ #endif
250
285
for (jjs = xxx ; jjs < MIN (n_to , xxx + div_n ); jjs += min_jj ){
251
286
min_jj = MIN (n_to , xxx + div_n ) - jjs ;
252
287
if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
@@ -283,18 +318,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
283
318
b + (is + jjs * lda ) * COMPSIZE , lda , is );
284
319
}
285
320
}
286
-
287
321
MB ;
288
- for (i = 0 ; i < args -> nthreads ; i ++ )
322
+ for (i = 0 ; i < args -> nthreads ; i ++ ) {
323
+ LOCK_COMMAND (& getrf_lock );
289
324
job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
290
-
325
+ UNLOCK_COMMAND (& getrf_lock );
326
+ }
291
327
}
292
328
329
+ LOCK_COMMAND (& getrf_flag_lock );
293
330
flag [mypos * CACHE_LINE_SIZE ] = 0 ;
331
+ UNLOCK_COMMAND (& getrf_flag_lock );
294
332
295
333
if (m == 0 ) {
296
334
for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ ) {
335
+ LOCK_COMMAND (& getrf_lock );
297
336
job [mypos ].working [mypos ][CACHE_LINE_SIZE * xxx ] = 0 ;
337
+ UNLOCK_COMMAND (& getrf_lock );
298
338
}
299
339
}
300
340
@@ -318,7 +358,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
318
358
for (xxx = range_n [current ], bufferside = 0 ; xxx < range_n [current + 1 ]; xxx += div_n , bufferside ++ ) {
319
359
320
360
if ((current != mypos ) && (!is )) {
361
+ #if 1
362
+ LOCK_COMMAND (& getrf_lock );
363
+ jw = job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ];
364
+ UNLOCK_COMMAND (& getrf_lock );
365
+ do {
366
+ LOCK_COMMAND (& getrf_lock );
367
+ jw = job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ];
368
+ UNLOCK_COMMAND (& getrf_lock );
369
+ } while (jw == 0 );
370
+ #else
321
371
while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {};
372
+ #endif
322
373
}
323
374
324
375
KERNEL_OPERATION (min_i , MIN (range_n [current + 1 ] - xxx , div_n ), k ,
@@ -327,7 +378,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
327
378
328
379
MB ;
329
380
if (is + min_i >= m ) {
381
+ LOCK_COMMAND (& getrf_lock );
330
382
job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
383
+ UNLOCK_COMMAND (& getrf_lock );
331
384
}
332
385
}
333
386
@@ -339,7 +392,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
339
392
340
393
for (i = 0 ; i < args -> nthreads ; i ++ ) {
341
394
for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ ) {
395
+ #if 1
396
+ LOCK_COMMAND (& getrf_lock );
397
+ jw = job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ];
398
+ UNLOCK_COMMAND (& getrf_lock );
399
+ do {
400
+ LOCK_COMMAND (& getrf_lock );
401
+ jw = job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ];
402
+ UNLOCK_COMMAND (& getrf_lock );
403
+ } while (jw != 0 );
404
+ #else
342
405
while (job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ] ) {};
406
+ #endif
343
407
}
344
408
}
345
409
@@ -374,6 +438,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
374
438
BLASLONG i , j , k , is , bk ;
375
439
376
440
BLASLONG num_cpu ;
441
+ BLASLONG f ;
377
442
378
443
#ifdef _MSC_VER
379
444
BLASLONG flag [MAX_CPU_NUMBER * CACHE_LINE_SIZE ];
@@ -501,11 +566,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
501
566
if (mm >= nn ) {
502
567
503
568
width = blas_quickdivide (nn + args -> nthreads - num_cpu , args -> nthreads - num_cpu - 1 );
569
+ if (width == 0 ) width = nn ;
504
570
if (nn < width ) width = nn ;
505
571
nn -= width ;
506
572
range_N [num_cpu + 1 ] = range_N [num_cpu ] + width ;
507
573
508
574
width = blas_quickdivide (mm + args -> nthreads - num_cpu , args -> nthreads - num_cpu - 1 );
575
+ if (width == 0 ) width = mm ;
509
576
if (mm < width ) width = mm ;
510
577
if (nn <= 0 ) width = mm ;
511
578
mm -= width ;
@@ -514,11 +581,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
514
581
} else {
515
582
516
583
width = blas_quickdivide (mm + args -> nthreads - num_cpu , args -> nthreads - num_cpu - 1 );
584
+ if (width == 0 ) width = mm ;
517
585
if (mm < width ) width = mm ;
518
586
mm -= width ;
519
587
range_M [num_cpu + 1 ] = range_M [num_cpu ] + width ;
520
588
521
589
width = blas_quickdivide (nn + args -> nthreads - num_cpu , args -> nthreads - num_cpu - 1 );
590
+ if (width == 0 ) width = nn ;
522
591
if (nn < width ) width = nn ;
523
592
if (mm <= 0 ) width = nn ;
524
593
nn -= width ;
@@ -561,7 +630,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
561
630
range_n_new [1 ] = offset + is + bk ;
562
631
563
632
if (num_cpu > 0 ) {
564
-
565
633
queue [num_cpu - 1 ].next = NULL ;
566
634
567
635
exec_blas_async (0 , & queue [0 ]);
@@ -572,8 +640,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
572
640
573
641
if (iinfo && !info ) info = iinfo + is ;
574
642
575
- for (i = 0 ; i < num_cpu ; i ++ ) while (flag [i * CACHE_LINE_SIZE ]) {};
576
-
643
+ for (i = 0 ; i < num_cpu ; i ++ ) {
644
+ #if 1
645
+ LOCK_COMMAND (& getrf_flag_lock );
646
+ f = flag [i * CACHE_LINE_SIZE ];
647
+ UNLOCK_COMMAND (& getrf_flag_lock );
648
+ while (f != 0 ) {
649
+ LOCK_COMMAND (& getrf_flag_lock );
650
+ f = flag [i * CACHE_LINE_SIZE ];
651
+ UNLOCK_COMMAND (& getrf_flag_lock );
652
+ };
653
+ #else
654
+ while (flag [i * CACHE_LINE_SIZE ]) {};
655
+ #endif
656
+ }
577
657
TRSM_ILTCOPY (bk , bk , a + (is + is * lda ) * COMPSIZE , lda , 0 , sb );
578
658
579
659
} else {
0 commit comments