@@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
351
351
/* Make sure if no one is using workspace */
352
352
START_RPCC ();
353
353
for (i = 0 ; i < args -> nthreads ; i ++ )
354
- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ; };
354
+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;};
355
355
STOP_RPCC (waiting1 );
356
+ MB ;
356
357
357
358
#if defined(FUSED_GEMM ) && !defined(TIMING )
358
359
@@ -395,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
395
396
}
396
397
#endif
397
398
399
+ WMB ;
398
400
/* Set flag so other threads can access local region of B */
399
401
for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
400
402
job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
401
- WMB ;
402
403
}
403
404
404
405
/* Get regions of B from other threads and apply kernel */
@@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
417
418
418
419
/* Wait until other region of B is initialized */
419
420
START_RPCC ();
420
- while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;MB ; };
421
+ while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;};
421
422
STOP_RPCC (waiting2 );
423
+ MB ;
422
424
423
425
/* Apply kernel with local region of A and part of other region of B */
424
426
START_RPCC ();
@@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
434
436
435
437
/* Clear synchronization flag if this thread is done with other region of B */
436
438
if (m_to - m_from == min_i ) {
437
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
438
439
WMB ;
440
+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
439
441
}
440
442
}
441
443
} while (current != mypos );
@@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
477
479
478
480
/* Clear synchronization flag if this thread is done with region of B */
479
481
if (is + min_i >= m_to ) {
480
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
481
482
WMB ;
483
+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
482
484
}
483
485
}
484
486
@@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
497
499
START_RPCC ();
498
500
for (i = 0 ; i < args -> nthreads ; i ++ ) {
499
501
for (js = 0 ; js < DIVIDE_RATE ; js ++ ) {
500
- while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;MB ; };
502
+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;};
501
503
}
502
504
}
503
505
STOP_RPCC (waiting3 );
506
+ MB ;
504
507
505
508
#ifdef TIMING
506
509
BLASLONG waiting = waiting1 + waiting2 + waiting3 ;
@@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
705
708
}
706
709
}
707
710
}
708
-
711
+ WMB ;
709
712
/* Execute parallel computation */
710
713
exec_blas (nthreads , queue );
711
714
}
0 commit comments