9191#endif
9292
/*
 * job_t: table of synchronization slots shared between GEMM worker threads.
 *
 * job[owner].working[peer][CACHE_LINE_SIZE * k] is the slot for the k-th
 * buffer division (k < DIVIDE_RATE) between thread `owner` and thread `peer`:
 *   - non-zero: the address of the owner's packed buffer, stored as a
 *     BLASLONG, published so that `peer` may read it;
 *   - zero:     the slot is free (the peer has finished with the buffer, or
 *     the driver has reset it before starting the threads).
 * Threads spin on these slots (with YIELDING) while waiting for a publish or
 * a release.
 *
 * Slots are indexed in strides of CACHE_LINE_SIZE elements -- presumably so
 * each flag sits on its own cache line; TODO confirm against the definition
 * of CACHE_LINE_SIZE.
 *
 * NOTE(review): this change drops the C11 `_Atomic` qualifier and keeps only
 * `volatile`. `volatile` by itself gives no inter-thread ordering, so
 * correctness depends on the MB/WMB macros placed around the slot accesses --
 * confirm they expand to real memory barriers on every supported target.
 */
9393typedef struct {
94- #if __STDC_VERSION__ >= 201112L
95- _Atomic
96- #else
9794 volatile
98- #endif
9995 BLASLONG working [MAX_CPU_NUMBER ][CACHE_LINE_SIZE * DIVIDE_RATE ];
10096} job_t ;
10197
@@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
348344 div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
349345 for (js = n_from , bufferside = 0 ; js < n_to ; js += div_n , bufferside ++ ) {
350346
351- /* Make sure if no one is using workspace */
352- START_RPCC ();
353- for (i = 0 ; i < args -> nthreads ; i ++ )
354- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;};
355- STOP_RPCC (waiting1 );
356-
357347#if defined(FUSED_GEMM ) && !defined(TIMING )
358348
359349 /* Fused operation to copy region of B into workspace and apply kernel */
@@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
391381 }
392382#endif
393383
394- /* Set flag so other threads can access local region of B */
395- for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
384+ for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ ) {
385+ /* Make sure no one is still using the workspace */
386+ START_RPCC ();
387+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
388+ STOP_RPCC (waiting1 );
389+ /* Set flag so other threads can access local region of B */
396390 job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
397- WMB ;
391+ WMB ;
392+ }
398393 }
399394
400395 /* Get regions of B from other threads and apply kernel */
@@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
413408
414409 /* Wait until other region of B is initialized */
415410 START_RPCC ();
416- while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;};
411+ while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;MB ; };
417412 STOP_RPCC (waiting2 );
418413
419414 /* Apply kernel with local region of A and part of other region of B */
@@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
430425
431426 /* Clear synchronization flag if this thread is done with other region of B */
432427 if (m_to - m_from == min_i ) {
433- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
428+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
429+ WMB ;
434430 }
435431 }
436432 } while (current != mypos );
437433
438- /* Iterate through steps of m
434+ /* Iterate through steps of m
439435 * Note: First step has already been finished */
440436 for (is = m_from + min_i ; is < m_to ; is += min_i ){
441437 min_i = m_to - is ;
@@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
465461 sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
466462 c , ldc , is , js );
467463 STOP_RPCC (kernel );
468-
464+
469465#ifdef TIMING
470466 ops += 2 * min_i * MIN (range_n [current + 1 ] - js , div_n ) * min_l ;
471467#endif
472-
468+
473469 /* Clear synchronization flag if this thread is done with region of B */
474470 if (is + min_i >= m_to ) {
475- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
471+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
476472 WMB ;
477473 }
478474 }
@@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
492488 START_RPCC ();
493489 for (i = 0 ; i < args -> nthreads ; i ++ ) {
494490 for (js = 0 ; js < DIVIDE_RATE ; js ++ ) {
495- while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;};
491+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;MB ; };
496492 }
497493 }
498494 STOP_RPCC (waiting3 );
@@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
658654 }
659655
660656 /* Clear synchronization flags */
661- for (i = 0 ; i < MAX_CPU_NUMBER ; i ++ ) {
662- for (j = 0 ; j < MAX_CPU_NUMBER ; j ++ ) {
657+ for (i = 0 ; i < nthreads ; i ++ ) {
658+ for (j = 0 ; j < nthreads ; j ++ ) {
663659 for (k = 0 ; k < DIVIDE_RATE ; k ++ ) {
664660 job [i ].working [j ][CACHE_LINE_SIZE * k ] = 0 ;
665661 }
0 commit comments