@@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
344344 div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
345345 for (js = n_from , bufferside = 0 ; js < n_to ; js += div_n , bufferside ++ ) {
346346
347- /* Make sure if no one is using workspace */
348- START_RPCC ();
349- for (i = 0 ; i < args -> nthreads ; i ++ )
350- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
351- STOP_RPCC (waiting1 );
352-
353347#if defined(FUSED_GEMM ) && !defined(TIMING )
354348
355349 /* Fused operation to copy region of B into workspace and apply kernel */
@@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
387381 }
388382#endif
389383
390- /* Set flag so other threads can access local region of B */
391- for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
384+ for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ ) {
385+ /* Make sure if no one is using workspace */
386+ START_RPCC ();
387+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
388+ STOP_RPCC (waiting1 );
389+ /* Set flag so other threads can access local region of B */
392390 job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
393- WMB ;
391+ WMB ;
392+ }
394393 }
395394
396395 /* Get regions of B from other threads and apply kernel */
@@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
426425
427426 /* Clear synchronization flag if this thread is done with other region of B */
428427 if (m_to - m_from == min_i ) {
429- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
428+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
430429 WMB ;
431430 }
432431 }
433432 } while (current != mypos );
434433
435- /* Iterate through steps of m
434+ /* Iterate through steps of m
436435 * Note: First step has already been finished */
437436 for (is = m_from + min_i ; is < m_to ; is += min_i ){
438437 min_i = m_to - is ;
@@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
462461 sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
463462 c , ldc , is , js );
464463 STOP_RPCC (kernel );
465-
464+
466465#ifdef TIMING
467466 ops += 2 * min_i * MIN (range_n [current + 1 ] - js , div_n ) * min_l ;
468467#endif
469-
468+
470469 /* Clear synchronization flag if this thread is done with region of B */
471470 if (is + min_i >= m_to ) {
472- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
471+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
473472 WMB ;
474473 }
475474 }
0 commit comments