@@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
344344 div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
345345 for (js = n_from , bufferside = 0 ; js < n_to ; js += div_n , bufferside ++ ) {
346346
347+ /* Make sure if no one is using workspace */
348+ START_RPCC ();
349+ for (i = 0 ; i < args -> nthreads ; i ++ )
350+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
351+ STOP_RPCC (waiting1 );
352+
347353#if defined(FUSED_GEMM ) && !defined(TIMING )
348354
349355 /* Fused operation to copy region of B into workspace and apply kernel */
@@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
381387 }
382388#endif
383389
384- for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ ) {
385- /* Make sure if no one is using workspace */
386- START_RPCC ();
387- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
388- STOP_RPCC (waiting1 );
389- /* Set flag so other threads can access local region of B */
390+ /* Set flag so other threads can access local region of B */
391+ for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
390392 job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
391- WMB ;
392- }
393+ WMB ;
393394 }
394395
395396 /* Get regions of B from other threads and apply kernel */
@@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
425426
426427 /* Clear synchronization flag if this thread is done with other region of B */
427428 if (m_to - m_from == min_i ) {
428- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
429+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
429430 WMB ;
430431 }
431432 }
432433 } while (current != mypos );
433434
434- /* Iterate through steps of m
435+ /* Iterate through steps of m
435436 * Note: First step has already been finished */
436437 for (is = m_from + min_i ; is < m_to ; is += min_i ){
437438 min_i = m_to - is ;
@@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
461462 sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
462463 c , ldc , is , js );
463464 STOP_RPCC (kernel );
464-
465+
465466#ifdef TIMING
466467 ops += 2 * min_i * MIN (range_n [current + 1 ] - js , div_n ) * min_l ;
467468#endif
468-
469+
469470 /* Clear synchronization flag if this thread is done with region of B */
470471 if (is + min_i >= m_to ) {
471- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
472+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
472473 WMB ;
473474 }
474475 }
0 commit comments