@@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
344
344
div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
345
345
for (js = n_from , bufferside = 0 ; js < n_to ; js += div_n , bufferside ++ ) {
346
346
347
+ /* Wait until no thread is still using the workspace */
348
+ START_RPCC ();
349
+ for (i = 0 ; i < args -> nthreads ; i ++ )
350
+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
351
+ STOP_RPCC (waiting1 );
352
+
347
353
#if defined(FUSED_GEMM ) && !defined(TIMING )
348
354
349
355
/* Fused operation to copy region of B into workspace and apply kernel */
@@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
381
387
}
382
388
#endif
383
389
384
- for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ ) {
385
- /* Make sure if no one is using workspace */
386
- START_RPCC ();
387
- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
388
- STOP_RPCC (waiting1 );
389
- /* Set flag so other threads can access local region of B */
390
+ /* Set flag so other threads can access local region of B */
391
+ for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
390
392
job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
391
- WMB ;
392
- }
393
+ WMB ;
393
394
}
394
395
395
396
/* Get regions of B from other threads and apply kernel */
@@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
425
426
426
427
/* Clear synchronization flag if this thread is done with other region of B */
427
428
if (m_to - m_from == min_i ) {
428
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
429
+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
429
430
WMB ;
430
431
}
431
432
}
432
433
} while (current != mypos );
433
434
434
- /* Iterate through steps of m
435
+ /* Iterate through steps of m
435
436
* Note: First step has already been finished */
436
437
for (is = m_from + min_i ; is < m_to ; is += min_i ){
437
438
min_i = m_to - is ;
@@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
461
462
sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
462
463
c , ldc , is , js );
463
464
STOP_RPCC (kernel );
464
-
465
+
465
466
#ifdef TIMING
466
467
ops += 2 * min_i * MIN (range_n [current + 1 ] - js , div_n ) * min_l ;
467
468
#endif
468
-
469
+
469
470
/* Clear synchronization flag if this thread is done with region of B */
470
471
if (is + min_i >= m_to ) {
471
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
472
+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
472
473
WMB ;
473
474
}
474
475
}
0 commit comments