@@ -246,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
246
246
247
247
BLASLONG nthreads_m ;
248
248
BLASLONG mypos_m , mypos_n ;
249
+ BLASLONG divide_rate = DIVIDE_RATE ;
249
250
250
251
BLASLONG is , js , ls , bufferside , jjs ;
251
252
BLASLONG min_i , min_l , div_n , min_jj ;
@@ -280,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
280
281
alpha = (FLOAT * )args -> alpha ;
281
282
beta = (FLOAT * )args -> beta ;
282
283
284
+ /* Disable divide_rate (set it to 1) when N of all threads is less than DIVIDE_LIMIT */
285
+ #ifdef DIVIDE_LIMIT
286
+ if (N < DIVIDE_LIMIT ) divide_rate = 1 ;
287
+ #endif
288
+
283
289
/* Initialize 2D CPU distribution */
284
290
nthreads_m = args -> nthreads ;
285
291
if (range_m ) {
@@ -321,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
321
327
) return 0 ;
322
328
323
329
/* Initialize workspace for local region of B */
324
- div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
330
+ div_n = (n_to - n_from + divide_rate - 1 ) / divide_rate ;
325
331
buffer [0 ] = sb ;
326
- for (i = 1 ; i < DIVIDE_RATE ; i ++ ) {
332
+ for (i = 1 ; i < divide_rate ; i ++ ) {
327
333
buffer [i ] = buffer [i - 1 ] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1 )/GEMM_UNROLL_N ) * GEMM_UNROLL_N * COMPSIZE ;
328
334
}
329
335
@@ -365,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
365
371
STOP_RPCC (copy_A );
366
372
367
373
/* Copy local region of B into workspace and apply kernel */
368
- div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
374
+ div_n = (n_to - n_from + divide_rate - 1 ) / divide_rate ;
369
375
for (js = n_from , bufferside = 0 ; js < n_to ; js += div_n , bufferside ++ ) {
370
376
371
377
/* Make sure no one is using the workspace */
@@ -434,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
434
440
if (current >= (mypos_n + 1 ) * nthreads_m ) current = mypos_n * nthreads_m ;
435
441
436
442
/* Split other region of B into parts */
437
- div_n = (range_n [current + 1 ] - range_n [current ] + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
443
+ div_n = (range_n [current + 1 ] - range_n [current ] + divide_rate - 1 ) / divide_rate ;
438
444
for (js = range_n [current ], bufferside = 0 ; js < range_n [current + 1 ]; js += div_n , bufferside ++ ) {
439
445
if (current != mypos ) {
440
446
@@ -485,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
485
491
do {
486
492
487
493
/* Split region of B into parts and apply kernel */
488
- div_n = (range_n [current + 1 ] - range_n [current ] + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
494
+ div_n = (range_n [current + 1 ] - range_n [current ] + divide_rate - 1 ) / divide_rate ;
489
495
for (js = range_n [current ], bufferside = 0 ; js < range_n [current + 1 ]; js += div_n , bufferside ++ ) {
490
496
491
497
/* Apply kernel with local region of A and part of region of B */
@@ -520,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
520
526
/* Wait until all other threads are done with local region of B */
521
527
START_RPCC ();
522
528
for (i = 0 ; i < args -> nthreads ; i ++ ) {
523
- for (js = 0 ; js < DIVIDE_RATE ; js ++ ) {
529
+ for (js = 0 ; js < divide_rate ; js ++ ) {
524
530
while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;};
525
531
}
526
532
}
0 commit comments