diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c index a20d6c59ad..e37d86c28d 100644 --- a/driver/level3/gemm.c +++ b/driver/level3/gemm.c @@ -63,6 +63,10 @@ #define DIVIDE_RATE GEMM_DIVIDE_RATE #endif +#ifdef GEMM_DIVIDE_LIMIT +#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT +#endif + #ifdef THREADED_LEVEL3 #include "level3_thread.c" #else diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index cb93591ab0..22f27975bd 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -246,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG nthreads_m; BLASLONG mypos_m, mypos_n; + BLASLONG divide_rate = DIVIDE_RATE; BLASLONG is, js, ls, bufferside, jjs; BLASLONG min_i, min_l, div_n, min_jj; @@ -280,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; + /* Disable divide_rate when N of all threads is less than DIVIDE_LIMIT */ +#ifdef DIVIDE_LIMIT + if (N < DIVIDE_LIMIT) divide_rate = 1; +#endif + /* Initialize 2D CPU distribution */ nthreads_m = args -> nthreads; if (range_m) { @@ -321,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ) return 0; /* Initialize workspace for local region of B */ - div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (n_to - n_from + divide_rate - 1) / divide_rate; buffer[0] = sb; - for (i = 1; i < DIVIDE_RATE; i++) { + for (i = 1; i < divide_rate; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE; } @@ -365,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(copy_A); /* Copy local region of B into workspace and apply kernel */ - div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (n_to - n_from + divide_rate - 1) / divide_rate; for (js = n_from, bufferside = 0; 
js < n_to; js += div_n, bufferside ++) { /* Make sure if no one is using workspace */ @@ -434,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; /* Split other region of B into parts */ - div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate; for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { if (current != mypos) { @@ -485,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, do { /* Split region of B into parts and apply kernel */ - div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate; for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { /* Apply kernel with local region of A and part of region of B */ @@ -520,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until all other threads are done with local region of B */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) { - for (js = 0; js < DIVIDE_RATE; js++) { + for (js = 0; js < divide_rate; js++) { while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; } } diff --git a/param.h b/param.h index 46582333ea..77b04fe433 100644 --- a/param.h +++ b/param.h @@ -3585,6 +3585,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) // 256-bit SVE +#define GEMM_DIVIDE_LIMIT 3 + #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 #define GEMM_PREFERED_SIZE 4