Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions driver/level3/gemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
#define DIVIDE_RATE GEMM_DIVIDE_RATE
#endif

/* Optional per-target threshold: when the target configuration defines
 * GEMM_DIVIDE_LIMIT (e.g. in param.h), expose it under the generic name
 * DIVIDE_LIMIT for the threaded level-3 driver included below, which
 * sets its divide rate to 1 when N < DIVIDE_LIMIT. When it is not
 * defined, DIVIDE_LIMIT stays undefined and that check is compiled out. */
#ifdef GEMM_DIVIDE_LIMIT
#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT
#endif

#ifdef THREADED_LEVEL3
#include "level3_thread.c"
#else
Expand Down
18 changes: 12 additions & 6 deletions driver/level3/level3_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

BLASLONG nthreads_m;
BLASLONG mypos_m, mypos_n;
BLASLONG divide_rate = DIVIDE_RATE;

BLASLONG is, js, ls, bufferside, jjs;
BLASLONG min_i, min_l, div_n, min_jj;
Expand Down Expand Up @@ -280,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
alpha = (FLOAT *)args -> alpha;
beta = (FLOAT *)args -> beta;

/* Disable divide_rate when N of all threads is less than DIVIDE_LIMIT */
#ifdef DIVIDE_LIMIT
if (N < DIVIDE_LIMIT) divide_rate = 1;
#endif

/* Initialize 2D CPU distribution */
nthreads_m = args -> nthreads;
if (range_m) {
Expand Down Expand Up @@ -321,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
) return 0;

/* Initialize workspace for local region of B */
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
for (i = 1; i < divide_rate; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
}

Expand Down Expand Up @@ -365,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
STOP_RPCC(copy_A);

/* Copy local region of B into workspace and apply kernel */
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (n_to - n_from + divide_rate - 1) / divide_rate;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

/* Make sure that no one is using workspace */
Expand Down Expand Up @@ -434,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m;

/* Split other region of B into parts */
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate;
for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {
if (current != mypos) {

Expand Down Expand Up @@ -485,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
do {

/* Split region of B into parts and apply kernel */
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate;
for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) {

/* Apply kernel with local region of A and part of region of B */
Expand Down Expand Up @@ -520,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until all other threads are done with local region of B */
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
for (js = 0; js < divide_rate; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
}
}
Expand Down
2 changes: 2 additions & 0 deletions param.h
Original file line number Diff line number Diff line change
Expand Up @@ -3585,6 +3585,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

#elif defined(NEOVERSEV1) // 256-bit SVE

#define GEMM_DIVIDE_LIMIT 3

#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#define GEMM_PREFERED_SIZE 4
Expand Down
Loading