Skip to content

Commit f1c0227

Browse files
authored
Merge pull request #1846 from fenrus75/threadsize
gemm/dgemm: add a way for an arch kernel to specify preferred sizes
2 parents 6610354 + 5b708e5 commit f1c0227

File tree

4 files changed

+31
-4
lines changed

4 files changed

+31
-4
lines changed

driver/level3/level3_thread.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@
4848
#define SWITCH_RATIO 2
4949
#endif
5050

51+
/*
 * Granularity that gemm_driver hands to round_up() when it sizes the M and N
 * partitions.  A definition made before this point wins (param.h in this
 * change sets 32); the fallback of 1 makes round_up() return every width
 * unchanged.
 */
#ifndef GEMM_PREFERED_SIZE
52+
#define GEMM_PREFERED_SIZE 1
53+
#endif
54+
5155
//The array of job_t may overflow the stack.
5256
//Instead, use malloc to alloc job_t.
5357
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
510514
return 0;
511515
}
512516

517+
/*
 * Round a proposed partition width up to the next multiple of `multiple`.
 *
 *   remainder  amount of work not yet assigned (callers pass the remaining
 *              m or n)
 *   width      width proposed for the next partition
 *   multiple   preferred granularity (callers pass GEMM_PREFERED_SIZE)
 *
 * Returns `width` unchanged when rounding is pointless or unsafe:
 *   - multiple <= 1: a granularity of 1 never changes the width, and 0 or a
 *     negative value would otherwise divide by zero or yield a nonsensical
 *     width;
 *   - multiple > remainder: less than one full multiple of work is left;
 *   - width <= multiple: the proposed width does not exceed one multiple.
 * Otherwise returns the smallest multiple of `multiple` that is >= width.
 *
 * The result may exceed `remainder`; the callers clamp the final partition
 * after subtracting the width from the remaining work.
 */
static int round_up(int remainder, int width, int multiple)
{
	if (multiple <= 1)
		return width;

	if (multiple > remainder || width <= multiple)
		return width;

	return ((width + multiple - 1) / multiple) * multiple;
}
525+
526+
513527
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
514528
*range_n, FLOAT *sa, FLOAT *sb,
515529
BLASLONG nthreads_m, BLASLONG nthreads_n) {
@@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
601615
num_parts = 0;
602616
while (m > 0){
603617
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
618+
619+
width = round_up(m, width, GEMM_PREFERED_SIZE);
620+
604621
m -= width;
622+
605623
if (m < 0) width = width + m;
606624
range_M[num_parts + 1] = range_M[num_parts] + width;
625+
607626
num_parts ++;
608627
}
609628
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
645664
if (width < SWITCH_RATIO) {
646665
width = SWITCH_RATIO;
647666
}
667+
width = round_up(n, width, GEMM_PREFERED_SIZE);
668+
648669
n -= width;
649670
if (n < 0) width = width + n;
650671
range_N[num_parts + 1] = range_N[num_parts] + width;
672+
651673
num_parts ++;
652674
}
653675
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {

kernel/x86_64/dgemm_beta_skylakex.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
5555
return 0;
5656
}
5757

58+
if (m == 0 || n == 0)
59+
return 0;
5860

5961
c_offset = c;
6062

@@ -69,15 +71,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
6971

7072
i = m;
7173

72-
while (i > 32) {
74+
while (i >= 32) {
7375
_mm512_storeu_pd(c_offset1, z_zero);
7476
_mm512_storeu_pd(c_offset1 + 8, z_zero);
7577
_mm512_storeu_pd(c_offset1 + 16, z_zero);
7678
_mm512_storeu_pd(c_offset1 + 24 , z_zero);
7779
c_offset1 += 32;
7880
i -= 32;
7981
}
80-
while (i > 8) {
82+
while (i >= 8) {
8183
_mm512_storeu_pd(c_offset1, z_zero);
8284
c_offset1 += 8;
8385
i -= 8;

kernel/x86_64/sgemm_beta_skylakex.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
5555
return 0;
5656
}
5757

58+
/* Empty matrix: nothing to scale.  CNAME is declared int, so this exit
 * must return a value; use 0 like the early return just above. */
if (n == 0 || m == 0)
	return 0;
5860

5961
c_offset = c;
6062

@@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
7173

7274
i = m;
7375

74-
while (i > 32) {
76+
while (i >= 32) {
7577
_mm512_storeu_ps(c_offset1, z_zero);
7678
_mm512_storeu_ps(c_offset1 + 16, z_zero);
7779
c_offset1 += 32;
7880
i -= 32;
7981
}
80-
while (i > 8) {
82+
while (i >= 8) {
8183
_mm256_storeu_ps(c_offset1, y_zero);
8284
c_offset1 += 8;
8385
i -= 8;

param.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
16271627
#define SYMV_P 8
16281628

16291629
#define SWITCH_RATIO 32
1630+
#define GEMM_PREFERED_SIZE 32
16301631

16311632
#ifdef ARCH_X86
16321633

0 commit comments

Comments
 (0)