Skip to content

Commit 47a66ae

Browse files
authored
Update limits based on benchmarking the SME code on Apple M4
1 parent 20f5ed1 commit 47a66ae

File tree

1 file changed

+5
-21
lines changed

1 file changed

+5
-21
lines changed

kernel/arm64/sgemm_direct_performant.c

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,14 @@
11
#include "common.h"
2-
/* helper for the direct sgemm code written by Arjan van der Ven */
3-
4-
5-
2+
/* helper for the direct sgemm code adapted from Arjan van der Ven's x86_64 version */
63

74
/*
 * Decide whether the direct SGEMM code should be considered performant
 * for a problem of size M x N x K.
 *
 * Returns 1 when the problem passes the size checks below, 0 otherwise.
 * NOTE(review): presumably the caller selects the direct SGEMM path on a
 * non-zero result -- confirm against the call site.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
{
    /* very small row counts are rejected outright */
    if (M < 3) {
        return 0;
    }

    /*
     * Widen before multiplying so the product is evaluated in
     * unsigned long long rather than in BLASLONG.
     * NOTE(review): assumes N and K are non-negative -- confirm with callers.
     */
    unsigned long long mnk =
        (unsigned long long)M * (unsigned long long)N * (unsigned long long)K;

    /*
     * benchmark performance on M4 peaks around 512 and crosses the graph of
     * the NEON SGEMM at about 3100
     *
     * The limit must be built from unsigned long long constants:
     * 3100 * 3100 * 3100 (29,791,000,000) does not fit in an int, so the
     * plain-int product overflows and the comparison would never trigger.
     */
    const unsigned long long mnk_limit = 3100ULL * 3100ULL * 3100ULL;
    if (mnk >= mnk_limit) {
        return 0;
    }

    return 1;
}
3014

0 commit comments

Comments
 (0)